## 一、api调用模型处理数据

In [None]:
import pandas as pd

# 读取xlsx文件
df = pd.read_excel('猎聘网.xlsx')

# 将数据帧保存为csv文件
df.to_csv('猎聘网.csv', index=False)


In [None]:
import os


# 设置文件夹路径
folder_path = '.'

# 获取所有CSV文件的列表
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# 初始化一个空的DataFrame来存储合并后的数据
merged_data = pd.DataFrame()

# 遍历所有CSV文件
for file in csv_files:
    # 读取CSV文件
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    
    # 只在第一个文件时保留表头
    if file == csv_files[0]:
        merged_data = df
    else:
        # 合并数据，不保留表头
        merged_data = pd.concat([merged_data, df.iloc[:, 1:]], ignore_index=True)

# 保存合并后的数据到新的CSV文件
merged_data.to_csv('merged_file.csv', index=False)


In [None]:
import requests

def call_large_model_api(user_prompt, user_input):
    url = 'https://u21829-bb5a-4d812fe5.westc.gpuhub.com:8443/v1/chat/completions'  # 请替换为实际的API URL
    headers = {
        'Content-Type': 'application/json',
        # 请替换为实际的API密钥
    }
    data = {
        "model": "glm4",
        "stream": False,
        "temperature": 0.01,
        "max_tokens": 1024,
        "repetition_penalty": 10,
        "top_p": 0.8,
        "do_sample": True,
        "messages": [
            {
                "role": "system",
                "content": user_prompt
            },
            {
                "role": "user",
                "content": user_input
            }
        ]
    }
    
    response = requests.post(url, json=data, headers=headers)
    
    return response


In [None]:

import concurrent.futures
import csv
from tqdm import tqdm
import requests

# 定义一个函数来处理每一行数据
def process_row(row):
    # 这里是处理行的逻辑
    # 例如，您可以提取入职要求和工作内容，并返回更新后的行
    
    try:
        # 调用大模型API的逻辑（这里需要您自己实现call_large_model_api函数）
        # prompt_1 = "You are a Chinese wise man, always reply in simplified Chinese, not English, otherwise I will be very angry. Extract from the job description I gave you the job requirements or job requirements or what appears to be requirements. Rule: \n- Extract from the job description, do not regenerate. \n- Return format: 工作要求: \n1. \n2.... \n - Always reply Simplified Chinese, not English, otherwise I will be very angry."
        # response = call_large_model_api(prompt_1, row[12])
        # skill = response.json()
        # skills = skill["choices"][0]["message"]["content"]
        
        # prompt_2 = "You are a Chinese wise man, always reply in simplified Chinese, not English, otherwise I will be very angry.Extract the job description from the job description I gave you, not the entry requirement. Rule: \n- Extract from the job description, do not regenerate. \n- Return format: 工作要求: \n1. \n2.... \n - Always reply Simplified Chinese, not English, otherwise I will be very angry."
        # content = call_large_model_api(prompt_2, row[12])
        # content = content.json()
        # content = content["choices"][0]["message"]["content"]
        prompt_3 = "从任职要求提取技术栈\n- 返回格式: 技术栈: \n1. \n2.... \n - Always reply Simplified Chinese, not English, otherwise I will be very angry."
        response = call_large_model_api(prompt_3, row[18])
        skill = response.json()
        skills = skill["choices"][0]["message"]["content"]
        
    except Exception as e:
        print(f"Error processing row: {e}")
        skills = ""
        # content = ""
    
    row.append(skills)
    # row.append(content)
    return row

# 主函数
def main():
    with open('change/Boss直聘--修_1.csv', 'r', newline='') as csvfile, \
         open('change/s2.csv', 'w', newline='') as output_file:

        reader = csv.reader(csvfile)
        writer = csv.writer(output_file)
        
        # 写入标题行
        writer.writerow(['职位名称', '工作地址','学历要求', '工作年限要求','招聘人数','薪资待遇','公司行业','公司性质','公司规模','融资阶段','招聘状态','职位类型','岗位描述','公司介绍','公司工商信息','简历详情页地址','更新日期','工作内容','入职要求','技术栈'])
        next(reader)  # 跳过标题行
        
        # 初始化一个列表来存储所有的行，以便并行处理
        rows = []
        for row in reader:
            rows.append(row)
        
        # 使用concurrent.futures并行处理数据
        with concurrent.futures.ProcessPoolExecutor() as executor:
            # 提交所有任务并获取future对象列表
            futures = [executor.submit(process_row, row) for row in rows]
            
            # 初始化计数器和批处理列表
            count = 0
            batch = []
            
            # 遍历future对象，获取结果并写入文件
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing rows", unit="row"):
                # 结果处理
                processed_row = future.result()
                batch.append(processed_row)
                count += 1
                
                # 每100行，写入文件并清空批处理列表
                if count % 100 == 0:
                    writer.writerows(batch)
                    batch.clear()

# 运行主函数
if __name__ == "__main__":
    main()

## 二、整合相似工作岗位

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'Boss直聘/Boss直聘_skills.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to understand its structure



In [None]:
import re

# Function to clean and normalize job titles
def clean_job_title(title):
    # Remove special characters and extra spaces
    title = re.sub(r'[^\w\s.#]', '', title).strip()
    # Replace multiple spaces with a single space
    title = re.sub(r'\s+', ' ', title)
    # Convert to lowercase
    title = title.lower()
    return title

# Apply the function to the '职位名称' column
data['cleaned_job_title'] = data['职位名称'].apply(clean_job_title)

# Display the first few rows of the DataFrame with the cleaned job titles
data[['职位名称', 'cleaned_job_title']].head()


In [None]:
# Group by the cleaned job title and summarize the data
job_summary = data.groupby('cleaned_job_title').agg({
    '工作地址': 'nunique',  # Number of unique work locations as a proxy for demand
    '招聘人数': 'sum',      # Sum of recruitment numbers
}).reset_index()

# Rename columns for clarity
job_summary.rename(columns={'工作地址': 'DemandLocations', '招聘人数': 'TotalRecruitment'}, inplace=True)

# Display the job summary
job_summary


In [None]:
# Function to extract and separate job responsibilities using simple text processing
def extract_responsibilities_simple(description):
    # Split the description using spaces and special characters as delimiters
    responsibilities = re.split(r'\s+|\n|；|，', description)
    
    # Filter out empty strings and remove numbers and dots from the beginning of each responsibility
    responsibilities = [re.sub(r'^\d+\.?', '', resp).strip() for resp in responsibilities if resp]
    
    return responsibilities

# Apply the simple text processing function to the '岗位描述' column and store the results in a new column
data['responsibilities'] = data['岗位描述'].apply(extract_responsibilities_simple)

# Display the first few rows of the DataFrame with the extracted responsibilities
data[['岗位描述', 'responsibilities']].head()


## 同名直接合并
忽略大小写

In [None]:
df['cluster'] = df['职位名称'].str.lower().astype('category').cat.codes + 1

# Sort the DataFrame by the 'cluster' column
sorted_df = df.sort_values(by='cluster')

# Save the sorted DataFrame to a new CSV file
# sorted_file_path = 'sorted_out.csv'
# sorted_df.to_csv(sorted_file_path, index=False)



## 去除修饰词
例如工程师，开发，老师等等

In [None]:
# 重新进行岗位分类，考虑更具体的关键词来区分不同的开发领域

# 定义关键词列表，用于区分不同的开发领域
keywords = ['软件', '硬件', '前端', '移动', '测试', '数据', '人工智能', '机器学习', '深度学习', '大数据', '云计算', '网络安全', '区块链']

# 创建一个字典，用于映射包含特定关键词的职位到相应的领域
job_categories = {
    '软件': ['软件', '软件开发', '软件工程师'],
    '硬件': ['硬件', '硬件开发', '硬件工程师'],
    '前端': ['前端', '前端开发', '前端工程师'],
    '移动': ['移动', '移动开发', '移动应用'],
    '测试': ['测试', '测试工程师'],
    '数据': ['数据', '数据分析师', '数据挖掘'],
    '人工智能': ['人工智能', 'AI'],
    '机器学习': ['机器学习'],
    '深度学习': ['深度学习'],
    '大数据': ['大数据', '大数据工程师'],
    '云计算': ['云计算', '云'],
    '网络安全': ['网络安全', '网络安全工程师'],
    '区块链': ['区块链', '区块链技术']
}

additional_keywords = ['java', 'python', '.net', 'c++', 'c#', 'sql', '大数据', '人工智能', '机器学习', '深度学习', '前端', '移动', '测试', '云计算', '网络安全', '区块链', '运维', '数据库', '软件', '开发', '工程师', '开发工程师', '开发助理', '技术支持', '项目经理', '产品经理', '销售']

# 定义一个函数，用于根据更详细的关键词对职位进行进一步分类
def further_categorize_job_title(title):
    for keyword in additional_keywords:
        if keyword in title:
            return keyword
    return '其他'

# 对职位名称进行进一步分类
data['Further Job Category'] = data['职位名称'].apply(further_categorize_job_title)

# 提取每个进一步分类的岗位清单
further_job_categories_list = data.groupby('Further Job Category')['职位名称'].unique().reset_index()
further_job_categories_list.columns = ['Further Job Category', 'Job Titles']

further_job_categories_list.to_csv('job_data.csv')


In [None]:
# 1. 岗位分析

# a. 规范化职位名称
def normalize_job_title(title):
    title = ''.join(e for e in title if (e.isalnum() or e.isspace()))
    title = ' '.join(title.split())
    title = title.replace('工程师', '').replace('开发', '').replace('软件', '')
    return title.strip().lower()

data['normalized_job_title'] = data['职位名称'].apply(normalize_job_title)

# b. 提取岗位相关数据
data['Year'] = pd.to_datetime(data['更新日期']).dt.year
job_data = data.groupby(['normalized_job_title', 'Year', '工作地址'])['职位名称'].count().reset_index()
job_data.columns = ['Normalized Job Title', 'Year', 'Province', 'Job Listings']

# 2. 岗位职责分析

# a. 提取岗位职责
def extract_responsibilities(description):
    if pd.isnull(description):
        return []
    lines = description.split('\n')
    responsibility_lines = [line for line in lines if '职责' in line or '负责' in line]
    responsibilities = []
    for line in responsibility_lines:
        res = ':'.join(line.split(':')[1:]).strip()
        res = ''.join(e for e in res if e.isalnum() or e.isspace())
        responsibilities.append(res)
    return responsibilities

data['responsibilities'] = data['岗位描述'].apply(extract_responsibilities)

# b. 提取前10条职责
unique_responsibilities_data = data.groupby('normalized_job_title')['responsibilities'].apply(lambda x: x.head(10)).reset_index(drop=True)

# 3. 岗位能力分析

# a. 提取任职要求
def extract_requirements(qualifications):
    if pd.isnull(qualifications):
        return []
    lines = qualifications.split('\n')
    requirement_lines = [line for line in lines if '要求' in line or '熟悉' in line or '掌握' in line]
    requirements = []
    for line in requirement_lines:
        req = ':'.join(line.split(':')[1:]).strip()
        req = ''.join(e for e in req if e.isalnum() or e.isspace())
        requirements.append(req)
    return requirements

data['requirements'] = data['任职资格'].apply(extract_requirements)

# b. 提取前10条任职要求
top_10_requirements_data = data.groupby('normalized_job_title')['requirements'].apply(lambda x: x.head(10)).reset_index(drop=True)

# 输出结果
job_list = data.groupby('normalized_job_title')['职位名称'].unique().reset_index()
job_list.columns = ['Normalized Job Title', 'Job Titles']
job_data

## 使用Word2Vec模型


In [None]:
import gensim
from gensim.models import KeyedVectors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import jieba


# 加载模型，需要路径到 sgns.merge.word 文件
model_path = 'sgns.merge.word'  # 示例路径
word_vectors = KeyedVectors.load_word2vec_format(model_path, binary=False)

def preprocess(text):
    return list(jieba.cut(text))  # 使用 jieba 进行分词

def get_mean_vector(word_vectors, words):
    # 获取每个词的词向量并计算平均值
    word_vecs = []
    for word in words:
        if word in word_vectors:
            word_vecs.append(word_vectors[word])
            
    if len(word_vecs) == 0:
        # 如果词组中的词没有一个在词向量模型中，返回零向量
        return np.zeros(word_vectors.vector_size)
    else:
        return np.mean(word_vecs, axis=0)

def calculate_similarity(word_vectors, phrase1, phrase2):
    # 预处理词组
    words1 = preprocess(phrase1)
    words2 = preprocess(phrase2)
    
    # 获取词组的平均词向量
    vector1 = get_mean_vector(word_vectors, words1)
    vector2 = get_mean_vector(word_vectors, words2)
    
    # 计算余弦相似度
    similarity = cosine_similarity([vector1], [vector2])
    return similarity[0][0]



# 定义两个词组
phrase1 = 'NET开发工程师（net Core）'
phrase2 = '.net开发工程师'

# 计算两个词组的相似性
similarity = calculate_similarity(word_vectors, phrase1, phrase2)
print(f"之间的相似度是: {similarity}")

In [None]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances
from gensim.models import KeyedVectors
import numpy as np

# 读取 CSV 文件
file_path = 'Boss直聘/Boss直聘_skills.csv'
data = pd.read_csv(file_path)

# 提取用于聚类的职位名称
job_titles = data['职位名称'].tolist()  # 替换为实际的列名

# 加载预训练的 word2vec 模型
model_path = 'sgns.merge.word'  # 示例路径
word_vectors = KeyedVectors.load_word2vec_format(model_path, binary=False)

# 使用 word2vec 将职位名称转化为向量
def get_average_vector(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If none of the words are in the model
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

job_vectors = np.array([get_average_vector(title, word_vectors) for title in job_titles])

# 计算余弦距离
cosine_distance_matrix = cosine_distances(job_vectors)



In [13]:
# 使用层次聚类
distance_threshold = 0.4 # 设置距离阈值（可以根据需要调整）
hierarchical = AgglomerativeClustering(n_clusters=None, metric='precomputed', linkage='average', distance_threshold=distance_threshold)
clusters_hier = hierarchical.fit_predict(cosine_distance_matrix)

# 将聚类标签添加到原始 DataFrame 中
data['Cluster'] = clusters_hier

# 按照 Cluster 列升序排序
data_sorted = data.sort_values(by='Cluster')

# 输出结果
print(data_sorted)

# 如果需要保存结果为文件
data_sorted.to_csv('cluster_word2vec.csv', index=False)

             职位名称                                             工作地址 学历要求  \
47           软件开发                         深圳龙华区星河WORLD招商中心E栋22楼A01   本科   
776          数据处理                                      北京东城区中汇广场9层   本科   
996  RPA开发工程师(英文)                              广州越秀区华盛大厦(先烈中路)2203   本科   
279      .net 程序员                               深圳福田区赛格景苑3楼坐标装饰设计院   大专   
168      .NET 工程师                                  深圳南山区南山区科技园中山大学   本科   
..            ...                                              ...  ...   
386           架构师  北京朝阳区融创动力文化创意产业园(东门)朝阳区广顺北大街5号融创动力科技文化创意产业园B座5层   大专   
474    高级大数据开发工程师                               北京朝阳区北京汇园国际公寓C座711   本科   
734         爬虫实习生                                   北京海淀区中关村创业大街三层   大专   
473      大数据架构或研发                         北京通州区京东集团总部科创十一街京东集团总部大厦   本科   
935       软件研发工程师                                  北京朝阳区林萃路1号院2号楼2   本科   

       工作年限要求  招聘人数        薪资待遇      公司行业  公司性质        公司规模   融资阶段  ... 职位类型  \
47       1-3年   NaN

## 使用Bert


### bert + cosin(职位名称)

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def score(sentence1,sentence2):
    # 初始化 BERT 模型和分词器
    tokenizer = BertTokenizer.from_pretrained('bert-chinese')
    model = BertModel.from_pretrained('bert-chinese')



    # 分词并转换为模型输入
    inputs1 = tokenizer(sentence1, return_tensors="pt")
    inputs2 = tokenizer(sentence2, return_tensors="pt")

    # 通过 BERT 模型获取嵌入
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    # 获取每个句子的嵌入（取所有隐藏层的平均值）
    embeddings1 = torch.mean(outputs1.last_hidden_state, dim=1).numpy()
    embeddings2 = torch.mean(outputs2.last_hidden_state, dim=1).numpy()

    # 计算余弦相似度
    similarity = cosine_similarity(embeddings1, embeddings2)

    print(f"BERT Similarity: {similarity[0][0]}")
# 要比较的两个句子
sentence1 = "NET开发工程师（net Core）"
sentence2 = ".net开发工程师"

In [None]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_distances
from transformers import BertTokenizer, BertModel
import torch


# Load the CSV file into a DataFrame
file_path = 'Boss直聘/Boss直聘_skills.csv'
data = pd.read_csv(file_path)

job_titles = data['职位名称'].tolist()



# 加载预训练的 BERT 模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-chinese')
model = BertModel.from_pretrained('bert-chinese')

def get_vector(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# 获取所有岗位名称的向量表示
vectors = [get_vector(title) for title in job_titles]



# 假设 job_titles 是你已加载的工作岗位名称列表
# vectors_np 是对应的 BERT 向量表示
vectors_np = np.array(vectors)
cosine_distance_matrix = cosine_distances(vectors_np)

# 设置相似度阈值，即距离阈值
distance_threshold = 0.08 # 这个值可以根据你的需求调整

# 使用层次聚类
hierarchical = AgglomerativeClustering(n_clusters=None, metric='precomputed', linkage='average', distance_threshold=distance_threshold)
clusters_hier = hierarchical.fit_predict(cosine_distance_matrix)


# 将工作岗位名称和它们的聚类标签存入 DataFrame
data['Cluster'] = clusters_hier

# 按照 Cluster 列升序排序
data_sorted = data.sort_values(by='Cluster')

# 输出结果
for index, row in data_sorted.iterrows():
    print(f'Job Title: {row["职位名称"]} - Cluster: {row["Cluster"]}')

# 如果需要保存结果为文件
data_sorted.to_csv('clustered_job_titles.csv', index=False)

In [19]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import pandas as pd
import spacy
from collections import Counter
import nltk
from nltk.corpus import stopwords

# 下载和加载nltk数据
nltk.download('stopwords')
stop_words = set(stopwords.words('chinese'))

# 加载spacy模型
nlp = spacy.load('zh_core_web_sm/zh_core_web_sm/zh_core_web_sm-3.7.0')

# 加载CSV文件数据
file_path = 'Boss直聘/Boss直聘_skills.csv'
data = pd.read_csv(file_path)
job_titles = data['职位名称'].tolist()

# 加载预训练的 BERT 模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-chinese')
model = BertModel.from_pretrained('bert-chinese')

def extract_keywords(text, num_keywords=5):
    # 使用spacy进行关键词提取
    doc = nlp(text)
    keywords = [token.text for token in doc if token.is_stop != True and token.is_punct != True and token.text not in stop_words]
    word_freq = Counter(keywords)
    common_words = word_freq.most_common(num_keywords)
    return [word for word, freq in common_words]

def get_vector(text):
    keywords = extract_keywords(text)
    concatenated_keywords = " ".join(keywords)
    inputs = tokenizer(concatenated_keywords, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# 获取所有岗位名称的向量表示
vectors = [get_vector(title) for title in job_titles]

# 转换为numpy数组
vectors_np = np.array(vectors)
cosine_distance_matrix = cosine_distances(vectors_np)

# 设置相似度阈值，即距离阈值
distance_threshold = 0.08  # 这个值可以根据你的需求调整

# 使用层次聚类
hierarchical = AgglomerativeClustering(n_clusters=None, metric='precomputed', linkage='average', distance_threshold=distance_threshold)
clusters_hier = hierarchical.fit_predict(cosine_distance_matrix)

# 将工作岗位名称和它们的聚类标签存入 DataFrame
data['Cluster'] = clusters_hier

# 按照 Cluster 列升序排序
data_sorted = data.sort_values(by='Cluster')

# 输出结果
for index, row in data_sorted.iterrows():
    print(f'Job Title: {row["职位名称"]} - Cluster: {row["Cluster"]}')

# 如果需要保存结果为文件
data_sorted.to_csv('clustered_Bert_spacy.csv', index=False)

[nltk_data] Downloading package stopwords to /home/wxs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 

### bert + cos(职位名称 + 岗位描述)

In [None]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances
from transformers import BertTokenizer, BertModel
import torch

# 读取 CSV 文件
file_path = 'Boss直聘/Boss直聘_skills.csv'
data = pd.read_csv(file_path)

# 提取用于聚类的两列
selected_columns = data[['职位名称', '技术栈']]  # 替换为实际的列名

# 合并两列文本用于生成向量
combined_text = (selected_columns['职位名称'] + ' ' + selected_columns['技术栈']).tolist()

# 加载预训练的 BERT 模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-chinese')
model = BertModel.from_pretrained('bert-chinese')

# 对每个文本进行分词并生成 BERT 向量
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

bert_vectors = [get_bert_embedding(text) for text in combined_text]



In [None]:
# 计算余弦距离
cosine_distance_matrix = cosine_distances(bert_vectors)

# 使用层次聚类，并替换 affinity 参数为 metric
distance_threshold = 0.03  # 设置距离阈值（可以根据需要调整）
hierarchical = AgglomerativeClustering(n_clusters=None, metric='precomputed', linkage='average', distance_threshold=distance_threshold)
clusters_hier = hierarchical.fit_predict(cosine_distance_matrix)

# 将聚类标签添加到原始 DataFrame 中
data['Cluster'] = clusters_hier

# 按照 Cluster 列升序排序
data_sorted = data.sort_values(by='Cluster')

# 输出结果
print(data_sorted)

# 如果需要保存结果为文件
data_sorted.to_csv('clustered_job_titles.csv', index=False)

## 使用聚类

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
import numpy as np

# 读取CSV文件，确保源文件是UTF-8编码
file_path = 'change/Boss直聘--修_2.csv'
df = pd.read_csv(file_path, encoding='utf-8')

# 使用“职位名称”列进行处理
df['combined'] = df['工作内容'].fillna('')

# 文本向量化
nltk.download('stopwords')
stop_words = stopwords.words('chinese')
vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)
X = vectorizer.fit_transform(df['combined'])

# 标准化数据
scaler = StandardScaler(with_mean=False).fit(X)
X_scaled = scaler.transform(X)

# 网格搜索参数
best_eps = None
best_min_samples = None
best_score = -1
for eps in np.arange(0.1, 1.1, 0.1):
    for min_samples in range(2, 10):
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
        labels = dbscan.fit_predict(X_scaled)
        num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        num_noise = list(labels).count(-1)
        score = num_clusters - num_noise / float(len(labels))  # 这只是一个简单的指标，你可以用其他指标
        if score > best_score:
            best_score = score
            best_eps = eps
            best_min_samples = min_samples

print(f'最佳eps: {best_eps}, 最佳min_samples: {best_min_samples}, 得分: {best_score}')

# 用最佳参数再次运行DBSCAN
dbscan = DBSCAN(eps=best_eps, min_samples=best_min_samples, metric='cosine')
df['cluster'] = dbscan.fit_predict(X_scaled)

# 按照 cluster 列进行升序排序
df_sorted = df.sort_values(by='cluster')

# 将结果保存到CSV文件，确保输出文件编码为 UTF-8
output_file_path = 'change/dbscan_clustered_sorted_jobs.csv'
df_sorted.to_csv(output_file_path, index=False, encoding='utf-8')

print(f'已完成聚类，结果按cluster升序排序，并保存到 {output_file_path}')