## 一、API调用模型处理数据

In [None]:
import requests

def call_large_model_api(user_prompt, user_input, timeout=120):
    """Send one chat-completion request to the model server and return the raw response.

    Args:
        user_prompt: Instruction text; it is sent as the ``system`` message.
        user_input: Text to process; it is sent as the ``user`` message.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would block its worker forever.

    Returns:
        The ``requests.Response`` object, unmodified. The HTTP status is not
        checked here; callers decode it with ``response.json()``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` elapses.
    """
    # Replace with the actual API URL.
    # NOTE(review): the query string says model=llama while the JSON body
    # says "llama3" -- confirm which one the server actually honours.
    url = 'http://117.161.233.106:8000/v1/chat/completions?model=llama'
    headers = {
        'Content-Type': 'application/json',
        # Add the actual API key header here if the server requires one.
    }
    data = {
        "model": "llama3",
        "stream": False,
        "temperature": 0.01,
        "max_tokens": 1024,
        # NOTE(review): 10 is a very strong penalty; confirm it is intended.
        "repetition_penalty": 10,
        "top_p": 0.8,
        "do_sample": True,
        "messages": [
            {
                "role": "system",
                "content": user_prompt
            },
            {
                "role": "user",
                "content": user_input
            }
        ]
    }

    return requests.post(url, json=data, headers=headers, timeout=timeout)


In [108]:

import concurrent.futures
import csv
from tqdm import tqdm
import requests

# Process a single CSV row: ask the model for the tech stack and append it.
def process_row(row):
    """Append the model-extracted tech stack to ``row`` and return the same list.

    The text in column 18 is sent to ``call_large_model_api`` together with a
    prompt asking for the tech stack. On any failure (request error, bad JSON,
    unexpected response shape, short row) the error is printed and an empty
    string is appended instead, so the row always grows by exactly one field.

    Earlier variants of this step sent ``row[12]`` with prompts asking for the
    job requirements and the job content; only the tech-stack pass is active.
    """
    prompt_3 = "从任职要求提取技术栈\n- Return format: 技术栈: \n1. \n2.... \n - Always reply Simplified Chinese, not English, otherwise I will be very angry."

    try:
        payload = call_large_model_api(prompt_3, row[18]).json()
        extracted = payload["choices"][0]["message"]["content"]
    except Exception as e:
        # Best effort: keep the row and leave the new column empty.
        print(f"Error processing row: {e}")
        extracted = ""

    row.append(extracted)
    return row

# Main entry point
def main():
    """Run ``process_row`` over every data row of the input CSV and write the results.

    Reads 'Boss直聘--修_1.csv', skips its header line, processes all rows in a
    process pool, and writes a fixed header followed by the processed rows to
    'Boss直聘--修_2.csv'. Rows are written in completion order, which is not
    necessarily the input order, and are flushed to the writer in batches of
    100 with a final flush for the remainder.
    """
    # NOTE(review): both files are opened with the platform default encoding;
    # confirm that matches how the CSV was produced.
    with open('Boss直聘--修_1.csv', 'r', newline='') as csvfile, \
         open('Boss直聘--修_2.csv', 'w', newline='') as output_file:

        reader = csv.reader(csvfile)
        writer = csv.writer(output_file)

        # Write the header row (the original columns plus the new tech-stack column).
        writer.writerow(['职位名称', '工作地址','学历要求', '工作年限要求','招聘人数','薪资待遇','公司行业','公司性质','公司规模','融资阶段','招聘状态','职位类型','岗位描述','公司介绍','公司工商信息','简历详情页地址','更新日期','工作内容','入职要求','技术栈'])
        # Skip the input header; the default keeps an empty file from raising StopIteration.
        next(reader, None)

        # Materialise all rows so they can be submitted to the pool up front.
        rows = list(reader)

        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = [executor.submit(process_row, row) for row in rows]

            batch = []
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing rows", unit="row"):
                batch.append(future.result())

                # Write every 100 rows and start a new batch.
                if len(batch) >= 100:
                    writer.writerows(batch)
                    batch.clear()

            # Flush the trailing partial batch; without this, the last
            # len(rows) % 100 rows would never reach the output file.
            if batch:
                writer.writerows(batch)

# Run the main function when this file is executed as a script
if __name__ == "__main__":
    main()

Processing rows: 100%|██████████| 1000/1000 [13:18<00:00,  1.25row/s]


## 二、整合相似工作岗位

In [109]:
import pandas as pd

# Load the CSV produced by main() (the rows with the extra tech-stack column) into a DataFrame
file_path = 'Boss直聘--修_2.csv'
df = pd.read_csv(file_path)

# NOTE: no preview call (e.g. df.head()) follows here, so nothing is displayed by this cell.



In [115]:
# Give every distinct job title (compared case-insensitively) an integer
# cluster id: the category code shifted up by one.
df['cluster'] = (
    df['职位名称']
    .str.lower()
    .astype('category')
    .cat.codes
    + 1
)

# Order the rows by cluster id and persist the result to a new CSV file.
sorted_file_path = 'sorted_out.csv'
sorted_df = df.sort_values(by='cluster')
sorted_df.to_csv(sorted_file_path, index=False)

# Echo the output path as the cell result.
sorted_file_path


'sorted_out.csv'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to merge clusters based on job content similarity
def merge_clusters(df, similarity_threshold=0.8):
    # Calculate cosine similarity matrix
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['工作内容'])
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Initialize an empty dictionary to store merged clusters
    merged_clusters = {}

    # Iterate over each row in the DataFrame
    for i in range(len(df)):
        # If the row's cluster is not in the merged_clusters dictionary, add it
        if df.at[i, 'cluster'] not in merged_clusters:
            merged_clusters[df.at[i, 'cluster']] = df.at[i, 'cluster']
        
        # Compare the current row's job content with all other rows
        for j in range(i+1, len(df)):
            # If the cosine similarity is above the threshold and the clusters are different
            if cosine_sim[i][j] > similarity_threshold and df.at[i, 'cluster'] != df.at[j, 'cluster']:
                # Merge the clusters of the two rows
                merged_cluster = min(merged_clusters[df.at[i, 'cluster']], merged_clusters[df.at[j, 'cluster']])
                merged_clusters[df.at[i, 'cluster']] = merged_cluster
                merged_clusters[df.at[j, 'cluster']] = merged_cluster

    # Replace the original clusters with the merged clusters
    df['cluster'] = df['cluster'].map(merged_clusters)

    return df

# Merge clusters based on job content similarity, using the default threshold
# of 0.8; merge_clusters updates the 'cluster' column and returns the frame.
df = merge_clusters(df)

# Display the first rows of the DataFrame with the updated 'cluster' column
df.head()
