# In [None]:



from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics.pairwise import cosine_similarity

class SimilarSkillMerger:
    """Merge near-duplicate skill names within each job via embedding similarity.

    Skill names are embedded with ``model``, turned into a cosine-distance
    matrix and clustered with DBSCAN. Every cluster is collapsed into a single
    entry whose count is the sum of its members' counts.
    """

    def __init__(self, model):
        """Store the embedding model.

        Args:
            model: Object exposing ``encode(list_of_str)``; the result is
                passed straight to ``cosine_similarity``, so it is expected
                to be a 2-D array with one row per input string.
        """
        self.model = model

    def merge_similar_skills(self, job_tec_json, threshold=0.6):
        """Return a new mapping in which similar skills of each job are merged.

        Args:
            job_tec_json: Mapping ``job name -> {skill name: count}``. It is
                not modified.
            threshold: Cosine-similarity level from which the DBSCAN radius
                is derived (``eps = 1 - threshold``). Values of 1 or more
                give a non-positive ``eps``, which DBSCAN is expected to
                reject.

        Returns:
            dict: ``job name -> {skill name: count}``. Skills that end up in
            the same cluster are joined with ``' & '`` and their counts are
            summed; skills that are alone in their cluster keep their name
            and count.
        """
        merged_job_tec_json = {}
        for job_name, skills in job_tec_json.items():
            skill_names = list(skills)

            # With fewer than two skills there is nothing to merge. Returning
            # early also avoids encoding/clustering an empty list, which the
            # similarity and DBSCAN calls do not accept.
            if len(skill_names) < 2:
                merged_job_tec_json[job_name] = dict(skills)
                continue

            skill_embeddings = self.model.encode(skill_names)

            # Clip similarities to [0, 1] so the derived distances are never
            # negative (a precomputed distance matrix must be non-negative).
            similarities = np.clip(cosine_similarity(skill_embeddings), 0, 1)
            distance_matrix = 1 - similarities

            # min_samples=1 makes every point a core point, so no skill is
            # labelled as noise; each one lands in some cluster.
            db = DBSCAN(eps=1 - threshold, min_samples=1,
                        metric='precomputed', n_jobs=-1)
            db.fit(distance_matrix)

            # Group skill indices by cluster label in a single pass.
            clusters = defaultdict(list)
            for index, label in enumerate(db.labels_):
                clusters[int(label)].append(index)

            merged_skills = {}
            for label in sorted(clusters):
                members = [skill_names[i] for i in clusters[label]]
                # For a one-member cluster the join yields the original name
                # and the sum yields the original count.
                merged_name = ' & '.join(members)
                merged_skills[merged_name] = sum(skills[name] for name in members)

            merged_job_tec_json[job_name] = merged_skills

        return merged_job_tec_json

class JobSkillsAnalyzer:
    """Load per-job skill lists from a CSV file and analyse them with embeddings.

    Attributes are populated eagerly in ``__init__``:
    ``job_skills_dict`` maps a job title to a list of skill lists (one per
    CSV row) and ``job_tec_json`` maps a job title to ``{skill: count}``.
    """

    def __init__(self, csv_path, model):
        """Read the CSV, count skills per job and prepare the skill merger.

        Args:
            csv_path: Path (or anything ``pandas.read_csv`` accepts) of a CSV
                with the columns ``职位名称`` (job title) and ``技术栈``
                (newline-separated skills).
            model: Embedding model exposing ``encode(list_of_str)``.
        """
        self.csv_path = csv_path
        self.model = model
        self.job_skills_dict = self.load_job_skills()
        self.job_tec_json = self.process_job_skills()
        self.skill_merger = SimilarSkillMerger(model)

    @staticmethod
    def _parse_skills(skills_str):
        """Split a newline-separated skill cell into a list of clean skill names."""
        parsed = []
        for line in skills_str.split('\n'):
            if not line.strip():
                continue
            # Drop everything up to the first ". " — this removes enumeration
            # markers such as "1. " in front of the skill name.
            if '. ' in line:
                line = line.split('. ', 1)[1]
            name = line.strip()
            # A line like "3. " leaves nothing behind; do not record it.
            if name:
                parsed.append(name)
        return parsed

    def load_job_skills(self):
        """Read ``self.csv_path`` and group parsed skill lists by job title.

        Rows whose job title or skill cell is missing are skipped, because a
        missing cell is read as NaN and cannot be split into skills.

        Returns:
            defaultdict: ``job title -> list of skill lists`` (one inner list
            per usable CSV row).
        """
        job_skills_dict = defaultdict(list)
        df = pd.read_csv(self.csv_path)
        for job_title, skills_cell in zip(df['职位名称'], df['技术栈']):
            if pd.isna(job_title) or pd.isna(skills_cell):
                continue
            job_skills_dict[job_title].append(self._parse_skills(str(skills_cell)))
        return job_skills_dict

    def process_job_skills(self):
        """Count how often each skill occurs for every job title.

        Returns:
            dict: ``job title -> {skill name: occurrence count}``.
        """
        return {
            job_title: dict(Counter(
                skill for skills in skills_list for skill in skills
            ))
            for job_title, skills_list in self.job_skills_dict.items()
        }

    def get_top_skills(self, job_title, N):
        """Return the ``N`` most frequent skills of ``job_title``.

        Args:
            job_title: Key of ``self.job_tec_json``.
            N: Maximum number of skills to return.

        Returns:
            dict: ``{skill name: count}`` ordered by descending count; ties
            keep their original relative order.

        Raises:
            KeyError: If ``job_title`` is unknown.
        """
        skills = self.job_tec_json[job_title]
        sorted_skills = sorted(skills.items(), key=lambda item: item[1], reverse=True)
        return dict(sorted_skills[:N])

    def merge_similar_skills(self, threshold=0.6):
        """Replace ``self.job_tec_json`` with its similar-skill-merged version.

        Args:
            threshold: Similarity threshold forwarded to the skill merger.
        """
        self.job_tec_json = self.skill_merger.merge_similar_skills(self.job_tec_json, threshold)

    def find_most_similar_job(self, user_input):
        """Return the known job title whose embedding is closest to ``user_input``.

        Args:
            user_input: Free-text job description or title.

        Returns:
            The job title with the highest cosine similarity to the input.

        Raises:
            ValueError: If no job titles are loaded.
        """
        job_titles = list(self.job_tec_json)
        if not job_titles:
            raise ValueError("no job titles available to compare against")

        job_embeddings = self.model.encode(job_titles)
        user_embedding = self.model.encode([user_input])

        # One row (the user input) against every job title.
        similarities = cosine_similarity(user_embedding, job_embeddings)
        most_similar_index = int(np.argmax(similarities))
        return job_titles[most_similar_index]

    def summarize_skills_by_category(self, N=5):
        """Pick one representative skill per job title.

        For each job, the top ``N`` skills are embedded and the skill whose
        embedding lies closest (Euclidean distance) to their mean is chosen.

        Args:
            N: Number of top skills considered per job.

        Returns:
            dict: ``job title -> representative skill name``. Jobs without
            any skill to consider are left out.
        """
        summaries = {}
        for job_title in self.job_tec_json:
            top_names = list(self.get_top_skills(job_title, N))
            if not top_names:
                # No skills means no centroid and nothing to choose from.
                continue
            # Encode once and reuse for both the centroid and the distances.
            embeddings = np.asarray(self.model.encode(top_names))
            centroid = embeddings.mean(axis=0)
            distances = np.linalg.norm(embeddings - centroid, axis=1)
            summaries[job_title] = top_names[int(np.argmin(distances))]
        return summaries

# Example usage: build an analyzer from a CSV export and a local
# sentence-embedding model, merge similar skills, then print one
# representative skill per job title.
csv_path = 'Boss直聘.csv'
model_path = './paraphrase-multilingual-MiniLM-L12-v2'

# Load the embedding model from the local directory
model = SentenceTransformer(model_path)
analyzer = JobSkillsAnalyzer(csv_path, model)
analyzer.merge_similar_skills()

# Compute and print the summary skill for each job title
summaries = analyzer.summarize_skills_by_category(N=5)
for job_title, summary in summaries.items():
    print(f"{job_title}: {summary}")


