# 建模方案
为了构建岗位匹配度和求职者满意度模型，首先使用 preprocessing 对数据进行预处理：

1. 对result1-1.csv进行清洗和处理：
+ 去除重复的招聘信息id；
+ 对工作描述进行分词；
+ 对员工数量、学历、岗位经验等分类数据进行数字化处理，便于后续计算。
2. 对result1-2.csv进行清洗和处理：
+ 去除重复的求职者id；
+ 对自我评价进行分词；
+ 将预期岗位和技能进行关键词提取和分词处理，便于后续计算。

# 1 岗位匹配度

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import cpca
# Load the preprocessed job postings and job-seeker profiles.
job_info = pd.read_csv('result1-1_num.csv')
job_seekers = pd.read_csv('result1-2_num.csv')

# Cross join: a constant helper key pairs every posting with every seeker;
# the helper key is dropped again once the pairs have been built.
job_matching = (
    job_info.assign(key=1)
    .merge(job_seekers.assign(key=1), on='key')
    .drop(columns='key')
)


Unnamed: 0,序号_x,招聘信息id,企业名称,招聘岗位,最低工资,最高工资,工作类型,学历要求,工作经验要求,招聘人数,...,工作性质,期望行业,工作地区,到岗时间,简历关键词,学历,学科专业,学历数值,工作经验数值,自我评价-分词
0,1,1631112859985510400,深圳市海柔创新科技有限公司,会计实习生,3500,7000,2,2,不限,2,...,1,"[""互联网""]","[""北京市"",""北京市"",""朝阳区""]",随时到岗,"['爱唱《雪》', '泰酷辣']",无,无,0,2,"['诞生', '1996', '无', '初中', '黄毛', '高中', '体育', '生..."
1,1,1631112859985510400,深圳市海柔创新科技有限公司,会计实习生,3500,7000,2,2,不限,2,...,0,"[""不限""]","[""广东省"",""广州市"",""天河区""]",时间待议,"['统计专业', 'spss', 'mysql', 'python']",本科,应用统计学,3,1,"['统计', '专业', '学生', '熟悉', 'SPSS', '软件', 'pyhton..."
2,1,1631112859985510400,深圳市海柔创新科技有限公司,会计实习生,3500,7000,2,2,不限,2,...,1,"[""游戏"",""媒体"",""广告营销""]","[""北京市"",""北京市"",""东城区""]",随时到岗,"['都说窝吸你啦', '你耳隆吗', '白胡子', '狐臭', '国服第一男枪']",无,无,0,10,"['国服', '第一', '男枪', '一枪', '泰迪', '全部', '做', '掉',..."
3,1,1631112859985510400,深圳市海柔创新科技有限公司,会计实习生,3500,7000,2,2,不限,2,...,1,"[""不限"",""广告营销"",""媒体""]","[""内蒙古自治区"",""包头市"",""固阳县""]",1周后到岗,"['ikun', '偶像练习生', '唱跳rap篮球', '鸡你太美', '黑色吊带裤']",技工,唱跳rap篮球,1,2,"['全民', '制作', '人们', '大家', '好', '练习', '时', '长', ..."
4,1,1631112859985510400,深圳市海柔创新科技有限公司,会计实习生,3500,7000,2,2,不限,2,...,1,"[""不限""]","[""广东省"",""广州市"",""白云区""]",随时到岗,[],无,无,0,0,"['1', '曾经', '学校', '社团', '里面', '当人', '副', '部长',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16977215,1570,1463031842339946496,中数通信息有限公司,前端开发工程师,8000,12000,2,3,经验不限,0,...,1,"[""互联网""]",无,无,[],无,无,0,0,['无']
16977216,1570,1463031842339946496,中数通信息有限公司,前端开发工程师,8000,12000,2,3,经验不限,0,...,1,"[""互联网""]",无,无,[],无,无,0,0,['无']
16977217,1570,1463031842339946496,中数通信息有限公司,前端开发工程师,8000,12000,2,3,经验不限,0,...,1,"[""互联网""]",无,无,[],无,无,0,0,['无']
16977218,1570,1463031842339946496,中数通信息有限公司,前端开发工程师,8000,12000,2,3,经验不限,0,...,1,"[""互联网""]",无,无,[],无,无,0,0,['无']


In [2]:
# The posting's salary range and the seeker's expected range share one scale,
# so a single global minimum/maximum is taken across all four columns.
salary_columns = ['最低工资', '最高工资', '期望薪资低', '期望薪资高']
# Reduce per column, then across columns; the pandas reductions skip NaN,
# whereas the builtin max()/min() over a Series is order-dependent with NaN.
max_value = job_matching[salary_columns].max().max()
min_value = job_matching[salary_columns].min().min()


def min_max_normalize(x):
    """Rescale *x* with the global ``min_value`` / ``max_value``.

    Works on a scalar or on a whole Series. When all salaries are equal
    (``min_value == max_value``) *x* is returned unchanged to avoid a
    division by zero.
    """
    if min_value == max_value:
        return x
    return (x - min_value) / (max_value - min_value)


# Normalize each column in one vectorized operation instead of calling the
# function once per cell; the resulting values are the same.
for column in salary_columns:
    job_matching[column] = min_max_normalize(job_matching[column])

In [3]:
# Job-matching score for one posting/seeker pair, plus its private helpers.
import functools

# Weight of each component in the final matching score.
_MATCH_WEIGHTS = {
    'salary': 0.142703496,
    'experience': 0.150220694,
    'education': 0.25707581,
    'skill': 0.178125,
    'location': 0.065625,
    'similarity': 0.20625,
}


def _parse_keywords(value):
    """Return *value* as a list of keywords.

    Accepts an actual list/tuple/set, a missing value (-> empty list), or a
    string. Strings are first parsed as a Python list literal (the form a
    list takes after a CSV round trip, e.g. ``"['spss', 'python']"``); if
    that fails the string is split on ASCII/full-width commas.
    """
    import ast

    if isinstance(value, (list, tuple, set)):
        return list(value)
    if pd.isna(value):
        return []
    if not isinstance(value, str):
        return [value]
    text = value.strip()
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [part.strip() for part in text.replace('，', ',').split(',') if part.strip()]


def _skill_score(resume_keywords, required_keywords):
    """Return matched resume keywords divided by the number of required ones.

    Returns 0 when the posting lists no keywords, instead of dividing by zero.
    """
    resume = _parse_keywords(resume_keywords)
    required = _parse_keywords(required_keywords)
    if not required:
        return 0
    matched = sum(1 for keyword in resume if keyword in required)
    return matched / len(required)


@functools.lru_cache(maxsize=None)
def _location_parts(address):
    """Resolve *address* into (province, city, district) using cpca.

    Cached because the same address string recurs on many rows; the number of
    distinct addresses is bounded by the input tables. Non-string input
    (e.g. NaN) yields three ``None`` values without calling cpca.
    """
    if not isinstance(address, str):
        return (None, None, None)
    parsed = cpca.transform([address])
    return parsed['省'][0], parsed['市'][0], parsed['区'][0]


def _location_score(seeker_address, job_address):
    """Return 0.5 / 0.3 / 0.2 credit for equal province / city / district.

    A level that could not be resolved on the seeker side (None or empty)
    never counts as a match, so two unparseable addresses do not score as
    "same place".
    """
    score = 0
    level_weights = (0.5, 0.3, 0.2)
    seeker_levels = _location_parts(seeker_address)
    job_levels = _location_parts(job_address)
    for weight, seeker_level, job_level in zip(level_weights, seeker_levels, job_levels):
        if seeker_level and seeker_level == job_level:
            score += weight
    return score


def _text_similarity(job_text, seeker_text):
    """Return the TF-IDF cosine similarity of the two texts, or 0.0.

    The vocabulary is fitted on the job text only, as in the original code.
    0.0 is returned when either text is not a string or when the job text
    produces an empty vocabulary (TfidfVectorizer raises ValueError then).
    """
    if not isinstance(job_text, str) or not isinstance(seeker_text, str):
        return 0.0
    vectorizer = TfidfVectorizer()
    try:
        job_vector = vectorizer.fit_transform([job_text])
    except ValueError:
        return 0.0
    seeker_vector = vectorizer.transform([seeker_text])
    return cosine_similarity(seeker_vector, job_vector)[0][0]


def calculate_match(row):
    """Score how well one job seeker fits one job posting.

    Args:
        row: One row of the merged posting/seeker table. Reads the four
            salary columns, the two keyword columns, the experience and
            education columns, both address columns, ``求职者id`` and the two
            tokenized text columns.

    Returns:
        0 when the seeker has less experience or a lower education level
        than the posting requires; otherwise the weighted sum of the salary,
        experience, education, skill, location and text-similarity
        components.
    """
    # Salary: no credit when the offered and expected ranges do not overlap;
    # otherwise the gap between the offered and the expected upper bound.
    if row['最低工资'] > row['期望薪资高'] or row['最高工资'] < row['期望薪资低']:
        salary_match = 0
    else:
        salary_match = row['最高工资'] - row['期望薪资高']

    # Hard gates come first so disqualified pairs skip the expensive
    # keyword, address and text work below.
    if row['工作经验数值'] < row['工作经验要求数值']:
        return 0
    exp_diff = abs(row['工作经验数值'] - row['工作经验要求数值'])
    exp_satisfaction = max(0, 1 - exp_diff / 5)

    # ">=" so that an exact education match qualifies (and scores 1.0);
    # a NaN on either side fails the comparison and is rejected.
    if not row['学历数值'] >= row['学历要求']:
        return 0
    edu_diff = row['学历数值'] - row['学历要求']
    edu_satisfaction = max(0, 1 - edu_diff / 3)

    skill_satisfaction = _skill_score(row['简历关键词'], row['技能关键词(技能关键词id)'])
    work_loc_satisfaction = _location_score(row['工作地区'], row['工作地点'])

    # Text similarity only applies when the row actually carries a seeker.
    if pd.isna(row['求职者id']):
        similarity = 0
    else:
        similarity = _text_similarity(row['职位描述-分词'], row['自我评价-分词'])

    total_satisfaction = (
        _MATCH_WEIGHTS['salary'] * salary_match
        + _MATCH_WEIGHTS['experience'] * exp_satisfaction
        + _MATCH_WEIGHTS['education'] * edu_satisfaction
        + _MATCH_WEIGHTS['skill'] * skill_satisfaction
        + _MATCH_WEIGHTS['location'] * work_loc_satisfaction
        + _MATCH_WEIGHTS['similarity'] * similarity
    )
    return total_satisfaction

In [4]:


# Score every posting/seeker pair, one row at a time.
job_matching['匹配度'] = job_matching.apply(calculate_match, axis='columns')
# Order the pairs from best to worst match.
job_matching = job_matching.sort_values('匹配度', ascending=False)
# Keep only pairs with a strictly positive score.
job_matching = job_matching.loc[job_matching['匹配度'].gt(0)]
# Persist the id pair together with its score.
job_matching.loc[:, ['招聘信息id', '求职者id', '匹配度']].to_csv('result3-1.csv', index=False)


# 2 求职者满意度

In [5]:
import pandas as pd
from ast import literal_eval
# Load the preprocessed job postings and job-seeker profiles.
job_info = pd.read_csv('result1-1_num.csv')
job_seekers = pd.read_csv('result1-2_num.csv')

# The expected positions are stored as list literals in the CSV; turn each
# string back into a real list.
job_seekers['期望岗位'] = job_seekers['期望岗位'].map(literal_eval)

# One row per (seeker, expected position), with the position column renamed
# so that both tables share the join column.
job_seekers_all = (
    job_seekers
    .explode('期望岗位')
    .rename(columns={'期望岗位': '岗位名称'})
)
job_info = job_info.rename(columns={'招聘岗位': '岗位名称'})

# Left join on the position name: every posting is kept, paired with each
# seeker who listed that position.
job_satisfaction = job_info.merge(job_seekers_all, how='left', on='岗位名称')


In [6]:
# The posting's salary range and the seeker's expected range share one scale,
# so a single global minimum/maximum is taken across all four columns.
salary_columns = ['最低工资', '最高工资', '期望薪资低', '期望薪资高']
# Reduce per column, then across columns; the pandas reductions skip the NaN
# values introduced by the left join, whereas the builtin max()/min() over a
# Series is order-dependent with NaN.
max_value = job_satisfaction[salary_columns].max().max()
min_value = job_satisfaction[salary_columns].min().min()


def min_max_normalize(x):
    """Rescale *x* with the global ``min_value`` / ``max_value``.

    Works on a scalar or on a whole Series. When all salaries are equal
    (``min_value == max_value``) *x* is returned unchanged to avoid a
    division by zero.
    """
    if min_value == max_value:
        return x
    return (x - min_value) / (max_value - min_value)


# Normalize each column in one vectorized operation instead of calling the
# function once per cell; the resulting values are the same.
for column in salary_columns:
    job_satisfaction[column] = min_max_normalize(job_satisfaction[column])

In [7]:

# Job-seeker satisfaction for one posting/seeker pair, plus its private
# helpers. The merged posting/seeker row is scored component by component.
import functools

# Weight of each component in the final satisfaction score.
_SATISFACTION_WEIGHTS = {
    'salary': 0.283328496,
    'experience': 0.087720694,
    'education': 0.11645081,
    'skill': 0.128125,
    'location': 0.25625,
    'similarity': 0.128125,
}


def _parse_keywords(value):
    """Return *value* as a list of keywords.

    Accepts an actual list/tuple/set, a missing value (-> empty list), or a
    string. Strings are first parsed as a Python list literal (the form a
    list takes after a CSV round trip, e.g. ``"['spss', 'python']"``); if
    that fails the string is split on ASCII/full-width commas.
    """
    import ast

    if isinstance(value, (list, tuple, set)):
        return list(value)
    if pd.isna(value):
        return []
    if not isinstance(value, str):
        return [value]
    text = value.strip()
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [part.strip() for part in text.replace('，', ',').split(',') if part.strip()]


def _skill_score(resume_keywords, required_keywords):
    """Return matched resume keywords divided by the number of required ones.

    Returns 0 when the posting lists no keywords, instead of dividing by zero.
    """
    resume = _parse_keywords(resume_keywords)
    required = _parse_keywords(required_keywords)
    if not required:
        return 0
    matched = sum(1 for keyword in resume if keyword in required)
    return matched / len(required)


@functools.lru_cache(maxsize=None)
def _location_parts(address):
    """Resolve *address* into (province, city, district) using cpca.

    Cached because the same address string recurs on many rows; the number of
    distinct addresses is bounded by the input tables. Non-string input
    (e.g. NaN) yields three ``None`` values without calling cpca.
    """
    if not isinstance(address, str):
        return (None, None, None)
    parsed = cpca.transform([address])
    return parsed['省'][0], parsed['市'][0], parsed['区'][0]


def _location_score(seeker_address, job_address):
    """Return 0.5 / 0.3 / 0.2 credit for equal province / city / district.

    A level that could not be resolved on the seeker side (None or empty)
    never counts as a match, so two unparseable addresses do not score as
    "same place".
    """
    score = 0
    level_weights = (0.5, 0.3, 0.2)
    seeker_levels = _location_parts(seeker_address)
    job_levels = _location_parts(job_address)
    for weight, seeker_level, job_level in zip(level_weights, seeker_levels, job_levels):
        if seeker_level and seeker_level == job_level:
            score += weight
    return score


def _text_similarity(job_text, seeker_text):
    """Return the TF-IDF cosine similarity of the two texts, or 0.0.

    The vocabulary is fitted on the job text only, as in the original code.
    0.0 is returned when either text is not a string or when the job text
    produces an empty vocabulary (TfidfVectorizer raises ValueError then).
    """
    if not isinstance(job_text, str) or not isinstance(seeker_text, str):
        return 0.0
    vectorizer = TfidfVectorizer()
    try:
        job_vector = vectorizer.fit_transform([job_text])
    except ValueError:
        return 0.0
    seeker_vector = vectorizer.transform([seeker_text])
    return cosine_similarity(seeker_vector, job_vector)[0][0]


def calculate_satisfaction(row):
    """Score how satisfied one job seeker would be with one job posting.

    Args:
        row: One row of the merged posting/seeker table. Reads the four
            salary columns, the two keyword columns, the experience and
            education columns, both address columns, ``求职者id`` and the two
            tokenized text columns.

    Returns:
        0 when the seeker has less experience or a lower education level
        than the posting requires (this also rejects rows without a seeker,
        whose values are NaN); otherwise the weighted sum of the salary,
        experience, education, skill, location and text-similarity
        components.
    """
    # Salary: an offer whose range does not overlap the expected range earns
    # no salary credit. The original zeroed the value here but then kept
    # adding to it, so the check had no effect.
    salary_satisfaction = 0
    ranges_overlap = not (
        row['最低工资'] > row['期望薪资高'] or row['最高工资'] < row['期望薪资低']
    )
    if ranges_overlap:
        if row['期望薪资低'] > row['最低工资']:
            salary_satisfaction += row['期望薪资低'] - row['最低工资']
        if row['最高工资'] > row['期望薪资高']:
            salary_satisfaction += row['最高工资'] - row['期望薪资高']

    # Hard gates come first so disqualified pairs skip the expensive
    # keyword, address and text work below.
    if row['工作经验数值'] < row['工作经验要求数值']:
        return 0
    exp_diff = abs(row['工作经验数值'] - row['工作经验要求数值'])
    exp_satisfaction = max(0, 1 - exp_diff / 5)

    # ">=" so that an exact education match qualifies (and scores 1.0);
    # a NaN on either side fails the comparison and is rejected.
    if not row['学历数值'] >= row['学历要求']:
        return 0
    edu_diff = row['学历数值'] - row['学历要求']
    edu_satisfaction = max(0, 1 - edu_diff / 3)

    skill_satisfaction = _skill_score(row['简历关键词'], row['技能关键词(技能关键词id)'])
    work_loc_satisfaction = _location_score(row['工作地区'], row['工作地点'])

    # Test for a missing id explicitly: after the left join the id column is
    # float for every row, so a type check would skip similarity everywhere.
    if pd.isna(row['求职者id']):
        similarity = 0
    else:
        similarity = _text_similarity(row['职位描述-分词'], row['自我评价-分词'])

    total_satisfaction = (
        _SATISFACTION_WEIGHTS['salary'] * salary_satisfaction
        + _SATISFACTION_WEIGHTS['experience'] * exp_satisfaction
        + _SATISFACTION_WEIGHTS['education'] * edu_satisfaction
        + _SATISFACTION_WEIGHTS['skill'] * skill_satisfaction
        + _SATISFACTION_WEIGHTS['location'] * work_loc_satisfaction
        + _SATISFACTION_WEIGHTS['similarity'] * similarity
    )
    return total_satisfaction


In [8]:

# Score every posting/seeker row, then report completion.
job_satisfaction['满意度'] = job_satisfaction.apply(calculate_satisfaction, axis='columns')
print('finished')


finished


In [9]:

# Rank the pairs from most to least satisfied.
job_satisfaction = job_satisfaction.sort_values('满意度', ascending=False)
# Keep only pairs with a strictly positive score.
job_satisfaction = job_satisfaction.loc[job_satisfaction['满意度'].gt(0)]
# One row per (seeker, posting); the first occurrence in the sorted frame is kept.
job_satisfaction = job_satisfaction.drop_duplicates(subset=['求职者id', '招聘信息id'])
# Persist the id pair, the company name and the score.
job_satisfaction.loc[:, ['求职者id', '招聘信息id', '企业名称', '满意度']].to_csv('result3-2.csv', index=False)
