# Загрузка данных

In [1]:
import pandas as pd
import ast
import numpy as np
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
!pip install CatBoost




[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Read both raw tables; the resume table loses its currency column and the
# vacancy table is narrowed to the columns used further down.
vac = pd.read_csv('vacancy.csv')
res = pd.read_csv('resume.csv').drop(columns='salary_currency')
vac = vac.loc[:, [
    'id', 'experience', 'accept_kids', 'area', 'name',
    'description', 'professional_roles', 'key_skills', 'languages',
]]

## Обработка резюме

In [4]:
def year_ex(i):
    """Extract the minimum number of years from an experience requirement.

    Parameters
    ----------
    i : str or number
        Text such as 'От 1 года до 3 лет' or 'Более 6 лет'. A value that is
        already numeric is returned as an int, so re-running the conversion
        cell on converted data does not fail.

    Returns
    -------
    int
        The number that directly follows the word 'от' or 'более'; 0 when
        there is no such number or the value is missing (None / NaN).
    """
    if not isinstance(i, str):
        try:
            return int(i)
        except (TypeError, ValueError, OverflowError):
            # None, NaN and infinities carry no usable requirement.
            return 0

    tokens = i.lower().split()
    # Match whole words only: a substring test would also hit words such as
    # 'работы', which contain the letters 'от'.
    for keyword, amount in zip(tokens, tokens[1:]):
        if keyword in ('от', 'более') and amount.isdecimal():
            return int(amount)
    return 0
# Replace the textual experience requirement with the number returned by year_ex.
vac['experience'] = vac['experience'].apply(year_ex)

In [5]:
def calculate_age(birthdate, today=None):
    """Return the number of full years between ``birthdate`` and ``today``.

    Parameters
    ----------
    birthdate : datetime-like
        Object exposing ``year``, ``month`` and ``day``.
    today : datetime-like, optional
        Reference date. Defaults to ``datetime.today()``; pass a fixed date to
        make the result reproducible.

    Returns
    -------
    int
        Age in completed years.
    """
    if today is None:
        today = datetime.today()
    # The tuple comparison is True (counts as 1) when this year's birthday
    # has not happened yet.
    birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
    return today.year - birthdate.year - birthday_pending

# Parse birth dates to datetimes, then derive the 'age' column with calculate_age.
res['birth_date'] = pd.to_datetime(res['birth_date'])
res['age'] = res['birth_date'].apply(calculate_age)

In [6]:
# Mapping from English education-level labels to their Russian counterparts.
replace_dict = {
    'Higher education (bachelor)': 'Высшее образование (Бакалавр)',
    'Higher education (master)': 'Высшее образование (Магистр)',
    'Higher education': 'Высшее образование',
    'Higher education (Doctor of Science)': 'Высшее образование (Доктор наук)',
    'Incomplete higher education': 'Неоконченное высшее образование',
    'Secondary special education': 'Среднее специальное образование',
    'Education': 'Образование'
}
# Whole-value replacement: cells equal to a key are swapped for the mapped label.
res['education_level'] = res['education_level'].replace(replace_dict)

In [7]:
def dicts_in_string(input_string):
    """Summarise a stringified list of job records.

    Parameters
    ----------
    input_string : str
        Python-literal list of dicts with keys 'start', 'end' and
        'description'. Dates use the '%d-%m-%Y' format; an 'end' of None
        means the job is still ongoing.

    Returns
    -------
    tuple of (float, str)
        Total experience in years (summed days / 365) and all job
        descriptions joined into one text, each followed by '. '.
    """
    date_format = "%d-%m-%Y"
    total_days = 0
    text = ''
    for job in ast.literal_eval(input_string):
        started = datetime.strptime(job['start'], date_format)
        # An open-ended job is counted up to the current date.
        if job['end'] is None:
            finished = datetime.today()
        else:
            finished = datetime.strptime(job['end'], date_format)
        total_days += (finished - started).days

        # A missing description is treated as empty instead of raising.
        description = job['description'] or ''
        # Line breaks become spaces so the words around them do not fuse.
        description = description.replace("\n", " ").replace("\r", " ").replace("●", "")
        text += description + '. '
    years = total_days / 365
    return years, text
# dicts_in_string returns a (years, text) pair per resume; zip(*...) splits the
# pairs into two columns. Note that 'exp_days' therefore holds years, not days.
res['exp_days'], res['exp_text'] = zip(*res['experience'].apply(dicts_in_string))

In [8]:
def merge_unique_elements(row):
    """Return the sorted unique values of the row's 'specialization_names' and 'title'."""
    candidates = [row[column] for column in ('specialization_names', 'title')]
    return np.unique(candidates)

# Overwrite 'specialization_names' with the per-row array built by merge_unique_elements.
res['specialization_names'] = res.apply(merge_unique_elements, axis=1)

In [9]:
def dicts_in_lang(input_string):
    """Flatten a stringified list of language dicts into 'name level, ' text.

    Each entry must provide string values under 'name' and 'level'. Returns
    the integer 0 when the parsed list is empty; otherwise every entry
    contributes 'name level, ' (the final separator is kept).
    """
    entries = ast.literal_eval(input_string)
    if len(entries) == 0:
        return 0
    return ''.join(entry['name'] + ' ' + entry['level'] + ', ' for entry in entries)
# Replace the raw language literal with the flattened text (or 0 when empty).
res['language'] = res['language'].apply(dicts_in_lang)

In [10]:
def merge_unique_elements2(row):
    """Combine a resume's 'skills' and 'exp_text' into one text.

    Parameters
    ----------
    row : mapping
        Must provide 'skills' and 'exp_text'; a value that is not a string
        (NaN, None) is skipped.

    Returns
    -------
    str
        Both texts with line breaks replaced by spaces, joined by a single
        space; '' when neither field holds text.
    """
    parts = []
    for column in ('skills', 'exp_text'):
        value = row[column]
        # isinstance is used instead of an identity test against np.NaN:
        # that alias no longer exists in NumPy 2.0 and identity only matches
        # the np.nan singleton.
        if isinstance(value, str):
            # Line breaks become spaces so neighbouring words do not fuse.
            parts.append(value.replace("\n", " ").replace("\r", " "))
    # A separator keeps the last skill from merging with the first word of
    # the experience text.
    return ' '.join(part for part in parts if part)

# Fold the experience descriptions into the 'skills' text column.
res['skills'] = res.apply(merge_unique_elements2, axis=1)

In [11]:
# Drop source columns: 'title', 'exp_text', 'birth_date' and 'experience' were
# already folded into derived columns above; 'skill_set' and 'salary_amount'
# are not used by the cells in this notebook.
res = res.drop(columns=['title', 'skill_set', 'exp_text', 'salary_amount', 'birth_date', 'experience'])

## Обработка вакансии

In [12]:
def merge_unique_elements_vac(row):
    """Return the sorted unique values of the parsed 'professional_roles' plus the vacancy 'name'."""
    names = list(ast.literal_eval(row['professional_roles']))
    names.append(row['name'])
    return np.unique(names)

# Overwrite 'professional_roles' with the unique role names plus the vacancy name.
vac['professional_roles'] = vac.apply(merge_unique_elements_vac, axis=1)

In [13]:
def merge_unique_elements2(row):
    """Combine a vacancy's 'description' and 'key_skills' into one text.

    NOTE: this redefines the resume helper of the same name from an earlier
    cell; re-running that earlier cell afterwards would pick up this version.

    Parameters
    ----------
    row : mapping
        Must provide 'description' and 'key_skills'. 'key_skills' may be a
        list, a stringified list as read from CSV, or missing (NaN / None).

    Returns
    -------
    str
        Description (line breaks replaced by spaces) followed by the skill
        names, separated by single spaces.
    """
    description = row['description']
    text1 = ''
    if isinstance(description, str):
        text1 = description.replace("\n", " ").replace("\r", " ")

    skills = row['key_skills']
    if isinstance(skills, str):
        # Joining a raw string would put a space between every character,
        # so a stringified list is parsed back into a list first.
        try:
            skills = ast.literal_eval(skills)
        except (ValueError, SyntaxError):
            skills = [skills]
    if isinstance(skills, (str, dict)):
        skills = [skills]
    elif not isinstance(skills, (list, tuple, set)):
        skills = []  # NaN / None: no skills listed
    # Entries may be plain strings or dicts carrying a 'name' key
    # (assumption based on the shape of the 'languages' column - verify).
    names = [
        skill['name'] if isinstance(skill, dict) and 'name' in skill else str(skill)
        for skill in skills
    ]
    text2 = ' '.join(names)
    # A separator keeps the end of the description from fusing with the first skill.
    return ' '.join(part for part in (text1, text2) if part)
# Fold the key skills into the 'description' text column.
vac['description'] = vac.apply(merge_unique_elements2, axis=1)

In [14]:
def dicts_in_lang_vac(input_string):
    """Flatten a stringified list of vacancy language dicts into 'name level, ' text.

    Each entry must provide 'name' and a nested 'level' dict with its own
    'name'. Returns the integer 0 when the parsed list is empty; otherwise
    every entry contributes 'name level, ' (the final separator is kept).
    """
    entries = ast.literal_eval(input_string)
    if len(entries) == 0:
        return 0
    return ''.join(
        entry['name'] + ' ' + entry['level']['name'] + ', ' for entry in entries
    )
# Replace the raw language literal with the flattened text (or 0 when empty).
vac['languages'] = vac['languages'].apply(dicts_in_lang_vac)

In [15]:
# 'name' was merged into 'professional_roles' and 'key_skills' into 'description' above.
vac = vac.drop(columns=['name', 'key_skills'])

## Очистка описания вакансий

In [16]:
import re
import nltk
import pymorphy2
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer

# Fetch the NLTK stop-word corpus (needs network access on first run).
nltk.download('stopwords')

# Stop words for both languages are merged into one set used by clean_text.
stop_words_ru = set(stopwords.words('russian'))
stop_words_en = set(stopwords.words('english'))
stop_words = stop_words_ru.union(stop_words_en)

# Morphological analyzer; clean_text uses it to reduce tokens to their normal form.
morph = pymorphy2.MorphAnalyzer()

def clean_text(text):
    """Normalise raw text into a list of lemmatised tokens.

    Steps: strip HTML-like tags, remove ':shortcode:' markers, replace
    punctuation with spaces, split on whitespace, lower-case, drop stop words
    and reduce each token to its normal form with the module-level ``morph``.

    Parameters
    ----------
    text : str or any
        Non-string values (e.g. NaN) are returned unchanged.

    Returns
    -------
    list of str
        Lemmatised tokens, or the input itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    text = re.sub(r"<.*?>", " ", text)
    # Must run before punctuation is stripped: once ':' has been replaced by
    # a space this pattern can no longer match anything.
    text = re.sub(r'(:\w+:)', '', text)
    text = re.sub(r"[^\w\s]", " ", text)
    # The tokenizer splits on whitespace, so repeated spaces need no separate
    # collapsing step.
    lowered = (token.lower() for token in WhitespaceTokenizer().tokenize(text))
    tokens = [token for token in lowered if token not in stop_words]
    tokens = [morph.parse(token)[0].normal_form for token in tokens]

    return tokens


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mashk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Tokenise and lemmatise the combined resume text.
res['skills'] = res['skills'].apply(clean_text)

In [None]:
# Tokenise and lemmatise the combined vacancy text.
vac['description'] = vac['description'].apply(clean_text)

## Дополнительная обработка

In [None]:
def count_dicts_in_string(input_string):
    """Return the number of elements in the Python literal stored in ``input_string``."""
    return len(ast.literal_eval(input_string))

In [None]:
# Number of entries in each resume's 'education' literal.
res['count_education'] = res['education'].apply(count_dicts_in_string)

## Перевод всех текстовых параметров в числа

In [None]:
# Expose the current row index as an explicit 'uuid' column identifying each resume.
res = res.reset_index().rename(columns={'index': 'uuid'})

In [None]:
# Cross join: one row for every (vacancy, resume) pair.
result = vac.merge(res, how='cross')

In [None]:
def check_experience(row):
    """Tell whether the candidate's experience meets the vacancy requirement.

    Compares ``row['exp_days']`` truncated to an integer against
    ``row['experience']`` and returns True when it is at least as large.
    """
    needed, worked = row['experience'], row['exp_days']
    return needed <= int(worked)

# Boolean flag per (vacancy, resume) pair produced by check_experience.
result['satisfies_experience'] = result.apply(check_experience, axis=1)

In [None]:
# TF-IDF similarity between vacancy roles and resume specialisations.
# The vocabulary is fitted on the vacancies only; resumes are projected onto it.
vacancy_texts = [' '.join(words) for words in vac['professional_roles']]
resume_texts = [' '.join(words) for words in res['specialization_names'].fillna('')]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(vacancy_texts)
feature_names = vectorizer.get_feature_names_out()
tfidf_matrix_resume = vectorizer.transform(resume_texts)

# cosine_similarity accepts sparse input, so no dense copy is made.
# Result shape: (number of vacancies, number of resumes).
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix_resume)

# Row-major flattening lists all resumes for vacancy 0, then vacancy 1, ...,
# the same order the cross join used to build `result`. Assigning the raw
# array avoids index alignment, which would silently produce NaN if
# `result` did not carry a default 0..n-1 index.
assert cosine_similarities.size == len(result), "similarity matrix does not match the cross join"
result['cosine_similarity_prof'] = cosine_similarities.ravel()

In [None]:
# TF-IDF similarity between vacancy descriptions and resume skill texts.
# The vocabulary is fitted on the vacancies only; resumes are projected onto it.
vacancy_texts = [' '.join(words) for words in vac['description']]
resume_texts = [' '.join(words) for words in res['skills'].fillna('')]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(vacancy_texts)
feature_names = vectorizer.get_feature_names_out()
tfidf_matrix_resume = vectorizer.transform(resume_texts)

# cosine_similarity accepts sparse input, so no dense copy is made.
# Result shape: (number of vacancies, number of resumes).
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix_resume)

# Row-major flattening lists all resumes for vacancy 0, then vacancy 1, ...,
# the same order the cross join used to build `result`. Assigning the raw
# array avoids index alignment, which would silently produce NaN if
# `result` did not carry a default 0..n-1 index.
assert cosine_similarities.size == len(result), "similarity matrix does not match the cross join"
result['cosine_similarity_desc'] = cosine_similarities.ravel()

In [None]:
def count_matching_languages(row):
    """Jaccard overlap between resume and vacancy language requirements.

    Parameters
    ----------
    row : mapping
        'language' (resume) and 'languages' (vacancy), each either a
        ', '-separated string of 'name level' items or a non-string
        placeholder (0 / NaN) meaning no languages.

    Returns
    -------
    float or int
        0 when either side has no languages, otherwise
        |intersection| / |union| of the two item sets.
    """
    resume_langs, vacancy_langs = row['language'], row['languages']
    if not isinstance(resume_langs, str) or not isinstance(vacancy_langs, str):
        return 0

    # The formatted strings end with ', ', so splitting leaves an empty item
    # on both sides; it has to be dropped or it counts as a shared language.
    lang1 = {item.strip() for item in resume_langs.split(', ') if item.strip()}
    lang2 = {item.strip() for item in vacancy_langs.split(', ') if item.strip()}

    # Number of identical (language, level) items and of distinct items overall.
    intersection_count = len(lang1 & lang2)
    total_unique_languages = len(lang1 | lang2)

    if total_unique_languages == 0:
        # Fallback value kept from the original implementation.
        return 0.001
    return intersection_count / total_unique_languages

# Language overlap score per (vacancy, resume) pair.
result['match_ratio'] = result.apply(count_matching_languages, axis=1)

In [None]:
## Игрушечная разметка

In [None]:
# NOTE(review): `random` is not used anywhere in the visible cells.
import random
# Combined score: product of the two text similarities and the language overlap.
result['map'] =  result['cosine_similarity_prof'] * result['cosine_similarity_desc'] * result['match_ratio']
# Rank over the whole table (not per vacancy); the highest score gets the lowest rank.
result['rank'] = result['map'].rank(method='max', ascending=False).astype(int)
# 1 for rows whose rank is at most 20.
result['markup1'] = np.where(result['rank'] <= 20, 1, 0)
m = result['map'].mean()
# 1 when the score is at least the mean score and the experience requirement is met.
result['markup'] = ((result['map'] >= m) & (result['satisfies_experience'])).astype(int)
# Final label: both conditions must hold (product of two 0/1 columns).
result['new_markup'] = result['markup'] * result['markup1'] 

## Обучаем модели

In [None]:
# One-hot encode 'education_level' on the resume table.
# NOTE(review): the next cell assigns res_for_test again, so this value is discarded.
res_for_test = pd.get_dummies(res, columns=['education_level'], prefix='edu')

In [None]:
# Columns used for modelling: identifiers, features and the 'new_markup' label.
# .copy() detaches the selection from `result`, so the column assignments in
# the following cells modify an independent frame instead of a slice
# (avoids SettingWithCopyWarning).
res_for_test = result[['id', 'accept_kids', 'uuid', 'gender', 'education_level', 'age', 'exp_days', 'count_education', 'satisfies_experience', 'cosine_similarity_prof', 'cosine_similarity_desc', 'match_ratio', 'new_markup']].copy()

In [None]:
# Encode gender as 0/1; any value other than 'Male' or 'Female' becomes NaN.
res_for_test['gender'] = res_for_test['gender'].map({'Male': 0, 'Female': 1})
# Show the distinct encoded values as a sanity check.
res_for_test.gender.unique()

In [None]:
# One-hot encode 'education_level'. get_dummies returns the whole frame with
# the encoded column replaced by its indicator columns.
res_encoded = pd.get_dummies(res_for_test, columns=['education_level'], prefix='edu')
# Append only the new indicator columns: concatenating the full encoded frame
# would duplicate every existing feature column under the same name.
# 'education_level' itself stays in place here; it is dropped by a later cell.
edu_columns = [column for column in res_encoded.columns if column not in res_for_test.columns]
res_for_test = pd.concat([res_for_test, res_encoded[edu_columns]], axis=1)

In [None]:
# The raw categorical column is no longer needed once its indicator columns exist.
res_for_test = res_for_test.drop(columns='education_level')

In [None]:
# Replace every remaining missing value with the placeholder 101.
res_for_test = res_for_test.fillna(101)

In [None]:
# Hold out 40% of the pairs for evaluation; the seed is fixed for reproducibility.
# NOTE(review): the feature matrix still contains the 'id' and 'uuid' identifier
# columns and the scores that 'new_markup' was derived from - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(res_for_test.drop(columns='new_markup'), res_for_test['new_markup'], test_size = 0.4,random_state=42) 

In [None]:
# Random forest with class weights balanced against label frequencies.
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Share of hold-out rows predicted correctly.
# NOTE(review): positives are limited by the rank <= 20 rule used for labelling,
# so accuracy may be dominated by the negative class; the imported
# classification_report would show per-class quality.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

In [None]:
# Gradient boosting baseline; reuses (and overwrites) the `model` and
# `predictions` names from the previous cell.
model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Share of hold-out rows predicted correctly.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

In [None]:
# CatBoost classifier trained with log-loss; kept under its own name so it
# is not overwritten by the other models.
model_catboost = CatBoostClassifier(iterations=500, depth=5, learning_rate=0.05, loss_function='Logloss', random_state=42)
model_catboost.fit(X_train, y_train)
predictions_catboost = model_catboost.predict(X_test)

# Quality on the hold-out set.
accuracy_catboost = accuracy_score(y_test, predictions_catboost)
print(f"Accuracy of CatBoost: {accuracy_catboost}")

In [None]:
# Base estimators for the ensemble (fresh, unfitted instances).
model1 = GradientBoostingClassifier(learning_rate=0.05, n_estimators=100, random_state=42)
model2 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model3 = LogisticRegression(random_state=42)
model_catboost_2 = CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', random_state=42)

# Hard voting: the predicted class is the majority vote of the four estimators.
ensemble_model = VotingClassifier(estimators=[
    ('gb', model1),
    ('rf', model2),
    ('lr', model3),
    ('catboost', model_catboost_2)
], voting='hard')

ensemble_model.fit(X_train, y_train)
predictions = ensemble_model.predict(X_test)

# Share of hold-out rows predicted correctly.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of the ensemble: {accuracy}")

## Используем модель

In [None]:
# probabilities_catboost = model_catboost.predict_proba(merged_result_df)
# threshold = 0.23
# predictions_catboost_class_0 = (probabilities_catboost[:, 0] > threshold).astype(int)
# merged_result_df['prediction_class_0'] = predictions_catboost_class_0
# merged_result_df['probability_class_0'] = probabilities_catboost[:, 0]
# merged_result_df['uuid'] = id_column
# sorted_df = merged_result_df.sort_values(by='probability_class_0', ascending=False)
# sorted_df

Ранжирование по count_education, education_level, age, area