In [18]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import fasttext.util
import numpy as np
import ast
from collections import Counter

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from catboost import CatBoostRegressor, Pool, cv

In [19]:
def get_emb_dataframe(custom_position, dim_size, model_path='cc.ru.300.bin', ft_model=None):
    """Embed every position title as the mean of its fastText word vectors.

    Parameters
    ----------
    custom_position : iterable
        Position titles. Each string is split on whitespace and the vectors
        of its tokens are averaged. Empty, whitespace-only and non-string
        entries (e.g. NaN) produce an all-zero vector instead of failing.
    dim_size : int
        Target embedding dimensionality; also the number of output columns.
    model_path : str, optional
        Path of the fastText binary model loaded when ``ft_model`` is None.
    ft_model : object, optional
        An already loaded model exposing ``get_dimension()`` and
        ``get_word_vector(word)``. Passing it avoids reloading the model on
        every call. It is reduced IN PLACE to ``dim_size`` only when its
        current dimension differs.

    Returns
    -------
    pandas.DataFrame
        One row per input title, columns ``feat_prof_1`` .. ``feat_prof_<dim_size>``.
    """
    if ft_model is None:
        ft = fasttext.load_model(model_path)
        fasttext.util.reduce_model(ft, dim_size)
    else:
        ft = ft_model
        if ft.get_dimension() != dim_size:
            fasttext.util.reduce_model(ft, dim_size)

    vector_professions = []
    for words in custom_position:
        # str.split() without an argument never yields empty tokens, so
        # repeated / leading / trailing spaces no longer add a spurious ''
        # "word" to the average.
        tokens = words.split() if isinstance(words, str) else []
        if tokens:
            vector = np.mean([ft.get_word_vector(token) for token in tokens], axis=0).tolist()
        else:
            # Nothing to average: use a neutral zero vector of the right width.
            vector = [0.0] * dim_size
        vector_professions.append(vector)

    columns = [f"feat_prof_{c}" for c in range(1, dim_size + 1)]
    return pd.DataFrame(vector_professions, columns=columns)

In [20]:
# Load the prepared dataset; the first CSV column becomes the index.
data = pd.read_csv(
    "my_data.csv",
    index_col=0,
)

In [21]:
# List the column names of the loaded dataset.
data.columns

Index(['custom_position', 'salary_from', 'work_skills', 'count_skills',
       'skill_1', 'skill_2', 'skill_3', 'skill_4', 'skill_5', 'skill_6',
       'skill_7', 'skill_8', 'skill_9', 'skill_10',
       'schedule_полный рабочий день', 'schedule_свободный график',
       'schedule_сменный график', 'schedule_удаленная работа',
       'schedule_частичная занятость', 'education_name_высшее (бакалавр)',
       'education_name_любое', 'education_name_неполное высшее',
       'education_name_среднее', 'education_name_среднее профессиональное',
       'required_experience_Нет опыта',
       'required_experience_От 1 года до 3 лет',
       'required_experience_От 3 до 6 лет', 'work_skills_str', 'vector_skills',
       'vector_professions'],
      dtype='object')

In [22]:
text_col_name = 'custom_position'

# Build a 100-dimensional embedding row for every position title.
df_vector_professions = get_emb_dataframe(data[text_col_name].tolist(), 100)

# Give both frames a fresh 0..n-1 index so the column-wise concat below
# pairs rows by position.
data = data.reset_index(drop=True)
df_vector_professions = df_vector_professions.reset_index(drop=True)

data = pd.concat([data, df_vector_professions], axis=1)

In [23]:
# Remove columns that are not used by any of the experiments below.
data = data.drop(
    columns=['custom_position', 'work_skills', 'vector_professions'],
)

### Оставляем эмбеддинги как есть

In [24]:
# Experiment 1: keep `vector_skills` and drop every raw skill text column
# (the joined string and skill_1 .. skill_10).
subdata = data.drop(
    columns=['work_skills_str'] + [f'skill_{idx}' for idx in range(1, 11)],
)

In [25]:
dim = 100

# Each `vector_skills` cell holds a Python literal; parse it into an object.
# NOTE(review): the column list below assumes every parsed value has exactly
# `dim` elements — confirm against the data.
subdata['vector_skills'] = subdata['vector_skills'].apply(ast.literal_eval)

# Expand the parsed vectors into one numeric column per dimension. Reusing
# subdata's index guarantees the axis=1 concat pairs each vector with its
# own row even if subdata does not carry a default 0..n-1 index.
vector_skills = pd.DataFrame(
    subdata['vector_skills'].tolist(),
    columns=[f"feat_skill_{c}" for c in range(1, dim + 1)],
    index=subdata.index,
)

subdata = pd.concat([subdata, vector_skills], axis=1)

In [26]:
# Target is the lower salary bound; features are everything else except the
# unexpanded `vector_skills` column.
y = subdata['salary_from']
X = subdata.drop(columns=['salary_from', 'vector_skills'])

num_folds = 5
random_state = 42
scoring = 'neg_mean_absolute_error'

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [27]:
# Baseline: 1000-iteration CatBoost regressor scored with shuffled K-fold CV.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
model = CatBoostRegressor(iterations=1000, verbose=0)

# The scorer is 'neg_mean_absolute_error', so negate to get positive MAE.
cv_results = -cross_val_score(
    model,
    X_train,
    y_train,
    cv=kfold,
    scoring=scoring,
)

# Per-fold MAE followed by the mean over folds.
print([round(fold_mae, 1) for fold_mae in cv_results])
print(round(cv_results.mean(), 2))

[13713.7, 13417.2, 13639.5, 13573.7, 13553.8]
13579.62


In [28]:
# CatBoost settings: optimise MAE, at most 10000 iterations, fixed seed.
params = dict(
    loss_function='MAE',
    iterations=10000,
    random_seed=42,
)

# 5-fold CatBoost cross-validation on the training split with
# early_stopping_rounds=50; the metric curves are plotted interactively.
cv_data = cv(
    pool=Pool(X_train, label=y_train),
    params=params,
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    early_stopping_rounds=50,
    plot=True,
    verbose=False,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 14092.68073
bestIteration = 9999

Training on fold [1/5]

bestTest = 13558.28096
bestIteration = 9996

Training on fold [2/5]

bestTest = 13550.9816
bestIteration = 9998

Training on fold [3/5]

bestTest = 13587.45571
bestIteration = 9999

Training on fold [4/5]

bestTest = 13282.53887
bestIteration = 9999



In [29]:
# Show the per-iteration cross-validation table returned by `cv`
# (mean/std of test and train MAE for each iteration).
cv_data

Unnamed: 0,iterations,test-MAE-mean,test-MAE-std,train-MAE-mean,train-MAE-std
0,0,57502.564865,174.095606,57502.454007,43.798942
1,1,56030.175095,173.761161,56029.884445,44.597347
2,2,54580.136656,173.220157,54579.920427,43.347073
3,3,53202.466762,171.938977,53202.029353,43.600039
4,4,51863.290758,173.953805,51862.803996,42.606263
...,...,...,...,...,...
9995,9995,13614.597859,294.595679,11730.518046,157.742658
9996,9996,13614.561543,294.574840,11730.445470,157.745749
9997,9997,13614.482649,294.532835,11730.239955,157.762394
9998,9998,13614.423284,294.449694,11730.094828,157.591850


### Оставляем один текстовый столбец со всеми 10 навыками

In [30]:
# Experiment 2: keep only the joined skills string `work_skills_str`; drop
# the skill embedding and the ten separate skill columns.
subdata = data.drop(
    columns=['vector_skills'] + [f'skill_{idx}' for idx in range(1, 11)],
)

In [31]:
# Replace missing skill strings with the placeholder 'нет' so the column
# contains no NaN before it is passed to CatBoost as a text feature.
subdata['work_skills_str'] = subdata['work_skills_str'].fillna('нет')

In [32]:
# Target is the lower salary bound; every remaining column is a feature.
y = subdata['salary_from']
X = subdata.drop(columns=['salary_from'])

num_folds = 5
random_state = 42
scoring = 'neg_mean_absolute_error'

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [33]:
# CatBoost settings: optimise MAE, at most 10000 iterations, fixed seed.
params = dict(
    loss_function='MAE',
    iterations=10000,
    random_seed=42,
)

# 5-fold CatBoost cross-validation with `work_skills_str` declared as a text
# feature and early_stopping_rounds=50.
cv_data = cv(
    pool=Pool(X_train, label=y_train, text_features=['work_skills_str']),
    params=params,
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    early_stopping_rounds=50,
    plot=True,
    verbose=False,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 15475.9497
bestIteration = 3160

Training on fold [1/5]

bestTest = 14684.13496
bestIteration = 8729

Training on fold [2/5]

bestTest = 14969.10869
bestIteration = 3019

Training on fold [3/5]

bestTest = 14301.99388
bestIteration = 8365

Training on fold [4/5]

bestTest = 14393.2057
bestIteration = 6169



In [34]:
# Show the per-iteration cross-validation table returned by `cv`
# (mean/std of test and train MAE for each iteration).
cv_data

Unnamed: 0,iterations,test-MAE-mean,test-MAE-std,train-MAE-mean,train-MAE-std
0,0,57492.563691,169.320547,57491.417708,47.376263
1,1,56036.484140,168.414231,56035.193947,47.497284
2,2,54587.198081,167.585743,54586.127157,49.183479
3,3,53211.547910,167.454157,53210.318422,53.905842
4,4,51849.013631,164.349870,51847.280435,55.835399
...,...,...,...,...,...
8775,8775,14765.342683,476.265341,14149.215817,385.974904
8776,8776,14765.342680,476.265342,14149.215812,385.974904
8777,8777,14765.342677,476.265342,14149.215807,385.974903
8778,8778,14765.342664,476.265345,14149.215789,385.974902


### Оставляем 10 текстовых столбцов с навыками

In [35]:
# Experiment 3: keep skill_1 .. skill_10 as separate text columns; drop the
# skill embedding and the joined skills string.
subdata = data.drop(
    columns=['vector_skills', 'work_skills_str'],
)


In [36]:
# Names of the ten per-skill text columns: skill_1 .. skill_10.
skills_cols = [f'skill_{idx}' for idx in range(1, 11)]

In [37]:
# Replace missing skills with the placeholder 'нет' so none of the skill
# columns contains NaN before they are passed to CatBoost as text features.
subdata[skills_cols] = subdata[skills_cols].fillna(value='нет')

In [38]:
# Target is the lower salary bound; every remaining column is a feature.
y = subdata['salary_from']
X = subdata.drop(columns=['salary_from'])

num_folds = 5
random_state = 42
scoring = 'neg_mean_absolute_error'

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [39]:
# CatBoost settings: optimise MAE, at most 10000 iterations, fixed seed.
params = dict(
    loss_function='MAE',
    iterations=10000,
    random_seed=42,
)

# 5-fold CatBoost cross-validation with the ten skill columns declared as
# text features and early_stopping_rounds=50.
cv_data = cv(
    pool=Pool(X_train, label=y_train, text_features=skills_cols),
    params=params,
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    early_stopping_rounds=50,
    plot=True,
    verbose=False,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 14104.45884
bestIteration = 9999

Training on fold [1/5]

bestTest = 13900.1167
bestIteration = 9591

Training on fold [2/5]

bestTest = 13918.46957
bestIteration = 9999

Training on fold [3/5]

bestTest = 13849.96164
bestIteration = 9999

Training on fold [4/5]

bestTest = 13732.01639
bestIteration = 9751



In [40]:
# Show the per-iteration cross-validation table returned by `cv`
# (mean/std of test and train MAE for each iteration).
cv_data

Unnamed: 0,iterations,test-MAE-mean,test-MAE-std,train-MAE-mean,train-MAE-std
0,0,57499.916132,172.481190,57499.870954,43.367510
1,1,56043.522705,171.420215,56043.802008,44.025529
2,2,54631.310989,170.835754,54631.559290,44.660071
3,3,53255.290355,169.608321,53255.427098,48.525834
4,4,51927.876460,169.217582,51928.059963,49.091335
...,...,...,...,...,...
9995,9995,13901.524497,134.882719,11182.733742,64.570323
9996,9996,13901.417522,134.668452,11182.547280,64.766684
9997,9997,13901.389459,134.652256,11182.476543,64.767393
9998,9998,13901.391812,134.670169,11182.402117,64.849664
