In [1]:
import numpy as np
import pandas as pd
from autoviz import AutoViz_Class
import shap
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostRegressor

data = pd.read_csv('salaries.csv')
data


Imported v0.1.806. Please call AutoViz in this sequence:
    AV = AutoViz_Class()
    %matplotlib inline
    dfte = AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=1, lowess=False,
               chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30, save_plot_dir=None)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2024,SE,FT,AI Engineer,90000,USD,90000,AE,0,AE,L
1,2024,SE,FT,Machine Learning Engineer,180500,USD,180500,US,0,US,M
2,2024,SE,FT,Machine Learning Engineer,96200,USD,96200,US,0,US,M
3,2024,SE,FT,Machine Learning Engineer,235000,USD,235000,AU,0,AU,M
4,2024,SE,FT,Machine Learning Engineer,175000,USD,175000,AU,0,AU,M
...,...,...,...,...,...,...,...,...,...,...,...
13967,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
13968,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
13969,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
13970,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


In [2]:
data.info()
useless_columns = ["salary", 'salary_currency']
data = data.drop(useless_columns, axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13972 entries, 0 to 13971
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           13972 non-null  int64 
 1   experience_level    13972 non-null  object
 2   employment_type     13972 non-null  object
 3   job_title           13972 non-null  object
 4   salary              13972 non-null  int64 
 5   salary_currency     13972 non-null  object
 6   salary_in_usd       13972 non-null  int64 
 7   employee_residence  13972 non-null  object
 8   remote_ratio        13972 non-null  int64 
 9   company_location    13972 non-null  object
 10  company_size        13972 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.2+ MB


In [3]:
cat_cols = data.select_dtypes(include='object').columns
int_cols = data.select_dtypes(include="int64").columns

In [4]:
for col in cat_cols:
    print("\n")
    print(col, " : ", data[col].unique())



experience_level  :  ['SE' 'MI' 'EN' 'EX']


employment_type  :  ['FT' 'CT' 'PT' 'FL']


job_title  :  ['AI Engineer' 'Machine Learning Engineer'
 'Business Intelligence Developer' 'Data Engineer' 'Data Scientist'
 'Cloud Database Engineer' 'Research Engineer' 'Data Analyst'
 'Machine Learning Scientist' 'Applied Scientist' 'Data Science Manager'
 'Research Scientist' 'Prompt Engineer' 'Data Science'
 'Data Science Consultant' 'Data Management Analyst' 'Research Analyst'
 'Data Operations Analyst' 'Data Management Consultant'
 'Business Intelligence Analyst' 'Analytics Engineer'
 'Data Quality Analyst' 'Data Architect' 'Data Manager' 'ML Engineer'
 'Robotics Software Engineer' 'Machine Learning Researcher' 'AI Architect'
 'Data DevOps Engineer' 'Business Intelligence' 'AI Software Engineer'
 'Data Integration Engineer' 'Data Operations Specialist' 'BI Analyst'
 'Data Product Manager' 'Business Intelligence Engineer' 'Data Specialist'
 'AI Research Scientist' 'Data Science Director' '

In [5]:
# Преобразуем данные
from feature_engine.encoding import RareLabelEncoder

label = 'salary_in_usd'
data[label] = data[label] * 1e-3 # Зарплата в долларах -> Зарплата в тысячах долларов

percentile = np.percentile(data[label], [1, 99])
data = data[(data[label] > percentile[0]) & data[label] < percentile[1]]

experience_level = {
    "SE": "Senior-level / Expert",
    "MI": "Middle-level / Intermidate",
    "EN": "Entry-level / Junior",
    "EX": "Executive-level / Director"
}
data["experience_level"] = data['experience_level'].replace(experience_level)

data['job_title'].replace("ML Engineer", "Machine Learning Engineer", inplace=True)

employment_type = {
    "FT": "Full-time",
    "CT": "Contract",
    "PT": "Part-time",
    "FL": "Freelance"
}

data["employment_type"] = data['employment_type'].replace(employment_type)

remote_ratio = {
    0: 'No remote work',
    50: 'Half remote work',
    100: "Fully remote work"
}
data['remote_ratio'] = data['remote_ratio'].replace(remote_ratio)

company_size = {
    "L": "Large",
    "M": "Medium",
    "S": "Small"
}
data['company_size'] = data['company_size'].replace(company_size)

for col in ['experience_level', 'job_title', 'employment_type', 'company_location', 'employee_residence']:
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=50, replace_with="Other", tol=20/data.shape[0])
    data[col] = encoder.fit_transform(data[[col]])

In [6]:
data.sample(10).T

Unnamed: 0,9724,8956,1061,7861,495,4574,13765,3877,11347,5308
work_year,2023,2023,2024,2023,2024,2023,2021,2023,2023,2023
experience_level,Senior-level / Expert,Senior-level / Expert,Middle-level / Intermidate,Middle-level / Intermidate,Middle-level / Intermidate,Middle-level / Intermidate,Entry-level / Junior,Middle-level / Intermidate,Senior-level / Expert,Senior-level / Expert
employment_type,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time
job_title,Data Scientist,Applied Scientist,Data Analyst,Other,Data Scientist,Applied Scientist,Data Analyst,Business Intelligence Analyst,Data Engineer,Data Scientist
salary_in_usd,160.0,309.4,147.0,44.64,73.1,136.0,60.0,96.0,104.0,104.0
employee_residence,US,US,US,US,US,US,US,US,US,CA
remote_ratio,Fully remote work,No remote work,No remote work,Fully remote work,No remote work,No remote work,Fully remote work,Fully remote work,Fully remote work,Fully remote work
company_location,US,US,US,US,US,US,US,US,US,CA
company_size,Medium,Large,Medium,Medium,Medium,Large,Small,Medium,Medium,Medium


In [7]:
y = data['salary_in_usd'].values.reshape(-1,)
X = data.drop(['salary_in_usd'], axis=1)

cat_cols = data.select_dtypes(include='object').columns
cat_cols_idx = [list(X.columns).index(c) for c in cat_cols]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, random_state=0, stratify=data[['employee_residence']])

print("Training set shape X_train: {}, y_train: {}".format(X_train.shape, y_train.shape))
print("Test set shape X_test: {}, y_test: {}".format(X_test.shape, y_test.shape))

Training set shape X_train: (6986, 8), y_train: (6986,)
Test set shape X_test: (6986, 8), y_test: (6986,)


In [8]:
train_pool = Pool(X_train,
                  y_train,
                  cat_features=cat_cols_idx)

test_pool = Pool(X_test,
                 y_test,
                 cat_features=cat_cols_idx)

model = CatBoostRegressor(iterations = 800,
                          depth=6,
                          verbose=1,
                          early_stopping_rounds=100,
                          learning_rate=0.008,
                          loss_function="RMSE")

model.fit(train_pool, eval_set=test_pool)


0:	learn: 69.0289326	test: 67.9507078	best: 67.9507078 (0)	total: 167ms	remaining: 2m 13s
1:	learn: 68.8876955	test: 67.7985961	best: 67.7985961 (1)	total: 189ms	remaining: 1m 15s
2:	learn: 68.7448910	test: 67.6439633	best: 67.6439633 (2)	total: 207ms	remaining: 55s
3:	learn: 68.6027578	test: 67.4925022	best: 67.4925022 (3)	total: 224ms	remaining: 44.5s
4:	learn: 68.4866629	test: 67.3671364	best: 67.3671364 (4)	total: 240ms	remaining: 38.1s
5:	learn: 68.3468661	test: 67.2209908	best: 67.2209908 (5)	total: 260ms	remaining: 34.4s
6:	learn: 68.2151383	test: 67.0806004	best: 67.0806004 (6)	total: 277ms	remaining: 31.4s
7:	learn: 68.0833589	test: 66.9380056	best: 66.9380056 (7)	total: 293ms	remaining: 29s
8:	learn: 67.9515208	test: 66.7972111	best: 66.7972111 (8)	total: 309ms	remaining: 27.2s
9:	learn: 67.8265117	test: 66.6610574	best: 66.6610574 (9)	total: 325ms	remaining: 25.7s
10:	learn: 67.7012070	test: 66.5237919	best: 66.5237919 (10)	total: 343ms	remaining: 24.6s
11:	learn: 67.5828417

98:	learn: 61.1390095	test: 59.3809793	best: 59.3809793 (98)	total: 1.9s	remaining: 13.5s
99:	learn: 61.0974813	test: 59.3347163	best: 59.3347163 (99)	total: 1.92s	remaining: 13.5s
100:	learn: 61.0578374	test: 59.2916148	best: 59.2916148 (100)	total: 1.94s	remaining: 13.4s
101:	learn: 61.0308757	test: 59.2636187	best: 59.2636187 (101)	total: 1.95s	remaining: 13.4s
102:	learn: 60.9967866	test: 59.2267688	best: 59.2267688 (102)	total: 1.97s	remaining: 13.3s
103:	learn: 60.9576731	test: 59.1835720	best: 59.1835720 (103)	total: 1.98s	remaining: 13.3s
104:	learn: 60.9219274	test: 59.1453266	best: 59.1453266 (104)	total: 2s	remaining: 13.2s
105:	learn: 60.8843899	test: 59.1050170	best: 59.1050170 (105)	total: 2.01s	remaining: 13.2s
106:	learn: 60.8494112	test: 59.0646954	best: 59.0646954 (106)	total: 2.03s	remaining: 13.2s
107:	learn: 60.8169432	test: 59.0318864	best: 59.0318864 (107)	total: 2.05s	remaining: 13.1s
108:	learn: 60.7807795	test: 58.9924061	best: 58.9924061 (108)	total: 2.06s	re

187:	learn: 59.0109412	test: 57.0873565	best: 57.0873565 (187)	total: 3.39s	remaining: 11s
188:	learn: 58.9976394	test: 57.0715731	best: 57.0715731 (188)	total: 3.41s	remaining: 11s
189:	learn: 58.9866916	test: 57.0593342	best: 57.0593342 (189)	total: 3.42s	remaining: 11s
190:	learn: 58.9737683	test: 57.0455523	best: 57.0455523 (190)	total: 3.44s	remaining: 11s
191:	learn: 58.9594599	test: 57.0310469	best: 57.0310469 (191)	total: 3.45s	remaining: 10.9s
192:	learn: 58.9484416	test: 57.0199870	best: 57.0199870 (192)	total: 3.47s	remaining: 10.9s
193:	learn: 58.9333824	test: 57.0033818	best: 57.0033818 (193)	total: 3.48s	remaining: 10.9s
194:	learn: 58.9198769	test: 56.9888705	best: 56.9888705 (194)	total: 3.5s	remaining: 10.9s
195:	learn: 58.9089376	test: 56.9773725	best: 56.9773725 (195)	total: 3.52s	remaining: 10.8s
196:	learn: 58.8970986	test: 56.9644690	best: 56.9644690 (196)	total: 3.53s	remaining: 10.8s
197:	learn: 58.8855840	test: 56.9524951	best: 56.9524951 (197)	total: 3.55s	rem

276:	learn: 58.2049239	test: 56.2482702	best: 56.2482702 (276)	total: 4.9s	remaining: 9.24s
277:	learn: 58.1965647	test: 56.2416145	best: 56.2416145 (277)	total: 4.91s	remaining: 9.23s
278:	learn: 58.1917697	test: 56.2352677	best: 56.2352677 (278)	total: 4.93s	remaining: 9.21s
279:	learn: 58.1855699	test: 56.2307646	best: 56.2307646 (279)	total: 4.95s	remaining: 9.19s
280:	learn: 58.1790304	test: 56.2244355	best: 56.2244355 (280)	total: 4.96s	remaining: 9.17s
281:	learn: 58.1738983	test: 56.2185882	best: 56.2185882 (281)	total: 4.98s	remaining: 9.15s
282:	learn: 58.1673762	test: 56.2122498	best: 56.2122498 (282)	total: 5s	remaining: 9.14s
283:	learn: 58.1621260	test: 56.2066725	best: 56.2066725 (283)	total: 5.02s	remaining: 9.12s
284:	learn: 58.1575389	test: 56.2015423	best: 56.2015423 (284)	total: 5.04s	remaining: 9.1s
285:	learn: 58.1528171	test: 56.1950068	best: 56.1950068 (285)	total: 5.05s	remaining: 9.08s
286:	learn: 58.1480943	test: 56.1900652	best: 56.1900652 (286)	total: 5.07s

376:	learn: 57.8009911	test: 55.8751772	best: 55.8751772 (376)	total: 6.65s	remaining: 7.46s
377:	learn: 57.7983080	test: 55.8745191	best: 55.8745191 (377)	total: 6.67s	remaining: 7.45s
378:	learn: 57.7948732	test: 55.8712346	best: 55.8712346 (378)	total: 6.69s	remaining: 7.43s
379:	learn: 57.7918649	test: 55.8690670	best: 55.8690670 (379)	total: 6.71s	remaining: 7.42s
380:	learn: 57.7889707	test: 55.8671681	best: 55.8671681 (380)	total: 6.73s	remaining: 7.4s
381:	learn: 57.7864374	test: 55.8653384	best: 55.8653384 (381)	total: 6.75s	remaining: 7.38s
382:	learn: 57.7801877	test: 55.8614713	best: 55.8614713 (382)	total: 6.77s	remaining: 7.37s
383:	learn: 57.7772880	test: 55.8589723	best: 55.8589723 (383)	total: 6.78s	remaining: 7.35s
384:	learn: 57.7757813	test: 55.8581189	best: 55.8581189 (384)	total: 6.8s	remaining: 7.33s
385:	learn: 57.7738405	test: 55.8572262	best: 55.8572262 (385)	total: 6.82s	remaining: 7.32s
386:	learn: 57.7728265	test: 55.8565299	best: 55.8565299 (386)	total: 6.

473:	learn: 57.5771486	test: 55.7078066	best: 55.7078066 (473)	total: 8.61s	remaining: 5.92s
474:	learn: 57.5753938	test: 55.7065794	best: 55.7065794 (474)	total: 8.64s	remaining: 5.91s
475:	learn: 57.5746713	test: 55.7058922	best: 55.7058922 (475)	total: 8.66s	remaining: 5.89s
476:	learn: 57.5719762	test: 55.7047141	best: 55.7047141 (476)	total: 8.69s	remaining: 5.88s
477:	learn: 57.5710879	test: 55.7042137	best: 55.7042137 (477)	total: 8.71s	remaining: 5.87s
478:	learn: 57.5692142	test: 55.7035262	best: 55.7035262 (478)	total: 8.73s	remaining: 5.85s
479:	learn: 57.5672896	test: 55.7020082	best: 55.7020082 (479)	total: 8.75s	remaining: 5.84s
480:	learn: 57.5641599	test: 55.7012295	best: 55.7012295 (480)	total: 8.78s	remaining: 5.82s
481:	learn: 57.5614926	test: 55.6997142	best: 55.6997142 (481)	total: 8.8s	remaining: 5.8s
482:	learn: 57.5589564	test: 55.6983418	best: 55.6983418 (482)	total: 8.82s	remaining: 5.79s
483:	learn: 57.5573917	test: 55.6975224	best: 55.6975224 (483)	total: 8.

574:	learn: 57.4036077	test: 55.6114155	best: 55.6114155 (574)	total: 10.6s	remaining: 4.13s
575:	learn: 57.4004512	test: 55.6102382	best: 55.6102382 (575)	total: 10.6s	remaining: 4.11s
576:	learn: 57.3978374	test: 55.6078791	best: 55.6078791 (576)	total: 10.6s	remaining: 4.09s
577:	learn: 57.3961148	test: 55.6068794	best: 55.6068794 (577)	total: 10.6s	remaining: 4.08s
578:	learn: 57.3950138	test: 55.6065019	best: 55.6065019 (578)	total: 10.6s	remaining: 4.06s
579:	learn: 57.3930461	test: 55.6054283	best: 55.6054283 (579)	total: 10.7s	remaining: 4.04s
580:	learn: 57.3919855	test: 55.6057009	best: 55.6054283 (579)	total: 10.7s	remaining: 4.02s
581:	learn: 57.3903691	test: 55.6043528	best: 55.6043528 (581)	total: 10.7s	remaining: 4s
582:	learn: 57.3887679	test: 55.6035444	best: 55.6035444 (582)	total: 10.7s	remaining: 3.98s
583:	learn: 57.3881670	test: 55.6031588	best: 55.6031588 (583)	total: 10.7s	remaining: 3.96s
584:	learn: 57.3874608	test: 55.6026004	best: 55.6026004 (584)	total: 10.

672:	learn: 57.2769375	test: 55.5512885	best: 55.5512885 (672)	total: 12.3s	remaining: 2.32s
673:	learn: 57.2759109	test: 55.5511718	best: 55.5511718 (673)	total: 12.3s	remaining: 2.3s
674:	learn: 57.2754008	test: 55.5509430	best: 55.5509430 (674)	total: 12.3s	remaining: 2.28s
675:	learn: 57.2736320	test: 55.5505294	best: 55.5505294 (675)	total: 12.4s	remaining: 2.27s
676:	learn: 57.2721615	test: 55.5501784	best: 55.5501784 (676)	total: 12.4s	remaining: 2.25s
677:	learn: 57.2714204	test: 55.5496597	best: 55.5496597 (677)	total: 12.4s	remaining: 2.23s
678:	learn: 57.2707352	test: 55.5497152	best: 55.5496597 (677)	total: 12.4s	remaining: 2.21s
679:	learn: 57.2683337	test: 55.5496709	best: 55.5496597 (677)	total: 12.4s	remaining: 2.19s
680:	learn: 57.2678294	test: 55.5493714	best: 55.5493714 (680)	total: 12.5s	remaining: 2.17s
681:	learn: 57.2666286	test: 55.5487115	best: 55.5487115 (681)	total: 12.5s	remaining: 2.16s
682:	learn: 57.2659033	test: 55.5482635	best: 55.5482635 (682)	total: 1

772:	learn: 57.1608187	test: 55.5132129	best: 55.5132129 (772)	total: 14s	remaining: 490ms
773:	learn: 57.1602099	test: 55.5133493	best: 55.5132129 (772)	total: 14s	remaining: 472ms
774:	learn: 57.1583101	test: 55.5125088	best: 55.5125088 (774)	total: 14.1s	remaining: 454ms
775:	learn: 57.1572693	test: 55.5121218	best: 55.5121218 (775)	total: 14.1s	remaining: 436ms
776:	learn: 57.1566570	test: 55.5121885	best: 55.5121218 (775)	total: 14.1s	remaining: 417ms
777:	learn: 57.1533004	test: 55.5121963	best: 55.5121218 (775)	total: 14.1s	remaining: 399ms
778:	learn: 57.1493951	test: 55.5110442	best: 55.5110442 (778)	total: 14.1s	remaining: 381ms
779:	learn: 57.1488761	test: 55.5103537	best: 55.5103537 (779)	total: 14.2s	remaining: 363ms
780:	learn: 57.1482765	test: 55.5100752	best: 55.5100752 (780)	total: 14.2s	remaining: 345ms
781:	learn: 57.1465216	test: 55.5088378	best: 55.5088378 (781)	total: 14.2s	remaining: 327ms
782:	learn: 57.1444803	test: 55.5088553	best: 55.5088378 (781)	total: 14.2

<catboost.core.CatBoostRegressor at 0x1a1ddde5390>

In [9]:
from sklearn.metrics import mean_squared_error

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
rmse_test= mean_squared_error(y_test, y_test_pred, squared=False)

print(f"RMSE score for train {round(rmse_train, 1)} kUSD/year, and for test {round(rmse_test, 1)} kUSD/year")

RMSE score for train 56.7 kUSD/year, and for test 55.5 kUSD/year


In [10]:
import matplotlib.pyplot as plt
shap.initjs()

explainer = shap.TreeExplainer(model)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test)

