In [65]:
import numpy as np
import pandas as pd
from autoviz import AutoViz_Class
import shap
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostRegressor

data = pd.read_csv('salaries.csv')
data


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2024,SE,FT,AI Engineer,90000,USD,90000,AE,0,AE,L
1,2024,SE,FT,Machine Learning Engineer,180500,USD,180500,US,0,US,M
2,2024,SE,FT,Machine Learning Engineer,96200,USD,96200,US,0,US,M
3,2024,SE,FT,Machine Learning Engineer,235000,USD,235000,AU,0,AU,M
4,2024,SE,FT,Machine Learning Engineer,175000,USD,175000,AU,0,AU,M
...,...,...,...,...,...,...,...,...,...,...,...
13967,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
13968,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
13969,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
13970,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


In [66]:
data.info()
useless_columns = ["salary", 'salary_currency']
data = data.drop(useless_columns, axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13972 entries, 0 to 13971
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           13972 non-null  int64 
 1   experience_level    13972 non-null  object
 2   employment_type     13972 non-null  object
 3   job_title           13972 non-null  object
 4   salary              13972 non-null  int64 
 5   salary_currency     13972 non-null  object
 6   salary_in_usd       13972 non-null  int64 
 7   employee_residence  13972 non-null  object
 8   remote_ratio        13972 non-null  int64 
 9   company_location    13972 non-null  object
 10  company_size        13972 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.2+ MB


In [67]:
cat_cols = data.select_dtypes(include='object').columns
int_cols = data.select_dtypes(include="int64").columns

In [68]:
for col in cat_cols:
    print("\n")
    print(col, " : ", data[col].unique())



experience_level  :  ['SE' 'MI' 'EN' 'EX']


employment_type  :  ['FT' 'CT' 'PT' 'FL']


job_title  :  ['AI Engineer' 'Machine Learning Engineer'
 'Business Intelligence Developer' 'Data Engineer' 'Data Scientist'
 'Cloud Database Engineer' 'Research Engineer' 'Data Analyst'
 'Machine Learning Scientist' 'Applied Scientist' 'Data Science Manager'
 'Research Scientist' 'Prompt Engineer' 'Data Science'
 'Data Science Consultant' 'Data Management Analyst' 'Research Analyst'
 'Data Operations Analyst' 'Data Management Consultant'
 'Business Intelligence Analyst' 'Analytics Engineer'
 'Data Quality Analyst' 'Data Architect' 'Data Manager' 'ML Engineer'
 'Robotics Software Engineer' 'Machine Learning Researcher' 'AI Architect'
 'Data DevOps Engineer' 'Business Intelligence' 'AI Software Engineer'
 'Data Integration Engineer' 'Data Operations Specialist' 'BI Analyst'
 'Data Product Manager' 'Business Intelligence Engineer' 'Data Specialist'
 'AI Research Scientist' 'Data Science Director' '

In [69]:
# Преобразуем данные
from feature_engine.encoding import RareLabelEncoder

label = 'salary_in_usd'
data[label] = data[label] * 1e-3 # Зарплата в долларах -> Зарплата в тысячах долларов

percentile = np.percentile(data[label], [1, 99])
data = data[(data[label] > percentile[0]) & data[label] < percentile[1]]

experience_level = {
    "SE": "Senior-level / Expert",
    "MI": "Middle-level / Intermidate",
    "EN": "Entry-level / Junior",
    "EX": "Executive-level / Director"
}
data["experience_level"] = data['experience_level'].replace(experience_level)

data['job_title'].replace("ML Engineer", "Machine Learning Engineer", inplace=True)

employment_type = {
    "FT": "Full-time",
    "CT": "Contract",
    "PT": "Part-time",
    "FL": "Freelance"
}

data["employment_type"] = data['employment_type'].replace(employment_type)

remote_ratio = {
    0: 'No remote work',
    50: 'Half remote work',
    100: "Fully remote work"
}
data['remote_ratio'] = data['remote_ratio'].replace(remote_ratio)

company_size = {
    "L": "Large",
    "M": "Medium",
    "S": "Small"
}
data['company_size'] = data['company_size'].replace(company_size)

for col in ['experience_level', 'job_title', 'employment_type', 'company_location', 'employee_residence']:
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=50, replace_with="Other", tol=20/data.shape[0])
    data[col] = encoder.fit_transform(data[[col]])

In [70]:
data.sample(10).T

Unnamed: 0,1825,2959,783,2555,8571,3065,3994,2965,13485,12465
work_year,2024,2024,2024,2024,2023,2024,2023,2024,2022,2022
experience_level,Senior-level / Expert,Middle-level / Intermidate,Entry-level / Junior,Entry-level / Junior,Senior-level / Expert,Middle-level / Intermidate,Senior-level / Expert,Middle-level / Intermidate,Senior-level / Expert,Middle-level / Intermidate
employment_type,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time,Full-time
job_title,Machine Learning Engineer,Machine Learning Engineer,Data Analyst,Data Analyst,Data Engineer,Data Engineer,Data Scientist,Data Science,Data Engineer,Data Analyst
salary_in_usd,258.8,139.777,61.7,28.75,139.4,115.0,93.3,179.0,220.11,80.0
employee_residence,US,ES,US,GB,US,US,US,US,US,US
remote_ratio,No remote work,Fully remote work,Fully remote work,No remote work,No remote work,Fully remote work,No remote work,No remote work,No remote work,Fully remote work
company_location,US,ES,US,GB,US,US,US,US,US,US
company_size,Medium,Medium,Medium,Medium,Medium,Medium,Medium,Medium,Medium,Large


In [77]:
print(X_train.info())
print(cat_cols)

<class 'pandas.core.frame.DataFrame'>
Index: 6986 entries, 510 to 1796
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           6986 non-null   int64 
 1   experience_level    6986 non-null   object
 2   employment_type     6986 non-null   object
 3   job_title           6986 non-null   object
 4   employee_residence  6986 non-null   object
 5   remote_ratio        6986 non-null   object
 6   company_location    6986 non-null   object
 7   company_size        6986 non-null   object
dtypes: int64(1), object(7)
memory usage: 491.2+ KB
None
Index(['experience_level', 'employment_type', 'job_title',
       'employee_residence', 'remote_ratio', 'company_location',
       'company_size'],
      dtype='object')


In [82]:
y = data['salary_in_usd'].values.reshape(-1,)
X = data.drop(['salary_in_usd'], axis=1)

cat_cols = data.select_dtypes(include='object').columns
cat_cols_idx = [list(X.columns).index(c) for c in cat_cols]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, random_state=0, stratify=data[['employee_residence']])

print("Training set shape X_train: {}, y_train: {}".format(X_train.shape, y_train.shape))
print("Test set shape X_test: {}, y_test: {}".format(X_test.shape, y_test.shape))

Training set shape X_train: (6986, 8), y_train: (6986,)
Test set shape X_test: (6986, 8), y_test: (6986,)


In [83]:
cat_cols_idx

[1, 2, 3, 4, 5, 6, 7]

In [84]:
train_pool = Pool(X_train,
                  y_train,
                  cat_features=cat_cols_idx)

test_pool = Pool(X_test,
                 y_test,
                 cat_features=cat_cols_idx)

model = CatBoostRegressor(iterations = 800,
                          depth=6,
                          verbose=1,
                          early_stopping_rounds=100,
                          learning_rate=0.008,
                          loss_function="RMSE")

model.fit(train_pool, eval_set=test_pool)


0:	learn: 69.0289326	test: 67.9507078	best: 67.9507078 (0)	total: 168ms	remaining: 2m 14s
1:	learn: 68.8876955	test: 67.7985961	best: 67.7985961 (1)	total: 212ms	remaining: 1m 24s
2:	learn: 68.7448910	test: 67.6439633	best: 67.6439633 (2)	total: 245ms	remaining: 1m 5s
3:	learn: 68.6027578	test: 67.4925022	best: 67.4925022 (3)	total: 286ms	remaining: 57s
4:	learn: 68.4866629	test: 67.3671364	best: 67.3671364 (4)	total: 313ms	remaining: 49.8s
5:	learn: 68.3468661	test: 67.2209908	best: 67.2209908 (5)	total: 345ms	remaining: 45.7s
6:	learn: 68.2151383	test: 67.0806004	best: 67.0806004 (6)	total: 381ms	remaining: 43.2s
7:	learn: 68.0833589	test: 66.9380056	best: 66.9380056 (7)	total: 418ms	remaining: 41.4s
8:	learn: 67.9515208	test: 66.7972111	best: 66.7972111 (8)	total: 456ms	remaining: 40.1s
9:	learn: 67.8265117	test: 66.6610574	best: 66.6610574 (9)	total: 511ms	remaining: 40.4s
10:	learn: 67.7012070	test: 66.5237919	best: 66.5237919 (10)	total: 558ms	remaining: 40s
11:	learn: 67.5828417

<catboost.core.CatBoostRegressor at 0x20286af9210>

In [None]:
from sklearn.metrics import mean_squared_error

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
rmse_test= mean_squared_error(y_test, y_test_pred, squared=False)

print(f"RMSE score for train {round(rmse_train, 1)} kUSD/year, and for test {round(rmse_test, 1)} kUSD/year")