In [38]:
import numpy as np
import pandas as pd
from autoviz import AutoViz_Class
import shap

data = pd.read_csv('salaries.csv')
data


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2024,SE,FT,AI Engineer,90000,USD,90000,AE,0,AE,L
1,2024,SE,FT,Machine Learning Engineer,180500,USD,180500,US,0,US,M
2,2024,SE,FT,Machine Learning Engineer,96200,USD,96200,US,0,US,M
3,2024,SE,FT,Machine Learning Engineer,235000,USD,235000,AU,0,AU,M
4,2024,SE,FT,Machine Learning Engineer,175000,USD,175000,AU,0,AU,M
...,...,...,...,...,...,...,...,...,...,...,...
13967,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
13968,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
13969,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
13970,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


In [39]:
data.info()
useless_columns = ["salary", 'salary_currency']
data = data.drop(useless_columns, axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13972 entries, 0 to 13971
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           13972 non-null  int64 
 1   experience_level    13972 non-null  object
 2   employment_type     13972 non-null  object
 3   job_title           13972 non-null  object
 4   salary              13972 non-null  int64 
 5   salary_currency     13972 non-null  object
 6   salary_in_usd       13972 non-null  int64 
 7   employee_residence  13972 non-null  object
 8   remote_ratio        13972 non-null  int64 
 9   company_location    13972 non-null  object
 10  company_size        13972 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.2+ MB


In [40]:
cat_cols = data.select_dtypes(include='object').columns
int_cols = data.select_dtypes(include="int64").columns

In [42]:
for col in cat_cols:
    print("\n")
    print(col, " : ", data[col].unique())



experience_level  :  ['SE' 'MI' 'EN' 'EX']


employment_type  :  ['FT' 'CT' 'PT' 'FL']


job_title  :  ['AI Engineer' 'Machine Learning Engineer'
 'Business Intelligence Developer' 'Data Engineer' 'Data Scientist'
 'Cloud Database Engineer' 'Research Engineer' 'Data Analyst'
 'Machine Learning Scientist' 'Applied Scientist' 'Data Science Manager'
 'Research Scientist' 'Prompt Engineer' 'Data Science'
 'Data Science Consultant' 'Data Management Analyst' 'Research Analyst'
 'Data Operations Analyst' 'Data Management Consultant'
 'Business Intelligence Analyst' 'Analytics Engineer'
 'Data Quality Analyst' 'Data Architect' 'Data Manager' 'ML Engineer'
 'Robotics Software Engineer' 'Machine Learning Researcher' 'AI Architect'
 'Data DevOps Engineer' 'Business Intelligence' 'AI Software Engineer'
 'Data Integration Engineer' 'Data Operations Specialist' 'BI Analyst'
 'Data Product Manager' 'Business Intelligence Engineer' 'Data Specialist'
 'AI Research Scientist' 'Data Science Director' '

In [43]:
# Преобразуем данные
from feature_engine.encoding import RareLabelEncoder

label = 'salary_in_usd'
data[label] = data[label] * 1e-3 # Зарплата в долларах -> Зарплата в тысячах долларов

percentile = np.percentile(data[label], [1, 99])
data = data[(data[label] > percentile[0]) & data[label] < percentile[1]]

experience_level = {
    "SE": "Senior-level / Expert",
    "MI": "Middle-level / Intermidate",
    "EN": "Entry-level / Junior",
    "EX": "Executive-level / Director"
}
data["experience_level"] = data['experience_level'].replace(experience_level)

data['job_title'].replace("ML Engineer", "Machine Learning Engineer", inplace=True)

employment_type = {
    "FT": "Full-time",
    "CT": "Contract",
    "PT": "Part-time",
    "FL": "Freelance"
}

data["employment_type"] = data['employment_type'].replace(employment_type)

remote_ratio = {
    0: 'No remote work',
    50: 'Half remote work',
    100: "Fully remote work"
}
data['remote_ratio'] = data['remote_ratio'].replace(remote_ratio)

company_size = {
    "L": "Large",
    "M": "Medium",
    "S": "Small"
}
data['company_size'] = data['company_size'].replace(company_size)

for col in ['experience_level', 'job_title', 'employment_type', 'company_location', 'employee_residence']:
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=50, replace_with="Other", tol=20/data.shape[0])
    data[col] = encoder.fit_transform(data[[col]])

In [44]:
data

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2024,Senior-level / Expert,Full-time,AI Engineer,90.000,Other,No remote work,Other,Large
1,2024,Senior-level / Expert,Full-time,Machine Learning Engineer,180.500,US,No remote work,US,Medium
2,2024,Senior-level / Expert,Full-time,Machine Learning Engineer,96.200,US,No remote work,US,Medium
3,2024,Senior-level / Expert,Full-time,Machine Learning Engineer,235.000,AU,No remote work,AU,Medium
4,2024,Senior-level / Expert,Full-time,Machine Learning Engineer,175.000,AU,No remote work,AU,Medium
...,...,...,...,...,...,...,...,...,...
13967,2020,Senior-level / Expert,Full-time,Data Scientist,412.000,US,Fully remote work,US,Large
13968,2021,Middle-level / Intermidate,Full-time,Other,151.000,US,Fully remote work,US,Large
13969,2020,Entry-level / Junior,Full-time,Data Scientist,105.000,US,Fully remote work,US,Small
13970,2020,Entry-level / Junior,Contract,Business Data Analyst,100.000,US,Fully remote work,US,Large
