In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score,KFold
from catboost import CatBoostRegressor

In [2]:
df = pd.read_csv('data/final_df.csv')

In [3]:
df.head()

Unnamed: 0,salary_in_usd,company_size_ordinal,experience_level_ordinal,employment_type_FL,employment_type_FT,employment_type_PT,company_location_AL,company_location_AM,company_location_AR,company_location_AS,...,job_title_Principal Data Engineer,job_title_Principal Data Scientist,job_title_Principal Machine Learning Engineer,job_title_Product Data Analyst,job_title_Product Data Scientist,job_title_Research Engineer,job_title_Research Scientist,job_title_Software Data Engineer,job_title_Staff Data Analyst,job_title_Staff Data Scientist
0,85847,2,2,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,30000,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,25500,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,175000,1,2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,120000,1,2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
x = df.drop(columns=['salary_in_usd'],axis=1)

In [5]:
y = df['salary_in_usd']

In [6]:
x.head()

Unnamed: 0,company_size_ordinal,experience_level_ordinal,employment_type_FL,employment_type_FT,employment_type_PT,company_location_AL,company_location_AM,company_location_AR,company_location_AS,company_location_AT,...,job_title_Principal Data Engineer,job_title_Principal Data Scientist,job_title_Principal Machine Learning Engineer,job_title_Product Data Analyst,job_title_Product Data Scientist,job_title_Research Engineer,job_title_Research Scientist,job_title_Software Data Engineer,job_title_Staff Data Analyst,job_title_Staff Data Scientist
0,2,2,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
y.head()

0     85847
1     30000
2     25500
3    175000
4    120000
Name: salary_in_usd, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape, x_test.shape

((3004, 245), (751, 245))

In [9]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

In [10]:
corr_features = correlation(x_train, 0.8)
len(set(corr_features))

47

In [11]:
corr_features

{'employee_residence_AM',
 'employee_residence_AU',
 'employee_residence_BA',
 'employee_residence_BE',
 'employee_residence_BR',
 'employee_residence_CA',
 'employee_residence_CF',
 'employee_residence_CH',
 'employee_residence_CO',
 'employee_residence_CZ',
 'employee_residence_DE',
 'employee_residence_DK',
 'employee_residence_DZ',
 'employee_residence_EG',
 'employee_residence_ES',
 'employee_residence_FI',
 'employee_residence_FR',
 'employee_residence_GB',
 'employee_residence_GH',
 'employee_residence_GR',
 'employee_residence_HN',
 'employee_residence_HR',
 'employee_residence_HU',
 'employee_residence_IE',
 'employee_residence_IN',
 'employee_residence_IQ',
 'employee_residence_IR',
 'employee_residence_JP',
 'employee_residence_KE',
 'employee_residence_LT',
 'employee_residence_LV',
 'employee_residence_MA',
 'employee_residence_MD',
 'employee_residence_MK',
 'employee_residence_MX',
 'employee_residence_NG',
 'employee_residence_NL',
 'employee_residence_NZ',
 'employee_r

In [12]:
x_train.drop(corr_features,axis=1)
x_test.drop(corr_features,axis=1)

Unnamed: 0,company_size_ordinal,experience_level_ordinal,employment_type_FL,employment_type_FT,employment_type_PT,company_location_AL,company_location_AM,company_location_AR,company_location_AS,company_location_AT,...,job_title_Principal Data Engineer,job_title_Principal Data Scientist,job_title_Principal Machine Learning Engineer,job_title_Product Data Analyst,job_title_Product Data Scientist,job_title_Research Engineer,job_title_Research Scientist,job_title_Software Data Engineer,job_title_Staff Data Analyst,job_title_Staff Data Scientist
2148,1,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1044,1,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3321,1,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
439,2,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3519,2,3,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2403,1,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2897,1,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
612,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2677,1,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
model = RandomForestRegressor()
kfold_validation = KFold(5)
results = cross_val_score(model, x_train, y_train, cv=kfold_validation)

model.fit(x_train, y_train)


y_pred_test = model.predict(x_test)


r2_test = r2_score(y_test, y_pred_test) * 100
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

print("Test Set R2 Score:",round(r2_test,2),"%" )
print("Test Set Mean Absolute Error:",mae_test)
print("Test Set Mean Squared Error:",mse_test)
print("Test Set Root Mean Squared Error:",rmse_test)


Test Set R2 Score: 41.49 %
Test Set Mean Absolute Error: 36306.17274162351
Test Set Mean Squared Error: 2309705199.381228
Test Set Root Mean Squared Error: 48059.3924158559
