# Imports

In [1]:
import pickle
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

# Dataset

In [2]:
df = pd.read_csv("../data/full_df.csv")

In [3]:
df.head()

Unnamed: 0,title,street,city,country_code,address_text,marker_icon,workplace_type,company_name,company_url,company_size,...,remote,contract_type,salary_min,salary_max,currency,num_technology,technology,levels,salary_mean,company_size_bin
0,UI/UX Designer,Krucza 50,Warszawa,PL,"Krucza 50, Warszawa",ux,office,ShareSpace,http://www.sharespace.work,20,...,False,b2b,9000.0,12000.0,pln,3,"['UX Design', 'UI Design', 'Figma']","[4, 5, 5]",10500.0,small
1,Test Automation Engineer,ul. Krucza 50,Warszawa,PL,"ul. Krucza 50, Warszawa",testing,office,ShareSpace,http://www.sharespace.work,20,...,False,b2b,10000.0,12000.0,pln,3,"['JavaScript', 'Postman', 'Cypress']","[2, 3, 3]",11000.0,small
2,IT Support Specialist,Krucza 50,Warszawa,PL,"Krucza 50, Warszawa",support,office,ShareSpace,http://www.sharespace.work,20,...,False,b2b,6000.0,8000.0,pln,3,"['REST API', 'GSUITE', 'JIRA']","[2, 2, 3]",7000.0,small
3,Programista Wordpress,Centrum,Kraków,PL,"Centrum, Kraków",php,office,L.GRANT,https://www.lgrant.com/,10,...,False,permanent,7000.0,9000.0,pln,3,"['Responsive Web Design', 'WordPress', 'PHP 7']","[3, 3, 3]",8000.0,small
4,Tester / Analyst,Wołoska 5,Warszawa,PL,"Wołoska 5, Warszawa",testing,partly_remote,Turbine Analytics S.A.,http://turbineanalytics.com,33,...,False,b2b,6000.0,11000.0,pln,3,"['Test Automation', 'Testing Tools', 'Testing']","[3, 3, 3]",8500.0,small


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4610 entries, 0 to 4609
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             4610 non-null   object 
 1   street            4606 non-null   object 
 2   city              4610 non-null   object 
 3   country_code      4610 non-null   object 
 4   address_text      4610 non-null   object 
 5   marker_icon       4610 non-null   object 
 6   workplace_type    4610 non-null   object 
 7   company_name      4610 non-null   object 
 8   company_url       4610 non-null   object 
 9   company_size      4610 non-null   int64  
 10  experience_level  4610 non-null   object 
 11  latitude          4610 non-null   float64
 12  longitude         4610 non-null   float64
 13  published_at      4610 non-null   object 
 14  remote_interview  4610 non-null   bool   
 15  id                4610 non-null   object 
 16  employment_types  4610 non-null   object 


In [5]:
to_drop = [
    "title",
    "street",
    "address_text",
    "company_name",
    "company_url",
    "company_size",
    "latitude",
    "longitude",
    "published_at",
    "id",
    "employment_types",
    "company_logo_url",
    "skills",
    "salary_min",
    "salary_max",
    "currency",
    "num_technology",
    "technology",
    "levels",
]

In [6]:
print("Before: ", len(df))
df = df.drop(to_drop, axis=1)
print("After: ", len(df))

Before:  4610
After:  4610


In [7]:
df = df.dropna()

# Utils

In [8]:
def save_model(name, model):
    with open(f"../app/models/{name}", "wb") as file:
        pickle.dump(model, file)

# Categorical data

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

## Label Encoder

In [10]:
cols = ["workplace_type", "country_code", "contract_type", "company_size_bin"]
df[cols] = df[cols].apply(LabelEncoder().fit_transform)
df.head()

Unnamed: 0,city,country_code,marker_icon,workplace_type,experience_level,remote_interview,remote,contract_type,salary_mean,company_size_bin
0,Warszawa,0,ux,0,mid,True,False,0,10500.0,2
1,Warszawa,0,testing,0,mid,True,False,0,11000.0,2
2,Warszawa,0,support,0,junior,True,False,0,7000.0,2
3,Kraków,0,php,0,mid,False,False,2,8000.0,2
4,Warszawa,0,testing,1,mid,True,False,0,8500.0,2


## GetDummies

In [11]:
features_bool = df.select_dtypes(include="bool").columns
df = pd.get_dummies(df, columns=features_bool, drop_first=True)

features_object = df.select_dtypes(include="object").columns
df = pd.get_dummies(df, columns=features_object)

In [12]:
df.head()

Unnamed: 0,country_code,workplace_type,contract_type,salary_mean,company_size_bin,remote_interview_True,remote_True,city_Białystok,city_Bielsko-Biała,city_Bydgoszcz,...,marker_icon_python,marker_icon_ruby,marker_icon_scala,marker_icon_security,marker_icon_support,marker_icon_testing,marker_icon_ux,experience_level_junior,experience_level_mid,experience_level_senior
0,0,0,0,10500.0,2,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,0,0,0,11000.0,2,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,0,0,0,7000.0,2,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,0,0,2,8000.0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,8500.0,2,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


# Modeling

In [13]:
results = pd.DataFrame(columns=["model", "mae"])

In [14]:
X = df.drop(["salary_mean"], axis=1)
y = df["salary_mean"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=2021
)
print(X_train.shape)
print(X_test.shape)

(1050, 81)
(2450, 81)


In [16]:
# model evaluation function

def model_evaluate(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

## RandomForestRegressor

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [18]:
forest_regressor_model = RandomForestRegressor(
    n_estimators=300, max_depth=7, random_state=2021
).fit(X_train, y_train)

cv_model = cross_val_score(
    forest_regressor_model, X_train, y_train, cv=10, scoring="neg_mean_absolute_error"
)
y_preds = forest_regressor_model.predict(X_test)
cv_model_mean = np.mean(cv_model)

print("Cross val score:", cv_model_mean)
print("MAE: ", model_evaluate(y_test, y_preds))

results = results.append(
    {"model": "Forest Regressor Model", "mae": model_evaluate(y_test, y_preds)},
    ignore_index=True,
)
save_model("forest_regressor_model", forest_regressor_model)

Cross val score: -3030.1133786031237
MAE:  2888.8364718211715


## GradientBoostingRegressor

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

In [20]:
gradient_boosting_model = GradientBoostingRegressor(
    n_estimators=300, learning_rate=0.01, max_depth=6, random_state=2021
).fit(X_train, y_train)

cv_model = cross_val_score(
    gradient_boosting_model, X_train, y_train, cv=10, scoring="neg_mean_absolute_error"
)
y_preds = gradient_boosting_model.predict(X_test)
cv_model_mean = np.mean(cv_model)

print("Cross val score:", cv_model_mean)
print("MAE: ", model_evaluate(y_test, y_preds))

results = results.append(
    {"model": "gradient_boosting_model", "mae": model_evaluate(y_test, y_preds)},
    ignore_index=True,
)
save_model("gradient_boosting_model", gradient_boosting_model)

Cross val score: -3005.477323404631
MAE:  2822.4241278385734


## Xgboost

In [21]:
import xgboost as XGB
from sklearn.model_selection import GridSearchCV

In [22]:
xgb_model = XGB.XGBRegressor(
    base_score=0.5,
    booster="gbtree",
    learning_rate=0.1,
    max_depth=4,
    n_estimators=180,
    random_state=2021,
).fit(X_train, y_train)

cv_model = cross_val_score(
    xgb_model, X_train, y_train, cv=10, scoring="neg_mean_absolute_error"
)

y_preds = xgb_model.predict(X_test)
cv_model_mean = np.mean(cv_model)

print("Cross val score:", cv_model_mean)
print("MAE: ", model_evaluate(y_test, y_preds))

results = results.append(
    {"model": "xgb_model", "mae": model_evaluate(y_test, y_preds)}, ignore_index=True
)
xgb_model.save_model("../app/models/xgb_model")

Cross val score: -2889.8088158017113
MAE:  2766.486018689214


## Catboost

In [23]:
from catboost import CatBoostRegressor, Pool

In [24]:
train_dataset = Pool(X_train, y_train)
test_dataset = Pool(X_test, y_test)

catboost_model = CatBoostRegressor(
    iterations=300, learning_rate=0.1, depth=6, silent=True, random_state=2021
).fit(train_dataset)

cv_model = cross_val_score(
    catboost_model, X_train, y_train, cv=10, scoring="neg_mean_absolute_error"
)

y_preds = catboost_model.predict(X_test)
cv_model_mean = np.mean(cv_model)

print("Cross val score:", cv_model_mean)
print("MAE: ", model_evaluate(y_test, y_preds))

results = results.append(
    {"model": "catboost_model", "mae": model_evaluate(y_test, y_preds)},
    ignore_index=True,
)
save_model("catboost_model", catboost_model)

Cross val score: -2896.088110407503
MAE:  2715.127503109727


## Results

In [25]:
results.head()

Unnamed: 0,model,mae
0,Forest Regressor Model,2888.836472
1,gradient_boosting_model,2822.424128
2,xgb_model,2766.486019
3,catboost_model,2715.127503
