In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np


In [2]:
df = pd.read_csv('data_eda.csv')

# pd.set_option('max_columns',31)
# pd.reset_option(“max_columns”)

df.head(3)

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,...,Python,Tableau,Excel,Power BI,SAS,SQL,SSIS,Job Simplified,Seniority,Desc_length
0,1,Entry Level Data Conversion Analyst,$35K - $71K (Glassdoor est.),"As the Data Conversion Analyst, you will be co...",3.4,Reynolds and Reynolds\r\n3.4,"Dayton, OH",1001 to 5000 Employees,1866,Company - Private,...,0,0,0,0,0,0,0,na,na,849
1,3,Senior Data Analyst with Visualization (No spo...,$63K - $149K (Glassdoor est.),Senior Data Analyst with Visualization (No spo...,3.7,HCL Technologies\r\n3.7,"Burlingame, CA",10000+ Employees,1991,Company - Public,...,0,0,0,0,0,0,0,data analyst,senior,910
2,4,Data Analyst,$43K - $88K (Glassdoor est.),Dematic is looking for a Data Analyst that wil...,3.8,Dematic\r\n3.8,"Atlanta, GA",5001 to 10000 Employees,1819,Subsidiary or Business Segment,...,0,0,0,0,0,0,0,data analyst,na,667


In [3]:
#Feature selection
df.columns


Index(['index', 'Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Size', 'Founded', 'Type of ownership',
       'Industry', 'Sector', 'Revenue', 'Hourly', 'Employer provided',
       'Min Salary', 'Max Salary', 'Avg Salary', 'Company_text', 'State',
       'Age', 'Python', 'Tableau', 'Excel', 'Power BI', 'SAS', 'SQL', 'SSIS',
       'Job Simplified', 'Seniority', 'Desc_length'],
      dtype='object')

In [4]:
df_model = df[['Avg Salary','Rating','Size','Type of ownership','Industry','Sector','Revenue','Hourly','Employer provided',
               'State','Age','Excel','Power BI','SQL','SSIS','Job Simplified','Seniority','Desc_length']]
df_model.shape

(527, 18)

In [5]:
# Dummy columnns

df_dum = pd.get_dummies(df_model)
df_dum.shape

(527, 116)

In [6]:
# Train test split

from sklearn.model_selection import train_test_split

X = df_dum.drop('Avg Salary', axis=1)
y = df_dum['Avg Salary'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

linr = LinearRegression()
linr.fit(X_train,y_train)

print(linr.score(X_train,y_train))
print(np.mean(cross_val_score(linr,X_train,y_train,scoring='neg_mean_absolute_error',cv=3)))

0.18493623628720057
-28759702541.842426


In [8]:
#Lasso Regression

from sklearn.linear_model import Lasso

lassr = Lasso(alpha=0.09)
lassr.fit(X_train,y_train)
print(lassr.score(X_train,y_train))
print(np.mean(cross_val_score(lassr,X_train,y_train,scoring='neg_mean_absolute_error',cv=3)))

# Find the best alpha parameter

# alpha = []
# error = []

# for i in range(1,100):
#     alpha.append(i/100)
#     lassr = Lasso(alpha=(i/100))
#     error.append(np.mean(cross_val_score(lassr,X_train,y_train,scoring='neg_mean_absolute_error',cv=3)))

# plt.plot(alpha,error)

# err=tuple(zip(alpha,error))
# df_err = pd.DataFrame(err,columns=['alpha','error'])
# df_err[df_err['error'] == max(df_err['error'])]


0.04030348987716326
-5.294216360832031


In [9]:
#Random forst Regression

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

print(rf.score(X_train,y_train))
print(np.mean(cross_val_score(rf,X_train,y_train,scoring='neg_mean_absolute_error',cv=3)))

0.17673568446074195
-5.140191843581255


In [10]:
# Tune models GridSearchCV
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators':range(10,300,10),'criterion':('mse','mae'),'max_features':('auto','sqrt','log2')}

gs = GridSearchCV(rf,parameters,scoring='neg_mean_absolute_error',cv=3)
gs.fit(X_train,y_train)

print(gs.best_score_)
print(gs.best_estimator_)

-4.164934650455927
RandomForestRegressor(criterion='mae', max_features='log2')


In [11]:
test_pred_linr = linr.predict(X_test) #test_pred_linr is y_pred in other terms
test_pred_lassr = lassr.predict(X_test)
test_pred_rf = gs.best_estimator_.predict(X_test)

from sklearn.metrics import mean_absolute_error
print("Linear Regression MAE : ",mean_absolute_error(y_test,test_pred_linr))
print("Lasso Regression MAE : ",mean_absolute_error(y_test,test_pred_lassr))
print("Random Forest Regression MAE : ",mean_absolute_error(y_test,test_pred_rf))

# print(test_pred_rf)
# print(y_test)





Linear Regression MAE :  49700084922.000885
Lasso Regression MAE :  6.287880839938828
Random Forest Regression MAE :  5.547665094339623


In [21]:
##### Optimal model building with PyCaret

numeric_fea = ['Rating','Founded','Min Salary','Max Salary','Age','Desc_length']

from pycaret.regression import *
experiment = setup(df,target='Avg Salary',numeric_features=numeric_fea)

Unnamed: 0,Description,Value
0,session_id,3787
1,Target,Avg Salary
2,Original Data,"(527, 31)"
3,Missing Values,False
4,Numeric Features,7
5,Categorical Features,23
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(368, 121)"


In [22]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,0.3532,7.9833,1.9593,0.8759,0.0263,0.0053,0.089
huber,Huber Regressor,0.1574,8.1662,0.9162,0.8648,0.0114,0.0028,0.023
en,Elastic Net,0.7738,8.9537,1.7993,0.8518,0.0254,0.012,0.006
lasso,Lasso Regression,0.789,8.9639,1.8258,0.8516,0.0259,0.0123,0.006
gbr,Gradient Boosting Regressor,0.381,11.1039,2.0627,0.8359,0.0273,0.0056,0.038
ridge,Ridge Regression,0.5825,10.1555,2.0946,0.8279,0.0284,0.0089,0.007
lr,Linear Regression,0.4511,10.9634,2.2488,0.8136,0.0312,0.0072,0.006
br,Bayesian Ridge,0.4469,10.9672,2.2498,0.8135,0.0312,0.0071,0.023
et,Extra Trees Regressor,0.3514,8.572,2.0742,0.7999,0.0285,0.0053,0.082
dt,Decision Tree Regressor,0.4343,13.5216,2.4193,0.7995,0.0328,0.0067,0.007


In [23]:
print(best)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=3787, verbose=0, warm_start=False)


In [24]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [26]:
# Predict on test set 
predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,0.0681,0.4356,0.66,0.9929,0.0097,0.0011


Unnamed: 0,index,Rating,Founded,Min Salary,Max Salary,Desc_length,Job Title_Business Project Analyst - Data Management (100% Remote),Job Title_Data Analyst,Job Title_Data Analyst Partnerships & Curation,Job Title_Data Quality Analyst,...,SAS_0,SQL_1,SSIS_0,Job Simplified_analytics,Job Simplified_data analyst,Job Simplified_na,Seniority_na,Seniority_senior,Avg Salary,Label
0,8.0,3.6,2002.0,43.0,85.0,514.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,64.0,72.165
1,64.0,3.9,1969.0,50.0,89.0,128.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,69.5,69.500
2,267.0,4.3,1946.0,35.0,71.0,905.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,53.0,53.000
3,917.0,3.6,2002.0,56.0,100.0,514.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,78.0,78.000
4,199.0,3.8,1819.0,50.0,89.0,667.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,69.5,69.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,43.0,4.5,1997.0,53.0,106.0,787.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,79.5,79.500
155,947.0,3.6,2002.0,56.0,100.0,514.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,78.0,78.000
156,602.0,3.4,1866.0,56.0,100.0,849.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,78.0,78.000
157,168.0,3.8,1865.0,47.0,103.0,605.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,75.0,75.000


In [27]:
# finalize the model
finalize_model(best)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=3787, verbose=0, warm_start=False)

In [28]:
#Save the model
save_model(best, 'my_best_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=['Rating', 'Founded',
                                                           'Min Salary',
                                                           'Max Salary', 'Age',
                                                           'Desc_length'],
                                       target='Avg Salary', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_v...
                  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                        criterion='mse', max_depth=None,
                                        max_features

In [29]:
# create api
create_api(best, 'rf_model_api')


API sucessfully created. This function only creates a POST API, it doesn't run it automatically.

To run your API, please run this command --> !python rf_model_api.py
    


In [31]:
# run api
!python rf_model_api.py

Transformation Pipeline and Model Successfully Loaded


INFO:     Started server process [11224]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
ERROR:    [Errno 10048] error while attempting to bind on address ('127.0.0.1', 8000): une seule utilisation de chaque adresse de socket (protocole/adresse réseau/port) est habituellement autorisée
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.


In [None]:
# Predict on new data 
# predictions = predict_model(best, data=data)
# predictions.head()

In [None]:
# load pipeline
# load_model('my_best_pipeline')