## Model Building

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
df = pd.read_csv('eda_data.csv')

In [47]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,age,python,r,spark,aws,excel,job_simp,seniority,desc_len,num_Competitors
0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\r\nLocation: Albuquerque, NM\r\...",3.8,Tecolote Research\r\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,...,47,1,0,0,0,1,data scientist,na,2555,0
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\r\n\r\nI. General Summary\r\...,3.4,University of Maryland Medical System\r\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,...,36,1,0,0,0,0,data scientist,na,4828,0
2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\r\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,...,10,1,0,1,0,1,data scientist,na,3495,0
3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\r\nJob ID: 310709\r...,3.8,PNNL\r\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,...,55,1,0,0,0,0,data scientist,na,3926,3
4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\r\nAffinity Solutions / Marketi...,2.9,Affinity Solutions\r\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,...,22,1,0,0,0,1,data scientist,na,2748,3


In [7]:
df.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'Hourly', 'Employer_provided', 'min_salary', 'max_salary', 'avg_salary',
       'Company_txt', 'job_state', 'same_state', 'age', 'python', 'r', 'spark',
       'aws', 'excel', 'job_simp', 'seniority', 'desc_len', 'num_Competitors'],
      dtype='object')

In [9]:
#here i selected required columns for model building
model_columns = df[['avg_salary','Rating','Size','Type of ownership', 'Industry', 'Sector', 'Revenue','Hourly', 'Employer_provided','job_state', 'same_state', 'age', 'python', 'r', 'spark',
       'aws', 'excel', 'job_simp', 'seniority', 'desc_len', 'num_Competitors']]

In [11]:
df_dummy = pd.get_dummies(model_columns)
df_dummy

Unnamed: 0,avg_salary,Rating,Hourly,Employer_provided,same_state,age,python,r,spark,aws,...,job_simp_analyst,job_simp_data engineer,job_simp_data scientist,job_simp_director,job_simp_manager,job_simp_mle,job_simp_na,seniority_jr,seniority_na,seniority_sr
0,72.0,3.8,0,0,0,47,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,87.5,3.4,0,0,0,36,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,85.0,4.8,0,0,1,10,1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
3,76.5,3.8,0,0,1,55,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,114.5,2.9,0,0,1,22,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,84.5,3.9,0,0,0,190,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
738,102.5,4.4,0,0,0,14,1,0,1,1,...,0,1,0,0,0,0,0,0,0,1
739,73.5,2.6,0,0,1,36,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
740,127.5,3.2,0,0,0,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [14]:
#Train test split
from sklearn.model_selection import train_test_split
input = df_dummy.drop(['avg_salary'],axis = 1)
target = df_dummy.avg_salary.values
x_train,x_test,y_train,y_test = train_test_split(input, target, test_size =0.3, random_state =40)

### Model testing to low error 

In [21]:
from sklearn.linear_model import LinearRegression ,Lasso
from sklearn.model_selection import cross_val_score

lr = LinearRegression()
lr.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
np.mean(cross_val_score(lr,x_train,y_train, scoring='neg_mean_absolute_error',cv=3))

-308535025.3568651

In [35]:
lasso = Lasso(alpha= 0.13)
lasso.fit(x_train,y_train)
np.mean(cross_val_score(lasso,x_train,y_train, scoring='neg_mean_absolute_error',cv=3))

-19.297665640156612

In [27]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
np.mean(cross_val_score(rf,x_train,y_train, scoring='neg_mean_absolute_error',cv=3))

-15.309248554913296

In [30]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'n_estimators' : range(10,200,10),
    'criterion' : ('mse','mae'),
    'max_features' : ('auto','sqrt','log2')
}
gs = GridSearchCV(rf,parameters,scoring = 'neg_mean_absolute_error',cv=3)
gs.fit(x_train,y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [32]:
gs.best_score_

-15.068657675016055

In [34]:
gs.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=30, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [44]:
pre_lr = lr.predict(x_test)
pre_lasso = lasso.predict(x_test)
pre_rf = gs.best_estimator_.predict(x_test)

In [45]:
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test,pre_lr))
print(mean_absolute_error(y_test,pre_lasso))
print(mean_absolute_error(y_test,pre_rf))

21.79371752831715
21.397500288581885
14.407324364723467


In [46]:
import pickle
pickl = {'model': gs.best_estimator_}
pickle.dump( pickl, open( 'model_file' + ".p", "wb" ) )
