In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('cleaned_data.csv')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         338 non-null    int64  
 1   Job Title          338 non-null    object 
 2   Salary Estimate    338 non-null    object 
 3   Job Description    337 non-null    object 
 4   Rating             338 non-null    float64
 5   Company Name       338 non-null    object 
 6   Location           338 non-null    object 
 7   Headquarters       338 non-null    object 
 8   Size               338 non-null    object 
 9   Founded            338 non-null    int64  
 10  Type of ownership  338 non-null    object 
 11  Industry           338 non-null    object 
 12  Sector             338 non-null    object 
 13  Revenue            338 non-null    object 
 14  Competitors        338 non-null    object 
 15  monthly            338 non-null    int64  
 16  min_salary         338 non

In [5]:
#df_model = df[['Avg Salary','Rating', 'Size','Type of ownership','Industry', 'Sector', 'Revenue','Competitors','hq_base','hq_city','lochq','age','python_yn','spark_yn','aws_yn','excel_yn','job_type','job_seniority','job_len']]

In [6]:
df_model = df[['avg_salary','Rating', 'Size','Type of ownership','Industry', 'Sector', 'Revenue','comp_count','monthly',
             'job_state','same_state','age','python_yn','spark_yn','aws_yn','excel_yn','job_simp','seniority','descr_length']]


KeyError: "['job_simp', 'descr_length', 'seniority', 'comp_count'] not in index"

In [None]:
df_dum = pd.get_dummies(df_model)

In [None]:
x=df_dum.drop(['Avg Salary'],1)
y=df_dum['Avg Salary']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
lm = LinearRegression()

lm.fit(x_train, y_train)

lm_pred=lm.predict(x_test)

print('mae', mean_absolute_error(y_test, lm_pred))

print('cv mae', np.mean(cross_val_score(lm,x_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3)))

In [None]:
lr = Ridge()

lr.fit(x_train, y_train)

lr_pred=lr.predict(x_test)

print('mae', mean_absolute_error(y_test, lr_pred))

print('cv mae', np.mean(cross_val_score(lr,x_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3)))

In [None]:
ll = Lasso()

ll.fit(x_train, y_train)

ll_pred=ll.predict(x_test)

print('mae', mean_absolute_error(y_test, ll_pred))

print('cv mae', np.mean(cross_val_score(ll,x_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3)))

In [None]:
#Multiple Linear Regression/ OLS model 
import statsmodels.api as sm

x_sm = x = sm.add_constant(x)
model = sm.OLS(y,x_sm)
model.fit().summary()

#Second type/ Linear Regression
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score

lm = LinearRegression()
lm.fit(x_train, y_train)

np.mean(cross_val_score(lm,x_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))


In [None]:
# lasso regression 
lm_l = Lasso(alpha=.13)
lm_l.fit(x_train,y_train)
np.mean(cross_val_score(lm_l,x_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))

alpha = []
error = []

for i in range(1,100):
    alpha.append(i/100)
    lml = Lasso(alpha=(i/100))
    error.append(np.mean(cross_val_score(lml,x_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3)))
    
plt.plot(alpha,error)

err = tuple(zip(alpha,error))
df_err = pd.DataFrame(err, columns = ['alpha','error'])
df_err[df_err.error == max(df_err.error)]

In [None]:
# random forest 
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

np.mean(cross_val_score(rf,x_train,y_train,scoring = 'neg_mean_absolute_error', cv= 3))


# tune models GridsearchCV 
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':range(10,300,10), 'criterion':('mse','mae'), 'max_features':('auto','sqrt','log2')}

gs = RandomizedSearchCV(rf,parameters,scoring='neg_mean_absolute_error',cv=3)
gs.fit(x_train,y_train)

gs.best_score_
gs.best_estimator_


In [None]:
# test ensembles 
tpred_lm = lm.predict(x_test)
tpred_lml = lm_l.predict(x_test)
tpred_rf = gs.best_estimator_.predict(x_test)

from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test,tpred_lm))
print(mean_absolute_error(y_test,tpred_lml))
print(mean_absolute_error(y_test,tpred_rf))

print(mean_absolute_error(y_test,(tpred_lm+tpred_rf)/2))