In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation and fit-failure warnings;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Read the dataset from the working directory into the frame the feature
# selection below is taken from.
df = pd.read_csv(filepath_or_buffer='eda_data.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [4]:
# List every available column; the subset used for modelling is picked in the next cell.
df.columns 

In [5]:
# Columns kept for modelling: the target (avg_salary) first, then the predictors.
# The order is preserved because it determines the column order after encoding.
model_columns = [
    'avg_salary',
    'Rating',
    'Size',
    'Type of ownership',
    'Industry',
    'Sector',
    'Revenue',
    'num_comp',
    'hourly',
    'employer_provided',
    'job_state',
    'same_state',
    'age',
    'python_yn',
    'spark',
    'aws',
    'excel',
    'job_simp',
    'seniority',
    'desc_len',
]
df_model = df[model_columns]
df_model.shape

In [6]:
# One-hot encode the selected frame with pandas' default dummy encoding.
df_dum = pd.get_dummies(data=df_model)
df_dum.shape

In [7]:
from sklearn.model_selection import train_test_split

# Target is avg_salary; the features are every other encoded column.
y = df_dum['avg_salary']
X = df_dum.drop('avg_salary', axis=1)

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits. Both names are rebound to the scaled arrays.
st = StandardScaler().fit(X_train)
X_train = st.transform(X_train)
X_test = st.transform(X_test)


In [9]:
# Multiple linear regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, LinearRegression  # Lasso is used by the next cell

lm = LinearRegression().fit(X_train, y_train)

# Per-fold negative mean absolute error on the training split (values closer to 0 are better).
cross_val_score(estimator=lm, X=X_train, y=y_train, scoring='neg_mean_absolute_error')

In [10]:
# Lasso regression with the estimator's default settings.
lm_l = Lasso().fit(X_train, y_train)

# Per-fold negative mean absolute error on the training split.
cross_val_score(estimator=lm_l, X=X_train, y=y_train, scoring='neg_mean_absolute_error')

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor. The seed is fixed so the fitted forest and its
# cross-validation scores are identical on every Restart & Run All; without it
# each run bootstraps and splits differently and the reported numbers drift.
lm_rf = RandomForestRegressor(random_state=42)
lm_rf.fit(X_train, y_train)

# Per-fold negative mean absolute error on the training split.
cross_val_score(lm_rf, X_train, y_train, scoring='neg_mean_absolute_error')

In [12]:
# Tuning with a randomized search (a random sample of candidates, not an exhaustive grid search)
from sklearn.model_selection import RandomizedSearchCV

parameters = {
    'n_estimators': range(10, 300, 10),
    'criterion': ('poisson', 'friedman_mse', 'squared_error', 'absolute_error'),
    # 1.0 (consider all features) replaces 'auto', which meant the same thing for
    # regressors but was deprecated in scikit-learn 1.1 and removed in 1.3; on newer
    # versions every candidate that drew 'auto' failed to fit and was scored NaN.
    'max_features': (1.0, 'sqrt', 'log2'),
}

# random_state fixes which candidates are sampled so the search is repeatable.
gs = RandomizedSearchCV(
    lm_rf,
    parameters,
    scoring='neg_mean_absolute_error',
    random_state=42,
)
gs.fit(X_train, y_train)



In [13]:
# Report the winning hyper-parameters and their mean cross-validated score.
for label, value in (
    ('Best parameters are : ', gs.best_params_),
    ('Best score is : ', gs.best_score_),
):
    print(label, value)

In [14]:
# Predict the held-out rows with each fitted model: linear regression, lasso,
# and the best estimator found by the search.
y_pred_lm, y_pred_lm_l = (model.predict(X_test) for model in (lm, lm_l))
y_pred_gs = gs.best_estimator_.predict(X_test)

In [15]:
from sklearn.metrics import mean_absolute_error,accuracy_score

# Test-set mean absolute error, printed one per line in this order:
# linear regression, lasso regression, tuned random forest.
for predictions in (y_pred_lm, y_pred_lm_l, y_pred_gs):
    print(mean_absolute_error(y_test, predictions))


In [16]:
# Rebind X_test to a DataFrame wrapping the same values.
X_test = pd.DataFrame(data=X_test)

In [17]:
import pickle

In [18]:
# Persist the fitted search object (it carries best_estimator_) to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; the bare open() used before left the handle open.
file_name = 'trained_model.sav'
with open(file_name, 'wb') as model_file:
    pickle.dump(gs, model_file)


In [19]:
# Load the persisted search object back from disk.
# The context manager closes the file handle once unpickling finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open('trained_model.sav', 'rb') as model_file:
    loadin_model = pickle.load(model_file)

In [20]:
# Take the second test row (position 1) and shape it as a one-sample 2-D array,
# which is the input layout the estimator's predict() expects.
X_test_df = pd.DataFrame(X_test)
input_data = np.asarray(X_test_df.iloc[1]).reshape(1, -1)

# Predict with the reloaded model; y_pred_gs is rebound to this single-row prediction.
y_pred_gs = loadin_model.best_estimator_.predict(input_data)
y_pred_gs