# 1. Import libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# 2. Load modeling data

In [3]:
df_modeling = pd.read_csv('df_modeling.csv')
df_modeling_drop_first = pd.read_csv('df_modeling_drop_first.csv')
dropped_columns = pd.read_csv('dropped_columns.csv')

In [4]:
df_modeling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Columns: 1014 entries, Unnamed: 0 to Levels_Unknown
dtypes: float64(367), int64(647)
memory usage: 10.4 MB


In [5]:
df_modeling.head()

Unnamed: 0.1,Unnamed: 0,Salary,Analysis skills,Communication skills,Research,R,Machine learning,Bachelor's degree,Master's degree,Doctoral degree,...,"Location_West Sacramento, CA","Location_Westlake Village, CA","Location_Woodland Hills, CA",Levels_Distinguished,Levels_Jr.,Levels_Lead,Levels_Principal,Levels_Sr.,Levels_Staff,Levels_Unknown
0,0,110000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,1,0,0,0,0,0
1,1,120000.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,2,150000.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,1,0
3,3,120000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,4,150000.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
dropped_columns.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,Levels_Distinguished
1,1,Company_23andMe
2,2,"Location_Alameda, CA"
3,3,Title_Applied Scientist


# 3. Model data cleaning

In [12]:
df_modeling = df_modeling.drop('Unnamed: 0', axis=1)

In [13]:
for i in df_modeling.columns[1:]:
    df_modeling.loc[:,i] = df_modeling.loc[:,i].astype('Int64')

In [14]:
X = df_modeling.copy()
y = X.pop('Salary')

# 4. Train test split

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

# 5. Model baselines

The OLS analysis model performed well; however, the majority of the predictors were not statistically significant. In addition, all of the predictors are discrete which creates sparsity in the data and there is high-dimensionality.

Therefore, I will apply the following:
* linear regression (with lasso regularization, ridge regularization, and elastic net regularization)
* tree-based models (specifically random forest)

Since the linear model performed well and there were predictors which are not statistically significant, a regularization technique may reduce potential overfitting. The tree-based models and support vector machines tend to work well on sparse data.

In [55]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std

In [19]:
lm = LinearRegression()
lm.fit(X_train, y_train)
cv = cross_val_score(lm,X_train,y_train,scoring='neg_mean_squared_error',cv=10)
print('LinearRegression')
print(mean(cv), '+/-', std(cv))

LinearRegression
-1.4351869377474988e+30 +/- 1.7994501092402512e+30


In [25]:
lm_l = Lasso(max_iter = 10000, random_state = 1)
cv = cross_val_score(lm_l,X_train,y_train,scoring='neg_mean_squared_error',cv=10)
print('Lasso')
print(mean(cv), '+/-', std(cv))

Lasso
-911538321.3442265 +/- 292298250.7041396


In [28]:
rid = Ridge(max_iter = 10000, random_state = 1)
cv = cross_val_score(rid,X_train,y_train,scoring='neg_mean_squared_error',cv=10)
print('Ridge')
print(mean(cv), '+/-', std(cv))

Ridge
-507765245.32729733 +/- 164382925.5440549


In [30]:
enr = ElasticNet(max_iter = 10000, random_state = 1)
cv = cross_val_score(enr,X_train,y_train,scoring='neg_mean_squared_error',cv=5)
print('ElasticNet')
print(mean(cv), '+/-', std(cv))

ElasticNet
-952939792.7692659 +/- 275975997.56510335


In [31]:
rf = RandomForestRegressor(random_state = 1)
cv = cross_val_score(rf,X_train,y_train,scoring='neg_mean_squared_error',cv=10)
print('RandomForestRegressor')
print(mean(cv), '+/-', std(cv))

RandomForestRegressor
-459244915.9269306 +/- 180045932.81447726


In [56]:
gbr = GradientBoostingRegressor(random_state = 1)
cv = cross_val_score(gbr,X_train,y_train,scoring='neg_mean_squared_error',cv=10)
print('GradientBoostingRegressor')
print(mean(cv), '+/-', std(cv))

GradientBoostingRegressor
-486736029.6924752 +/- 178881458.5929402


# 6. Hyperparameter tuning

In [57]:
from sklearn.model_selection import GridSearchCV 

def reg_performance(regressor, model_name):
    print(model_name)
    print('Best Score: {} +/- {}'.format(str(regressor.best_score_),str(regressor.cv_results_['std_test_score'][regressor.best_index_])))
    print('Best Parameters: ' + str(regressor.best_params_))

In [66]:
lm = LinearRegression()
param_grid = {
                'fit_intercept':[True,False],
                'normalize':[True,False],
                'copy_X':[True, False]
}
reg_lm = GridSearchCV(lm, param_grid = param_grid, cv = 10, scoring='neg_mean_squared_error', n_jobs = -1)
best_reg_lm = reg_lm.fit(X_train,y_train)
reg_performance(best_reg_lm,'LinearRegressor')

LinearRegressor
Best Score: -3.252478268487173e+30 +/- 2.975462297518778e+30
Best Parameters: {'copy_X': True, 'fit_intercept': False, 'normalize': True}


In [70]:
rf = RandomForestRegressor(random_state = 1)
param_grid = {
                'n_estimators': np.arange(10,200,10) , 
                'bootstrap': [True, False],
                'max_depth': np.arange(2,15,2),
                'max_features': ['auto', 'sqrt', 'log2'],
                'min_samples_leaf': np.arange(2,5,1),
                'min_samples_split': np.arange(2,5,1)
              }
reg_rf = GridSearchCV(rf, param_grid = param_grid, cv = 5, scoring='neg_mean_squared_error', n_jobs = -1)
best_reg_rf = reg_rf.fit(X_train,y_train)
reg_performance(best_reg_rf,'RandomForestRegressor')

KeyboardInterrupt: 