In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy import stats

from sklearn.linear_model import LinearRegression, LogisticRegression
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

%matplotlib inline

In [2]:
# Load the prepared dataset, using the first CSV column as the row index.
df = pd.read_csv('../Processed_Data/prepared.csv', index_col=0)
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,Company,Location,Job_Title,Subspecialty,Total_Comp,Role,Salary Bins
0,Other,San Francisco,Senior Associate,Other,198000.0,Consultant,"(195000.0, 215000.0]"
1,Facebook,Menlo Park,L4,Product,177000.0,Consultant,"(175000.0, 195000.0]"
2,Accenture,San Francisco,Consultant,Other,120000.0,Consultant,"(104000.0, 136000.0]"
3,Salesforce,Atlanta,8,Other,178000.0,Consultant,"(175000.0, 195000.0]"
4,Oracle,San Francisco,IC-4,Consumer,82000.0,Consultant,"(9999.999, 104000.0]"


In [3]:
# Categorical predictor columns that get one-hot encoded below.
features = ['Company', 'Location', 'Job_Title', 'Subspecialty', 'Role']
# Target columns available in the data; only 'Total_Comp' is used as y in this cell.
targets = ['Salary Bins', 'Total_Comp']

# One-hot encode the predictors, dropping the first level of every feature.
# dtype=float pins the indicator dtype: the get_dummies default depends on the
# pandas version (uint8 in older releases, bool since 2.0), so being explicit
# hands the models the same numeric 0/1 matrix everywhere.
# The blank prefix only affects the intermediate column labels, which are
# discarded by the conversion to a plain array.
X = pd.get_dummies(
    df[features], prefix='', prefix_sep='', drop_first=True, dtype=float
).to_numpy()

# Regression target: the numeric total compensation column.
y = df['Total_Comp']

In [4]:
# Put 80% of the rows in the training split and the remainder in the test split;
# random_state is fixed so the shuffle (and therefore the split) is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

In [5]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print R2 and MAE for both splits.

    The model is fitted in place on ``(X_train, y_train)``. Its predictions are
    then scored against the true targets, first on the training data and then
    on the test data. Predictions are scored exactly as the model returns them
    (no inverse transform is applied). Nothing is returned; the four metrics
    are printed.
    """
    model.fit(X_train, y_train)

    # Each entry: (R2 line template, MAE line template, inputs, true targets).
    reports = (
        ('Train R2: {}', 'Train MAE: {}', X_train, y_train),
        ('Test R2: {}', 'Test MAE: {}', X_test, y_test),
    )
    for r2_line, mae_line, inputs, actual in reports:
        predicted = model.predict(inputs)
        print(r2_line.format(r2_score(actual, predicted)))
        print(mae_line.format(mean_absolute_error(actual, predicted)))

In [6]:
# Baseline model: linear regression with scikit-learn's default settings.
lm = LinearRegression()

In [7]:
# Fit the linear baseline and print its train/test R2 and MAE.
evaluate_model(lm, X_train, X_test, y_train, y_test)

Train R2: 0.6275235861834796
Train MAE: 41026.46574385154
Test R2: 0.6019997605914541
Test MAE: 42084.14447433854


In [8]:
# Second model: XGBoost regressor with the library's default hyperparameters.
xgb = XGBRegressor()

In [9]:
# Fit the XGBoost model on the same split and print its train/test R2 and MAE.
evaluate_model(xgb, X_train, X_test, y_train, y_test)

Train R2: 0.7066360896586934
Train MAE: 35970.16540936686
Test R2: 0.6384016456426381
Test MAE: 39725.81699001583
