In [1]:
import numpy as np
import pandas as pd

In [2]:
TARGET = 'totalyearlycompensation'

train = pd.read_csv("train.csv")
y_train = train[TARGET]

# For testing purposes only: keep just the int/float feature columns, then
# discard the first column of the CSV (presumably a saved row index -- confirm).
numeric_features = train.drop(columns=[TARGET]).select_dtypes(include=['int', 'float'])
X_train = numeric_features.drop(columns=[train.columns[0]])
X_train

Unnamed: 0,yearsofexperience,yearsatcompany,stockgrantvalue,bonus,cityid,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic
0,2.0,0.0,0.0,20000.0,7839,1,0,0,0,0,0,0,0,0,1
1,15.0,4.0,0.0,23000.0,8909,0,0,0,0,0,0,0,0,0,0
2,2.0,0.0,29000.0,23000.0,10182,0,1,0,0,0,0,1,0,0,0
3,1.0,1.0,65000.0,30000.0,7392,1,0,0,0,0,1,0,0,0,0
4,9.0,0.0,12000.0,15000.0,8816,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50108,12.0,1.0,10000.0,0.0,7419,0,0,0,0,0,0,0,0,0,0
50109,10.0,0.0,39000.0,29000.0,40303,0,0,0,0,0,0,0,0,0,0
50110,6.0,0.0,91000.0,28000.0,7419,0,0,0,0,0,0,0,0,0,0
50111,2.0,2.0,75000.0,0.0,7351,0,0,0,0,0,0,0,0,0,0


# 2. Modeling

## 2.1 Validation

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
# NOTE: this cell rebinds X_train / y_train to the 80% training part. Re-run the
# loading cell before re-running this one, otherwise the reduced set is split again.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts
X_val

Unnamed: 0,yearsofexperience,yearsatcompany,stockgrantvalue,bonus,cityid,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic
41851,11.0,11.0,225000.0,51000.0,11521,0,0,0,0,0,0,0,0,0,0
32219,0.0,0.0,22000.0,15000.0,11470,0,1,0,0,0,1,0,0,0,0
45305,5.0,0.0,85000.0,25000.0,7322,0,1,0,0,0,0,1,0,0,0
25924,6.0,6.0,20000.0,20000.0,7419,0,0,0,0,0,0,0,0,0,0
7390,10.0,1.0,0.0,0.0,6583,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27505,25.0,3.0,0.0,0.0,7472,0,0,0,0,0,0,0,0,0,0
33624,5.0,1.0,20000.0,14000.0,10646,0,0,0,0,0,0,0,0,0,0
26070,1.0,1.0,0.0,6000.0,11204,0,0,0,0,0,0,0,0,0,0
10758,15.0,0.0,29000.0,20000.0,12008,0,0,0,0,0,0,0,0,0,0


## 2.2 Accuracy & Error

In [4]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Both arguments are passed unchanged to ``mean_squared_error``; the result
    is the square root of the value it returns.
    """
    mse = mean_squared_error(y_true, y_pred)
    return mse ** 0.5

## 2.3 Different Models

### 2.3.1 Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline, fitted on the training split.
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression()

In [6]:
# Training-set RMSE of the linear model.
lr_train_pred = lr.predict(X_train)
rmse(y_train, lr_train_pred)    # Least-squares fit: the lowest training RMSE a linear model can reach on the X_train features

74637.41605606221

In [7]:
# Validation RMSE of the linear model on the held-out split.
lr_val_pred = lr.predict(X_val)
rmse(y_val, lr_val_pred)    # Held-out RMSE of the model fitted on X_train; it is not minimized by the fit, so it estimates generalization error

78331.596485081

### 2.3.2 Random Forests

In [8]:
from sklearn.ensemble import RandomForestRegressor
# Random forests are stochastic (bootstrap samples and random feature subsets).
# Fix the seed -- the same value used for the train/validation split -- so the
# RMSE figures and test predictions below are reproducible after a kernel restart.
rf = RandomForestRegressor(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)

RandomForestRegressor(n_estimators=50)

In [9]:
# Training-set RMSE of the random forest (compare with its validation RMSE in the next cell).
rf_train_pred = rf.predict(X_train)
rmse(y_train, rf_train_pred)

24803.627090245885

In [10]:
rf_val_pred = rf.predict(X_val)
rmse(y_val, rf_val_pred)    # Looks like a case of overfitting since training error << validation error
                            # Tune parameters, such as max_depth or n_estimators, to reduce overfitting

55001.68257364536

### 2.3.3 Support Vector Regression

In [34]:
from sklearn.svm import SVR
# Linear-kernel support vector regressor; C=0.001 is a small value, i.e. strong regularization.
# NOTE(review): the features are passed to the SVR unscaled (no scaler appears in this
# notebook) -- SVR is usually sensitive to feature scale; confirm whether scaling is wanted.
svr = SVR(C=0.001, kernel='linear')

In [35]:
SVR_FIT_ROWS = 10000    # Limiting to 10000 samples b/c SVR takes a while
# Positional slice: the first SVR_FIT_ROWS rows of the (already shuffled) training split.
svr.fit(X_train.iloc[:SVR_FIT_ROWS], y_train.iloc[:SVR_FIT_ROWS])

SVR(C=0.001, kernel='linear')

In [38]:
# RMSE over the whole training split. Only the first 10000 rows were used to fit
# `svr`, so this figure mixes rows the model was fitted on with rows it never saw.
svr_train_pred = svr.predict(X_train)
rmse(y_train, svr_train_pred)

78540.53106286435

In [39]:
# Validation RMSE of the SVR on the held-out split.
svr_val_pred = svr.predict(X_val)
rmse(y_val, svr_val_pred)

83998.36471966408

### 2.3.4 Ridge Regression

In [None]:
# Write your code here

## 2.4 Choosing the Best Model

In [42]:
test_raw = pd.read_csv("test.csv")
# Select exactly the columns the models were fitted on, in the same order,
# instead of re-deriving them from dtypes and `train.columns[0]`. The test
# features then always match the training features, and a column missing from
# test.csv fails loudly with a KeyError here rather than at predict time.
X_test = test_raw[X_train.columns]
X_test

Unnamed: 0,yearsofexperience,yearsatcompany,stockgrantvalue,bonus,cityid,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic
0,4.0,0.0,6000.0,15000.0,6580,0,0,0,0,0,0,0,0,0,0
1,10.0,1.0,74000.0,16000.0,8198,0,0,0,0,0,0,0,0,0,0
2,10.0,5.0,40000.0,0.0,1311,0,0,0,0,0,0,0,0,0,0
3,2.0,2.0,0.0,1000.0,9592,0,1,0,0,0,1,0,0,0,0
4,5.0,1.0,15000.0,15000.0,7422,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12524,8.0,3.0,165000.0,0.0,7472,0,0,0,0,0,0,0,0,0,0
12525,17.0,10.0,320000.0,0.0,11527,1,0,0,0,0,1,0,0,0,0
12526,11.0,0.0,30000.0,30000.0,7434,1,0,0,0,0,1,0,0,0,0
12527,10.0,1.0,75000.0,30000.0,11420,0,1,0,0,0,0,1,0,0,0


In [43]:
# Predict the test-set target with the random forest, which has the lowest
# validation RMSE of the models fitted above in the recorded outputs.
y_test_pred = rf.predict(X_test)
y_test_pred

array([112200., 254840., 154020., ..., 231360., 295720., 204090.])