**Linear Regression**
1. Preprocess Test data and get predictions
2. Compute Mean Absolute Error and Mean Squared Error for test data
3. Implement Ridge and Lasso Regression and then compute the same metrics on test data


In [1]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
# Silence every warning category for the rest of the session so cell output stays compact.
# NOTE(review): this also hides pandas/scikit-learn deprecation and SettingWithCopy warnings —
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')
# Allow up to 101 columns to be shown when a wide DataFrame is displayed.
pd.set_option("display.max_columns", 101)

In [2]:
# Load Data
# The training CSV is read straight from GitHub, so this cell needs network access.
TRAIN_CSV_URL = "https://raw.githubusercontent.com/kartikjindgar/NYU-DataScience-Bootcamp-Fall23/main/Week6/train.csv"
data = pd.read_csv(TRAIN_CSV_URL)

In [3]:
# Drop id, timestamp and country columns
data = data.drop(columns=['id', 'timestamp', 'country'])

# Fill the gaps in the two numeric columns below with each column's own median
for col in ('hours_per_week', 'telecommute_days_per_week'):
    data[col] = data[col].fillna(data[col].median())

# Drop every row that still has a missing value in any remaining column
data = data.dropna()

In [4]:
# Create another copy of dataset and append encoded features to it
# (the cleaned `data` frame itself is left untouched)
data_train = data.copy()

# Yes/No columns: encoded as 0/1 below instead of being one-hot encoded
binary_cols = ['is_manager', 'certifications']

# Select categorical features: every other object-dtype column
cat_cols = [c for c in data_train.columns
            if data_train[c].dtype == 'object' and c not in binary_cols]
cat_data = data_train[cat_cols]
print("Categorical Features: ", cat_cols)

# Encoding binary variables
# Map both labels in one pass and cast explicitly to int64 rather than relying on
# replace()'s implicit object -> int downcast. astype() raises if a column holds
# anything other than 'Yes'/'No', so an unexpected label cannot reach the model.
yes_no_codes = {'Yes': 1, 'No': 0}
for c in binary_cols:
    data_train[c] = data_train[c].map(yes_no_codes).astype('int64')

Categorical Features:  ['employment_status', 'job_title', 'education', 'is_education_computer_related']


In [5]:
# Final Data with One-Hot Encoded Variables
# drop_first=True omits the first level of each categorical column from the dummies
final_data = pd.get_dummies(
    data=data_train,
    columns=cat_cols,
    drop_first=True,
)
print("Final Data Shape: ", final_data.shape)
print("Final Data Columns:", final_data.columns)

Final Data Shape:  (4261, 25)
Final Data Columns: Index(['job_years', 'is_manager', 'hours_per_week',
       'telecommute_days_per_week', 'certifications', 'salary',
       'employment_status_Independent or freelancer or company owner',
       'employment_status_Part time', 'job_title_Analytics consultant',
       'job_title_Architect', 'job_title_DBA', 'job_title_Data Scientist',
       'job_title_Developer', 'job_title_Engineer', 'job_title_Manager',
       'job_title_Other', 'job_title_Principal database engineer',
       'job_title_Sales', 'job_title_Sr Consultant ',
       'education_Bachelors (4 years)', 'education_Doctorate/PhD',
       'education_Masters', 'education_None (no degree completed)',
       'is_education_computer_related_Unknown',
       'is_education_computer_related_Yes'],
      dtype='object')


In [6]:
# Train-Test Split
# Target is the salary column; every other column is used as a feature
y = final_data['salary']
X = final_data.drop(columns=['salary'])

# Hold out 30% of the rows. random_state pins the shuffle so the split, and every
# metric computed from it, is the same after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("Training Set Dimensions:", X_train.shape)
print("Validation Set Dimensions:", X_test.shape)

Training Set Dimensions: (2982, 24)
Validation Set Dimensions: (1279, 24)


In [7]:
# Normalizing Data
num_cols = ['job_years','hours_per_week','telecommute_days_per_week']
scaler = StandardScaler()

# Always start from the untouched rows in `X`, so re-running this cell never
# fits the scaler on values that were already standardized.
train_raw = X.loc[X_train.index, num_cols]

# Take an explicit copy before assigning: X_train comes out of train_test_split,
# and writing into it directly can trigger pandas' SettingWithCopy warning.
X_train = X_train.copy()
X_train[num_cols] = scaler.fit_transform(train_raw)

In [8]:
# Train Model
# fit() returns the estimator itself, so construction and fitting can be chained
reg = LinearRegression().fit(X_train, y_train)
print("Coefficients: ", reg.coef_)
print("Intercept: ", reg.intercept_)

Coefficients:  [ 1.78395235e+02  1.08975106e+02  1.97275890e+02  2.38030143e+02
  1.04469018e+02 -6.17922949e+01  2.36108714e+02 -2.27373675e-13
  1.11780827e+03  8.67345694e+02  7.15364879e+02  5.67930350e+02
  1.02562094e+03  7.68838709e+02  1.83268985e+02  9.99707631e+02
  2.27373675e-13 -3.97903932e-13  3.63989653e+02 -9.04688319e+00
  2.35696529e+02  1.76653882e+02 -3.06772494e+02 -1.98616256e+02]
Intercept:  6097.408758832414


In [9]:
# Get Training Data Predictions
ytr_pred = reg.predict(X_train)

# Training Data Mean Absolute Error and Root Mean Square Error
# mean_squared_error(...) ** 0.5 is the square root of the MSE, so the value is labelled RMSE
print("Training Mean Absolute Error: ", mean_absolute_error(y_train, ytr_pred))
print("Training Root Mean Square Error: ", mean_squared_error(y_train, ytr_pred)**0.5)

Training Mean Absolute Error:  864.6719438968619
Training Mean Square Error:  1219.2769814672836


In [10]:
# Q1: Preprocess Test data and get predictions
# The scaler is only applied here, never refit, so the test rows are scaled with
# whatever statistics it was fitted with earlier.
# The raw rows are re-read from `X`, so re-running this cell cannot standardize
# values that were already scaled.
X_test = X_test.copy()
X_test[num_cols] = scaler.transform(X.loc[X_test.index, num_cols])
yts_pred = reg.predict(X_test)

In [11]:
# Q2: Compute Mean Absolute Error, Mean Square error for test data
# mean_squared_error(...) ** 0.5 is the square root of the MSE, so the value is labelled RMSE
print("Test Mean Absolute Error: ", mean_absolute_error(y_test, yts_pred))
print("Test Root Mean Square Error: ", mean_squared_error(y_test, yts_pred)**0.5)

Test Mean Absolute Error:  856.9604466694497
Test Mean Square Error:  1211.1102808325873


In [12]:
# Q3: Implement Ridge and Lasso Regression and then compute the following metrics on test data
def report_test_errors(model_name, y_true, y_pred):
    """Print the mean absolute error and root mean square error of `y_pred`.

    Parameters
    ----------
    model_name : str
        Prefix used in the printed labels (e.g. "Ridge").
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with `y_true`.
    """
    print(f"{model_name} Mean Absolute Error: ", mean_absolute_error(y_true, y_pred))
    # Square root of the MSE, hence labelled RMSE
    print(f"{model_name} Root Mean Square Error: ", mean_squared_error(y_true, y_pred) ** 0.5)


# L2-regularized linear model
ridge = Ridge(alpha=1)
ridge.fit(X_train, y_train)
ridge_y_pred = ridge.predict(X_test)
report_test_errors("Ridge", y_test, ridge_y_pred)

# L1-regularized linear model
lasso = Lasso(alpha=1)
lasso.fit(X_train, y_train)
lasso_y_pred = lasso.predict(X_test)
report_test_errors("Lasso", y_test, lasso_y_pred)

Ridge Mean Absolute Error:  856.9764424020428
Ridge Mean Square Error:  1210.9927325240408
Lasso Mean Absolute Error:  856.895232061653
Lasso Mean Square Error:  1210.6743101138686


**Trees**
1. Compute errors on test sets
2. Play with different parameters of decision trees and random forests and see the impact on train and test error
3. [OPTIONAL] implement cross validation and get best hyperparameters

In [13]:
# Train Decision Tree regression model
# random_state pins the tie-breaking between equally good splits, so the fitted
# tree (and its errors) are the same on every run.
decisiontree = DecisionTreeRegressor(max_depth = 10, min_samples_split = 5, random_state = 42)
decisiontree.fit(X_train, y_train)

# Training Data Predictions
tree_ytr_pred = decisiontree.predict(X_train)

# Evaluating train error
print("Mean Absolute Error: ", mean_absolute_error(y_train, tree_ytr_pred))

Mean Absolute Error:  632.1755837701602


In [14]:
# Q1: Compute errors on test sets
# Score the already-fitted tree on the held-out rows
tree_yts_pred = decisiontree.predict(X_test)
tree_test_mae = mean_absolute_error(y_test, tree_yts_pred)
print("Mean Absolute Error: ", tree_test_mae)

Mean Absolute Error:  898.4274959096119


In [15]:
# Q2: Play with different parameter of decision trees and random forests and see the impact on train and test error
def fit_and_report_tree(max_depth, min_samples_split):
    """Fit a DecisionTreeRegressor on the training split and print its train/test MAE.

    Parameters
    ----------
    max_depth : int
        Passed to DecisionTreeRegressor as `max_depth`.
    min_samples_split : int
        Passed to DecisionTreeRegressor as `min_samples_split`.

    Returns
    -------
    tuple
        (fitted tree, predictions on X_train, predictions on X_test)
    """
    # random_state keeps each fit repeatable, so differences between runs come
    # from the hyperparameters rather than from random tie-breaking.
    tree = DecisionTreeRegressor(max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 random_state=42)
    tree.fit(X_train, y_train)
    train_pred = tree.predict(X_train)
    test_pred = tree.predict(X_test)
    print(f"Max Depth: {max_depth}, Min Samples Split: {min_samples_split}")
    print("Training Mean Absolute Error: ", mean_absolute_error(y_train, train_pred))
    print("Test Mean Absolute Error: ", mean_absolute_error(y_test, test_pred))
    print("")
    return tree, train_pred, test_pred


# Each call overwrites tree_ytr_pred / tree_yts_pred, so after this cell they hold
# the predictions of the last configuration (max_depth=12, min_samples_split=6).
decisiontree2, tree_ytr_pred, tree_yts_pred = fit_and_report_tree(max_depth=8, min_samples_split=5)
decisiontree3, tree_ytr_pred, tree_yts_pred = fit_and_report_tree(max_depth=6, min_samples_split=3)
decisiontree4, tree_ytr_pred, tree_yts_pred = fit_and_report_tree(max_depth=4, min_samples_split=2)
decisiontree5, tree_ytr_pred, tree_yts_pred = fit_and_report_tree(max_depth=12, min_samples_split=6)

Max Depth: 8, Min Samples Split: 5
Training Mean Absolute Error:  730.9959768352635
Test Mean Absolute Error:  876.6807674097206

Max Depth: 6, Min Samples Split: 3
Training Mean Absolute Error:  808.3148461189426
Test Mean Absolute Error:  870.2085705214363

Max Depth: 4, Min Samples Split: 2
Training Mean Absolute Error:  856.9779536901425
Test Mean Absolute Error:  865.5802942428617

Max Depth: 12, Min Samples Split: 6
Training Mean Absolute Error:  552.6279156683639
Test Mean Absolute Error:  895.7583895033226

