# Training Models

In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
import pandas as pd
import pickle
import numpy as np

In [2]:
# Load the pickled train/validation splits: feature frames (X_*) and targets (y_*).
# NOTE: read_pickle unpickles arbitrary Python objects, so these files must
# come from a trusted source.
X_train, X_val = (
    pd.read_pickle('dataframes/x-train.pkl'),
    pd.read_pickle('dataframes/x-val.pkl'),
)
y_train, y_val = (
    pd.read_pickle('dataframes/y-train.pkl'),
    pd.read_pickle('dataframes/y-val.pkl'),
)

In [8]:
# Baseline model: always predict the mean of the training targets, then
# report MSE, MAE and MAPE on the validation set.
mean = y_train.mean()
y_pred = np.full(len(y_val), mean)  # one constant prediction per validation row

print("MSE: ", mean_squared_error(y_val, y_pred))
print("MAE: ", mean_absolute_error(y_val, y_pred))
# Labelled MAPE, not MPE: the metric computed here is the mean *absolute*
# percentage error, matching the label used for the other models.
print("MAPE: ", mean_absolute_percentage_error(y_val, y_pred))

MSE:  50122494232.54211
MAE:  172921.71471694484
MPE:  0.3477339711282637


In [9]:
# # Calculate f_regression scores for each feature
# from sklearn.feature_selection import f_regression

# # Sort feature names by f_regression scores
# feature_names = X_train.columns
# scores = f_regression(X_train, y_train)[0]
# feature_scores = pd.DataFrame({"feature": feature_names, "score": scores})
# feature_scores.sort_values(by="score", ascending=False)


In [10]:
# Fit a plain linear regression on the training split and score it on the
# validation split.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression().fit(X_train, y_train)
y_val_pred = lin_reg.predict(X_val)

# Validation metrics: MSE, MAE and MAPE
print("MSE for linear regression model 1 =>", mean_squared_error(y_val, y_val_pred))
print("MAE for linear regression model 1 =>", mean_absolute_error(y_val, y_val_pred))
print("MAPE for linear regression model 1 =>", mean_absolute_percentage_error(y_val, y_val_pred))


MSE for linear regression model 1 => 8499885744.418515
MAE for linear regression model 1 => 69103.22634310293
MAPE for linear regression model 1 => 0.1345694935012263


In [11]:
# Train Ridge Regression for a range of regularisation strengths (alpha) and
# evaluate each fit on the validation set.
from sklearn.linear_model import Ridge

# A single loop replaces the copy-pasted stanzas: the printed label is built
# from the same `alpha` that is passed to the model, so the two cannot drift
# apart (e.g. the alpha=100000 fit is now reported as "alpha 100000").
# After the loop, `rr` and `y_val_pred` hold the model and predictions for
# the last alpha in the sequence.
for alpha in (1, 10, 100, 1000, 10000, 100000):
    rr = Ridge(alpha=alpha).fit(X_train, y_train)

    y_val_pred = rr.predict(X_val)
    print(f"MSE for ridge regression alpha {alpha} =>", mean_squared_error(y_val, y_val_pred))
    print(f"MAE for ridge regression alpha {alpha} =>", mean_absolute_error(y_val, y_val_pred))


MSE for ridge regression alpha 1 => 8499870041.984717
MAE for ridge regression alpha 1 => 69102.9129123165
MSE for ridge regression alpha 10 => 8500228827.740491
MAE for ridge regression alpha 10 => 69104.44644854168
MSE for ridge regression alpha 100 => 8513022182.256242
MAE for ridge regression alpha 100 => 69144.09874053
MSE for ridge regression alpha 1000 => 8650154765.658485
MAE for ridge regression alpha 1000 => 69434.42809272427
MSE for ridge regression alpha 10000 => 9363121887.674316
MAE for ridge regression alpha 10000 => 71598.28678661454
MSE for ridge regression alpha 10000 => 10539176137.759523
MAE for ridge regression alpha 10000 => 75718.30375667359


In [None]:
# Train a support vector regressor (linear kernel) and evaluate it on the
# validation set.
from sklearn.svm import SVR

svr = SVR(kernel="linear", C=1, epsilon=0.1).fit(X_train, y_train)
y_val_pred = svr.predict(X_val)

# Validation metrics: MSE and MAE
print("MSE for SVR =>", mean_squared_error(y_val, y_val_pred))
print("MAE for SVR =>", mean_absolute_error(y_val, y_val_pred))

In [3]:
# Train a Decision Tree Regressor (fixed random_state) and evaluate it on the
# validation set.
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)
y_val_pred = tree_reg.predict(X_val)

# Validation metrics: MSE, MAE and MAPE
print("MSE for decision tree regression model 1 =>", mean_squared_error(y_val, y_val_pred))
print("MAE for decision tree regression model 1 =>", mean_absolute_error(y_val, y_val_pred))
print("MAPE for decision tree regression model 1 =>", mean_absolute_percentage_error(y_val, y_val_pred))

# Depth of the fitted tree
print(tree_reg.get_depth())

MSE for decision tree regression model 1 => 3512852739.4432545
MAE for decision tree regression model 1 => 38416.6096755659
MAPE for decision tree regression model 1 => 0.06905041035848657
63


In [None]:
# train using k-nearest neighbors
# from sklearn.neighbors import KNeighborsRegressor

# knn_reg = KNeighborsRegressor(weights="distance", n_jobs=8).fit(X_train, y_train)

# y_val_pred = knn_reg.predict(X_val)
# print("MSE for k-nearest neighbors regression model 1 =>", mean_squared_error(y_val, y_val_pred))
# print("MAE for k-nearest neighbors regression model 1 =>", mean_absolute_error(y_val, y_val_pred))
# print("MAPE for k-nearest neighbors regression model 1 =>", mean_absolute_percentage_error(y_val, y_val_pred))

In [None]:
# # Train Linear support vector regressor and evaluate on validation set
# from sklearn.svm import LinearSVR

# svm_reg = LinearSVR(random_state=42).fit(X_train, y_train)

# y_val_pred = svm_reg.predict(X_val)
# print("MSE for linear support vector regression model 1 =>", mean_squared_error(y_val, y_val_pred))
# print("MAE for linear support vector regression model 1 =>", mean_absolute_error(y_val, y_val_pred))

In [13]:
# Train a Random Forest Regressor (100 trees, fixed random_state, 8 parallel
# jobs) and evaluate it on the validation set.
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=8)
forest_reg.fit(X_train, y_train)
y_val_pred = forest_reg.predict(X_val)

# Validation metrics: MSE, MAE and MAPE
print("MSE for random forest regression model 1 =>", mean_squared_error(y_val, y_val_pred))
print("MAE for random forest regression model 1 =>", mean_absolute_error(y_val, y_val_pred))
print("MAPE for random forest regression model 1 =>", mean_absolute_percentage_error(y_val, y_val_pred))

MSE for random forest regression model 1 => 2092233181.4044025
MAE for random forest regression model 1 => 30038.676611209292
MAPE for random forest regression model 1 => 0.05468338536735954


In [29]:
# Find the depth of the deepest tree in the fitted forest (0 if the forest
# has no estimators), then report it together with the number of trees.
max_depth = 0
for dt_reg in forest_reg.estimators_:
    # Keep a running maximum over the individual trees' depths
    max_depth = max(max_depth, dt_reg.get_depth())

print(max_depth)
print(len(forest_reg.estimators_))

65
100
