# Lecture 10 (c)
In this example, we will train and test **linear regression models** (ordinary least squares, ridge, LASSO, and elastic net) to predict tip amount from multiple predictor variables, evaluating them with the holdout method and with cross-validation.

In [None]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [None]:
# Fetch the "tips" example dataset through seaborn, then preview its leading rows
data = sns.load_dataset("tips")
data.head(5)

In [None]:
# Partition dataset into training, validation, and test sets using the holdout method.
# Two successive seeded 80/20 splits: the first sets the test set aside, the second
# divides the remaining rows into training and validation sets
# (roughly 64% / 16% / 20% of the rows overall).
predictors = ['total_bill', 'size', 'sex', 'smoker', 'day', 'time']
X_rest, X_test, Y_rest, Y_test = train_test_split(
    data[predictors],
    data['tip'],
    test_size = 0.2,
    random_state = 1,
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_rest,
    Y_rest,
    test_size = 0.2,
    random_state = 1,
)

In [None]:
# Explore training set
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare; wrapping it in print() also emits a stray "None" line.
X_train.info()

In [None]:
# Explore validation set
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare; wrapping it in print() also emits a stray "None" line.
X_val.info()

In [None]:
# Explore test set
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare; wrapping it in print() also emits a stray "None" line.
X_test.info()

In [None]:
# Evaluate model with two predictors:
# fit a linear regression on the training set, then score it on every partition
two_predictors = ['total_bill', 'size']
model = linear_model.LinearRegression()
model.fit(X = X_train[two_predictors], y = Y_train)
# R squared on the training, validation, and test sets (in that order)
score_train, score_val, score_test = (
    model.score(X = features[two_predictors], y = target)
    for features, target in [(X_train, Y_train), (X_val, Y_val), (X_test, Y_test)]
)
print([score_train, score_val, score_test])

In [None]:
# Evaluate model with all predictors
# Dummy-encode the training set (dropping the first level of each categorical
# predictor), then encode the validation and test sets against the SAME columns.
# Encoding each partition independently with drop_first = True only lines up when
# every partition exposes identical category levels; aligning to the training
# columns keeps the three feature matrices consistent no matter which levels
# happen to land in each partition. Levels absent from a partition become
# all-zero columns; the training baseline levels are dropped by the reindex.
X_train_dummy = pd.get_dummies(X_train, drop_first = True)
X_val_dummy = pd.get_dummies(X_val).reindex(columns = X_train_dummy.columns, fill_value = 0)
X_test_dummy = pd.get_dummies(X_test).reindex(columns = X_train_dummy.columns, fill_value = 0)
model = linear_model.LinearRegression().fit(X = X_train_dummy, y = Y_train)
score_train = model.score(X = X_train_dummy, y = Y_train) # R squared (training)
score_val = model.score(X = X_val_dummy, y = Y_val) # R squared (validation)
score_test = model.score(X = X_test_dummy, y = Y_test) # R squared (test)
print([score_train, score_val, score_test])

In [None]:
# Evaluate ridge regression model (alpha = 1) on the dummy-encoded partitions
model = linear_model.Ridge(alpha = 1)
model.fit(X = X_train_dummy, y = Y_train)
# Fitted coefficients first, then the intercept
print(model.coef_)
print(model.intercept_)
# R squared on the training, validation, and test sets (in that order)
partitions = [(X_train_dummy, Y_train), (X_val_dummy, Y_val), (X_test_dummy, Y_test)]
score_train, score_val, score_test = [
    model.score(X = X_part, y = Y_part) for X_part, Y_part in partitions
]
print([score_train, score_val, score_test])

In [None]:
# Evaluate LASSO regression model (alpha = 1) on the dummy-encoded partitions
model = linear_model.Lasso(alpha = 1)
model.fit(X = X_train_dummy, y = Y_train)
# Fitted coefficients first, then the intercept
print(model.coef_)
print(model.intercept_)
# R squared on the training, validation, and test sets (in that order)
partitions = [(X_train_dummy, Y_train), (X_val_dummy, Y_val), (X_test_dummy, Y_test)]
score_train, score_val, score_test = [
    model.score(X = X_part, y = Y_part) for X_part, Y_part in partitions
]
print([score_train, score_val, score_test])

In [None]:
# Evaluate elastic net model (alpha = 1, l1_ratio = 0.5) on the dummy-encoded partitions
model = linear_model.ElasticNet(alpha = 1, l1_ratio = 0.5)
model.fit(X = X_train_dummy, y = Y_train)
# Fitted coefficients first, then the intercept
print(model.coef_)
print(model.intercept_)
# R squared on the training, validation, and test sets (in that order)
partitions = [(X_train_dummy, Y_train), (X_val_dummy, Y_val), (X_test_dummy, Y_test)]
score_train, score_val, score_test = [
    model.score(X = X_part, y = Y_part) for X_part, Y_part in partitions
]
print([score_train, score_val, score_test])

In [None]:
# Question 01: Using the holdout method, which model has the best performance?

In [None]:
# Partition dataset into training and test sets using 5-fold cross-validation
# NOTE(review): shuffle = False builds the folds from consecutive rows in their stored
# order — confirm the rows are not sorted in a way that makes the folds unrepresentative.
folds = KFold(n_splits = 5, shuffle = False)
# Show how many rows land in the training and test portion of each fold
for train_index, test_index in folds.split(data):
    fold_sizes = [len(train_index), len(test_index)]
    print(fold_sizes)

In [None]:
# Evaluate model with two predictors, cross-validated over the folds defined earlier
model = linear_model.LinearRegression()
scores = cross_val_score(
    model,
    X = data[['total_bill', 'size']],
    y = data['tip'],
    cv = folds,
)
# Per-fold scores, followed by their mean and standard deviation
print(scores)
score_mean, score_std = scores.mean(), scores.std()
print([score_mean, score_std])

In [None]:
# Evaluate model with all predictors, cross-validated over the folds defined earlier.
# The full predictor table is dummy-encoded once (first level of each categorical
# predictor dropped) and kept in data_dummy.
all_predictors = ['total_bill', 'size', 'sex', 'smoker', 'day', 'time']
data_dummy = pd.get_dummies(data[all_predictors], drop_first = True)
model = linear_model.LinearRegression()
scores = cross_val_score(
    model,
    X = data_dummy,
    y = data['tip'],
    cv = folds,
)
# Per-fold scores, followed by their mean and standard deviation
print(scores)
score_mean, score_std = scores.mean(), scores.std()
print([score_mean, score_std])

In [None]:
# Evaluate ridge regression model (alpha = 1), cross-validated on the dummy-encoded data
model = linear_model.Ridge(alpha = 1)
scores = cross_val_score(
    model,
    X = data_dummy,
    y = data['tip'],
    cv = folds,
)
# Per-fold scores, followed by their mean and standard deviation
print(scores)
score_mean, score_std = scores.mean(), scores.std()
print([score_mean, score_std])

In [None]:
# Evaluate LASSO regression model (alpha = 1), cross-validated on the dummy-encoded data
model = linear_model.Lasso(alpha = 1)
scores = cross_val_score(
    model,
    X = data_dummy,
    y = data['tip'],
    cv = folds,
)
# Per-fold scores, followed by their mean and standard deviation
print(scores)
score_mean, score_std = scores.mean(), scores.std()
print([score_mean, score_std])

In [None]:
# Evaluate elastic net model (alpha = 1, l1_ratio = 0.5), cross-validated on the dummy-encoded data
model = linear_model.ElasticNet(alpha = 1, l1_ratio = 0.5)
scores = cross_val_score(
    model,
    X = data_dummy,
    y = data['tip'],
    cv = folds,
)
# Per-fold scores, followed by their mean and standard deviation
print(scores)
score_mean, score_std = scores.mean(), scores.std()
print([score_mean, score_std])

In [None]:
# Question 02: Using the cross-validation method, which model has the best performance?