In [1]:
import sklearn
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LogisticRegression, LinearRegression
import pandas as pd 
import numpy as np 
import matplotlib as plt
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score

In [2]:
# Loading data

data = pd.read_csv("Data/student-merge.csv")
data = pd.get_dummies(data, drop_first=True)
y = data["Walc"]
X = data.drop(["Dalc","Walc"],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.1)

In [3]:
feature_names = [atts for atts in X]
class_names = ["1","2","3","4","5"]

In [4]:
# Fitting several regression models to predict y

lm = LinearRegression()
lm_fit = lm.fit(X_train, y_train)

ridge = Ridge()
ridge_fit = ridge.fit(X_train,y_train)

lasso = Lasso(max_iter=1000)
lasso_fit = lasso.fit(X_train,y_train)

ENet = ElasticNet(l1_ratio=.01)
ENet_fit = ENet.fit(X_train,y_train)

preds = [[lm_fit, 'Linear Regression'], [ridge_fit, 'Ridge'], [lasso_fit, 'Lasso'], [ENet_fit, 'ElasticNet']]

# Comparing predicted y to the actual y and calculating the average

default_results = []

for p in preds:
    correct = 0
    print("===== {} =====".format(p[1]))
    print()
    for i in range(len(X_test)):
        a = p[0].predict([X_test.iloc[i]])
        msg = ""
        # The predicted values are rounded since they need to be either 1, 2, 3, 4 or 5
        if y_test.values[i] == np.around(a[0]):
            msg = "=====CORRECT====="
            correct += 1
        print("Actual= {} - Predict= {} {}".format(y_test.values[i],round(a[0],2), msg))
    print()
    print("{} had an accuracy of {}".format(p[1], correct/len(X_test)))
    default_results.append([p[1], correct/len(X_test)])
    print()
        


===== Linear Regression =====

Actual= 4 - Predict= 1.99 
Actual= 1 - Predict= 2.72 
Actual= 1 - Predict= 1.16 =====CORRECT=====
Actual= 1 - Predict= 2.34 
Actual= 2 - Predict= 2.56 
Actual= 2 - Predict= 1.69 =====CORRECT=====
Actual= 3 - Predict= 1.99 
Actual= 3 - Predict= 2.21 
Actual= 2 - Predict= 1.43 
Actual= 2 - Predict= 2.05 =====CORRECT=====
Actual= 1 - Predict= 2.36 
Actual= 4 - Predict= 3.58 =====CORRECT=====
Actual= 3 - Predict= 2.42 
Actual= 1 - Predict= 1.42 =====CORRECT=====
Actual= 2 - Predict= 2.21 =====CORRECT=====
Actual= 2 - Predict= 2.25 =====CORRECT=====
Actual= 1 - Predict= 2.19 
Actual= 4 - Predict= 3.08 
Actual= 2 - Predict= 2.92 
Actual= 2 - Predict= 2.05 =====CORRECT=====
Actual= 3 - Predict= 2.06 
Actual= 4 - Predict= 3.33 
Actual= 2 - Predict= 2.62 
Actual= 2 - Predict= 1.37 
Actual= 2 - Predict= 1.55 =====CORRECT=====
Actual= 1 - Predict= 1.01 =====CORRECT=====
Actual= 1 - Predict= 2.74 
Actual= 5 - Predict= 2.39 
Actual= 4 - Predict= 2.79 
Actual= 1 - Pred

In [5]:
# Finding optimal parameters for every method

# Linear Regression

grid_params = {
    'fit_intercept': ['True', 'False', 'optional']
}

gs_1 = GridSearchCV(LinearRegression(), grid_params)
gs_1 = gs_1.fit(X, y)
print("Best hyperparameter: fit_intercept: {}\n".format(gs_1.best_params_.get('fit_intercept')))

# Ridge

grid_params = {
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg']
}

gs_1 = GridSearchCV(Ridge(), grid_params)
gs_1 = gs_1.fit(X, y)
print("Best hyperparameter: solver: {}\n".format(gs_1.best_params_.get('solver')))

# Lasso

grid_params = {
    'normalize': [True, False, 'optional'] 
}

gs_1 = GridSearchCV(Lasso(), grid_params)
gs_1 = gs_1.fit(X, y)
print("Best hyperparameter: normalize: {}\n".format(gs_1.best_params_.get('normalize')))

# ElasticNet

grid_params = {
    'normalize': [True, False, 'optional'] 
}

gs_1 = GridSearchCV(ElasticNet(), grid_params)
gs_1 = gs_1.fit(X, y)
print("Best hyperparameter: normalize: {}\n".format(gs_1.best_params_.get('normalize')))


Best hyperparameter: fit_intercept: True

Best hyperparameter: solver: lsqr

Best hyperparameter: normalize: False

Best hyperparameter: normalize: False



In [6]:
# Predictions with optimal parameters

lm = LinearRegression()
lm_fit = lm.fit(X_train, y_train)

ridge = Ridge(solver='lsqr')
ridge_fit = ridge.fit(X_train,y_train)

lasso = Lasso(max_iter=1000, normalize=False)
lasso_fit = lasso.fit(X_train,y_train)

ENet = ElasticNet(l1_ratio=.01, normalize=False)
ENet_fit = ENet.fit(X_train,y_train)

preds = [[lm_fit, 'Linear Regression'], [ridge_fit, 'Ridge'], [lasso_fit, 'Lasso'], [ENet_fit, 'ElasticNet']]


# Comparing predicted y to the actual y and calculating the average

param_results  = []

for p in preds:
    correct = 0
    print("===== {} =====".format(p[1]))
    print()
    for i in range(len(X_test)):
        a = p[0].predict([X_test.iloc[i]])
        msg = ""
        # The predicted values are rounded since they need to be either 1, 2, 3, 4 or 5
        if y_test.values[i] == np.around(a[0]):
            msg = "=====CORRECT====="
            correct += 1
        print("Actual= {} - Predict= {} {}".format(y_test.values[i],round(a[0],2), msg))
    print()
    print("{} had an accuracy of {}".format(p[1], correct/len(X_test)))
    param_results.append([p[1], correct/len(X_test)])
    print()
    

===== Linear Regression =====

Actual= 4 - Predict= 1.99 
Actual= 1 - Predict= 2.72 
Actual= 1 - Predict= 1.16 =====CORRECT=====
Actual= 1 - Predict= 2.34 
Actual= 2 - Predict= 2.56 
Actual= 2 - Predict= 1.69 =====CORRECT=====
Actual= 3 - Predict= 1.99 
Actual= 3 - Predict= 2.21 
Actual= 2 - Predict= 1.43 
Actual= 2 - Predict= 2.05 =====CORRECT=====
Actual= 1 - Predict= 2.36 
Actual= 4 - Predict= 3.58 =====CORRECT=====
Actual= 3 - Predict= 2.42 
Actual= 1 - Predict= 1.42 =====CORRECT=====
Actual= 2 - Predict= 2.21 =====CORRECT=====
Actual= 2 - Predict= 2.25 =====CORRECT=====
Actual= 1 - Predict= 2.19 
Actual= 4 - Predict= 3.08 
Actual= 2 - Predict= 2.92 
Actual= 2 - Predict= 2.05 =====CORRECT=====
Actual= 3 - Predict= 2.06 
Actual= 4 - Predict= 3.33 
Actual= 2 - Predict= 2.62 
Actual= 2 - Predict= 1.37 
Actual= 2 - Predict= 1.55 =====CORRECT=====
Actual= 1 - Predict= 1.01 =====CORRECT=====
Actual= 1 - Predict= 2.74 
Actual= 5 - Predict= 2.39 
Actual= 4 - Predict= 2.79 
Actual= 1 - Pred

Actual= 3 - Predict= 2.48 
Actual= 4 - Predict= 2.12 
Actual= 4 - Predict= 3.04 
Actual= 1 - Predict= 2.13 
Actual= 1 - Predict= 2.02 
Actual= 3 - Predict= 2.71 =====CORRECT=====
Actual= 3 - Predict= 3.09 =====CORRECT=====
Actual= 1 - Predict= 1.91 
Actual= 2 - Predict= 2.26 =====CORRECT=====
Actual= 2 - Predict= 1.96 =====CORRECT=====
Actual= 1 - Predict= 2.47 
Actual= 2 - Predict= 2.48 =====CORRECT=====
Actual= 3 - Predict= 2.38 
Actual= 1 - Predict= 2.18 
Actual= 2 - Predict= 2.2 =====CORRECT=====
Actual= 1 - Predict= 2.58 
Actual= 2 - Predict= 2.27 =====CORRECT=====
Actual= 3 - Predict= 2.12 

ElasticNet had an accuracy of 0.2761904761904762



In [7]:
# Comparing results

for a, b in zip(default_results, param_results):
    print("Default parameters  -  " + str(a))
    print("Tuned parameters  -  " + str(b))
    print()


Default parameters  -  ['Linear Regression', 0.3142857142857143]
Tuned parameters  -  ['Linear Regression', 0.3142857142857143]

Default parameters  -  ['Ridge', 0.3142857142857143]
Tuned parameters  -  ['Ridge', 0.3333333333333333]

Default parameters  -  ['Lasso', 0.29523809523809524]
Tuned parameters  -  ['Lasso', 0.29523809523809524]

Default parameters  -  ['ElasticNet', 0.2761904761904762]
Tuned parameters  -  ['ElasticNet', 0.2761904761904762]



In [8]:
# Using only the ten most important features as explained in the EDA

best_features = ['Dalc', 'absences', 'goout','sex_M','sex_F','failures','schoolsup_yes','studytime','G1','G3']
drop = []

for f in feature_names:
    if f not in best_features:
        drop.append(f)
        
drop.append('Walc')

X = data.drop(drop,axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.1)

# Predicting labels again

lm = LinearRegression()
lm_fit = lm.fit(X_train, y_train)

ridge = Ridge(solver='lsqr')
ridge_fit = ridge.fit(X_train,y_train)

lasso = Lasso(max_iter=1000)
lasso_fit = lasso.fit(X_train,y_train)

ENet = ElasticNet(l1_ratio=.01)
ENet_fit = ENet.fit(X_train,y_train)

preds = [[lm_fit, 'Linear Regression'], [ridge_fit, 'Ridge'], [lasso_fit, 'Lasso'], [ENet_fit, 'ElasticNet']]


# Comparing predicted y to the actual y and calculating the average

best_features_results  = []

for p in preds:
    correct = 0
    print("===== {} =====".format(p[1]))
    print()
    for i in range(len(X_test)):
        a = p[0].predict([X_test.iloc[i]])
        msg = ""
        # The predicted values are rounded since they need to be either 1, 2, 3, 4 or 5
        if y_test.values[i] == np.around(a[0]):
            msg = "=====CORRECT====="
            correct += 1
        print("Actual= {} - Predict= {} {}".format(y_test.values[i],round(a[0],2), msg))
    print()
    print("{} had an accuracy of {}".format(p[1], correct/len(X_test)))
    best_features_results.append([p[1], correct/len(X_test)])
    print()
    

===== Linear Regression =====

Actual= 1 - Predict= 1.19 =====CORRECT=====
Actual= 2 - Predict= 1.41 
Actual= 2 - Predict= 2.8 
Actual= 2 - Predict= 1.54 =====CORRECT=====
Actual= 1 - Predict= 1.86 
Actual= 1 - Predict= 1.98 
Actual= 2 - Predict= 1.83 =====CORRECT=====
Actual= 4 - Predict= 2.75 
Actual= 1 - Predict= 0.83 =====CORRECT=====
Actual= 3 - Predict= 2.48 
Actual= 2 - Predict= 2.09 =====CORRECT=====
Actual= 2 - Predict= 2.04 =====CORRECT=====
Actual= 3 - Predict= 2.26 
Actual= 3 - Predict= 2.2 
Actual= 1 - Predict= 1.89 
Actual= 4 - Predict= 4.21 =====CORRECT=====
Actual= 4 - Predict= 2.16 
Actual= 4 - Predict= 3.86 =====CORRECT=====
Actual= 1 - Predict= 1.48 =====CORRECT=====
Actual= 1 - Predict= 1.55 
Actual= 2 - Predict= 2.58 
Actual= 4 - Predict= 2.98 
Actual= 3 - Predict= 2.15 
Actual= 2 - Predict= 2.3 =====CORRECT=====
Actual= 3 - Predict= 1.41 
Actual= 3 - Predict= 1.93 
Actual= 4 - Predict= 2.2 
Actual= 5 - Predict= 3.97 
Actual= 3 - Predict= 2.85 =====CORRECT=====
Act

Actual= 1 - Predict= 2.27 
Actual= 2 - Predict= 2.27 =====CORRECT=====
Actual= 2 - Predict= 2.27 =====CORRECT=====
Actual= 4 - Predict= 2.27 
Actual= 4 - Predict= 2.27 
Actual= 3 - Predict= 2.27 
Actual= 1 - Predict= 2.28 
Actual= 2 - Predict= 2.3 =====CORRECT=====
Actual= 1 - Predict= 2.27 
Actual= 3 - Predict= 2.27 
Actual= 3 - Predict= 2.3 
Actual= 5 - Predict= 2.27 
Actual= 5 - Predict= 2.31 
Actual= 3 - Predict= 2.28 
Actual= 1 - Predict= 2.28 
Actual= 3 - Predict= 2.28 
Actual= 1 - Predict= 2.27 
Actual= 3 - Predict= 2.31 
Actual= 1 - Predict= 2.27 
Actual= 2 - Predict= 2.3 =====CORRECT=====
Actual= 1 - Predict= 2.27 
Actual= 3 - Predict= 2.3 
Actual= 4 - Predict= 2.28 
Actual= 1 - Predict= 2.27 
Actual= 5 - Predict= 2.27 
Actual= 4 - Predict= 2.27 
Actual= 3 - Predict= 2.29 
Actual= 1 - Predict= 2.27 
Actual= 2 - Predict= 2.28 =====CORRECT=====
Actual= 1 - Predict= 2.3 
Actual= 1 - Predict= 2.27 
Actual= 1 - Predict= 2.28 
Actual= 4 - Predict= 2.31 
Actual= 2 - Predict= 2.28 ===

In [9]:
# Comparing results

for a, b, c in zip(default_results, param_results, best_features_results):
    print("Default parameters  -  " + str(a))
    print("Tuned parameters  -  " + str(b))
    print("Selected features  -  " + str(c))
    print()

# R2 scores
for p in preds:
    r2 = r2_score(y_test.values, p[0].predict(X_test))
    print("{} R2 score: {}".format(p[1], r2))


Default parameters  -  ['Linear Regression', 0.3142857142857143]
Tuned parameters  -  ['Linear Regression', 0.3142857142857143]
Selected features  -  ['Linear Regression', 0.3523809523809524]

Default parameters  -  ['Ridge', 0.3142857142857143]
Tuned parameters  -  ['Ridge', 0.3333333333333333]
Selected features  -  ['Ridge', 0.3523809523809524]

Default parameters  -  ['Lasso', 0.29523809523809524]
Tuned parameters  -  ['Lasso', 0.29523809523809524]
Selected features  -  ['Lasso', 0.21904761904761905]

Default parameters  -  ['ElasticNet', 0.2761904761904762]
Tuned parameters  -  ['ElasticNet', 0.2761904761904762]
Selected features  -  ['ElasticNet', 0.24761904761904763]

Linear Regression R2 score: 0.34039052813208215
Ridge R2 score: 0.339671119320317
Lasso R2 score: 0.00268589465407576
ElasticNet R2 score: 0.2672914264725539


In [10]:
# Creating a confusion matrix

pred_models = [[lm, 'Linear Regression'], [ridge, 'Ridge'], [lasso, 'Lasso'], [ENet, 'ElasticNet']]

def plot_heatmap(model):
    predict = model[0].predict(X_test)
    
    cm = confusion_matrix(y_test.values, np.around(predict))

    df_cm = pd.DataFrame(cm, index = [i for i in class_names],
                      columns = [i for i in class_names])

    confmat = sns.heatmap(df_cm, annot=True)
    
    confmat.set_title('Confusion matrix of {} model'.format(model[1]))
    
    
plot_heatmap(pred_models[0])

ValueError: Shape of passed values is (5, 5), indices imply (6, 6)

In [None]:
plot_heatmap(pred_models[1])

In [None]:
plot_heatmap(pred_models[2])

In [None]:
plot_heatmap(pred_models[3])