# Model 1 (predict time) 

## Data preprocessing 

In [138]:
import pandas as pd
import numpy as np

# Load dataframe
df = pd.read_csv("student-mat.csv", sep=';')

print(df.shape)
df.head()

(395, 33)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10


In [139]:
# Drop irrelevant columns 
cols = ['school', 'sex', 'age', 'address', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
        'paid', 'nursery', 'internet', 'famrel', 'Dalc', 'Walc', 'goout','schoolsup', 'famsup', 'higher']

df = df.drop(cols, axis=1)
df.tail()

Unnamed: 0,famsize,Pstatus,traveltime,studytime,failures,activities,romantic,freetime,health,absences,G1,G2,G3
390,LE3,A,1,2,2,no,no,5,4,11,9,9,9
391,LE3,T,2,1,0,no,no,4,2,3,14,16,16
392,GT3,T,1,1,3,no,no,5,3,3,10,8,7
393,LE3,T,3,1,0,no,no,4,5,0,11,12,10
394,LE3,T,1,1,0,no,no,2,5,5,8,9,9


In [140]:
# Check for null values
df.isnull().sum()

famsize       0
Pstatus       0
traveltime    0
studytime     0
failures      0
activities    0
romantic      0
freetime      0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   famsize     395 non-null    object
 1   Pstatus     395 non-null    object
 2   traveltime  395 non-null    int64 
 3   studytime   395 non-null    int64 
 4   failures    395 non-null    int64 
 5   activities  395 non-null    object
 6   romantic    395 non-null    object
 7   freetime    395 non-null    int64 
 8   health      395 non-null    int64 
 9   absences    395 non-null    int64 
 10  G1          395 non-null    int64 
 11  G2          395 non-null    int64 
 12  G3          395 non-null    int64 
dtypes: int64(9), object(4)
memory usage: 40.2+ KB


In [142]:
# Handle categorical values 
from sklearn.preprocessing import OneHotEncoder

binary_features = ['famsize', 'Pstatus', 'activities', 'romantic']

encoder = OneHotEncoder()
encoded_binary_features = encoder.fit_transform(df[binary_features])

encoded_binary_features.toarray()

array([[1., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [143]:
cat_cols = encoder.get_feature_names(binary_features)
cat_cols

array(['famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T',
       'activities_no', 'activities_yes', 'romantic_no', 'romantic_yes'],
      dtype=object)

In [144]:
cat_df = pd.DataFrame(encoded_binary_features.toarray(), columns=cat_cols)
cat_df.head()

Unnamed: 0,famsize_GT3,famsize_LE3,Pstatus_A,Pstatus_T,activities_no,activities_yes,romantic_no,romantic_yes
0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [145]:
# Drop features
"""
Greater than 3? 
Live with your parents? 
Extra school support? 
Extra family support? 
Extra curricular activities? 
Want to take higher education? 
In a relationship? 
"""
additionalCols = ['famsize_LE3', 'Pstatus_A', 'activities_no', 'romantic_no']
cat_df.drop(additionalCols, axis=1, inplace=True)
cat_df.head()

Unnamed: 0,famsize_GT3,Pstatus_T,activities_yes,romantic_yes
0,1.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,1.0,1.0,1.0,1.0
4,1.0,1.0,0.0,0.0


In [146]:
# Merge two dataframes 
df.drop(binary_features, axis=1, inplace=True)
df.drop(['G1', 'G2'], axis=1, inplace=True)

students_df = df.join(cat_df)
students_df.head()

Unnamed: 0,traveltime,studytime,failures,freetime,health,absences,G3,famsize_GT3,Pstatus_T,activities_yes,romantic_yes
0,2,2,0,3,3,6,6,1.0,0.0,0.0,0.0
1,1,2,0,3,3,4,6,1.0,1.0,0.0,0.0
2,1,2,3,3,3,10,10,0.0,1.0,0.0,0.0
3,1,3,0,2,5,2,15,1.0,1.0,1.0,1.0
4,1,2,0,3,5,4,10,1.0,1.0,0.0,0.0


In [147]:
students_df['scores'] = (students_df['G3'] / 2)
students_df.drop(['G3'], axis=1, inplace=True)
students_df

Unnamed: 0,traveltime,studytime,failures,freetime,health,absences,famsize_GT3,Pstatus_T,activities_yes,romantic_yes,scores
0,2,2,0,3,3,6,1.0,0.0,0.0,0.0,3.0
1,1,2,0,3,3,4,1.0,1.0,0.0,0.0,3.0
2,1,2,3,3,3,10,0.0,1.0,0.0,0.0,5.0
3,1,3,0,2,5,2,1.0,1.0,1.0,1.0,7.5
4,1,2,0,3,5,4,1.0,1.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...
390,1,2,2,5,4,11,0.0,0.0,0.0,0.0,4.5
391,2,1,0,4,2,3,0.0,1.0,0.0,0.0,8.0
392,1,1,3,5,3,3,1.0,1.0,0.0,0.0,3.5
393,3,1,0,4,5,0,0.0,1.0,0.0,0.0,5.0


In [148]:
# Target features
target_features = ['studytime', 'freetime']

In [149]:
# Feature correlations
corr_matrix = students_df.corr()

for target in target_features: 
  print("Correlation with {}".format(target))
  print(corr_matrix[target].sort_values(ascending=False))
  print('\n\n')

Correlation with studytime
studytime         1.000000
scores            0.097820
activities_yes    0.089877
famsize_GT3       0.073595
romantic_yes      0.053285
Pstatus_T         0.024294
absences         -0.062700
health           -0.075616
traveltime       -0.100909
freetime         -0.143198
failures         -0.173563
Name: studytime, dtype: float64



Correlation with freetime
freetime          1.000000
failures          0.091987
activities_yes    0.089728
health            0.075733
Pstatus_T         0.038717
scores            0.011307
romantic_yes     -0.011182
traveltime       -0.017025
famsize_GT3      -0.017695
absences         -0.058078
studytime        -0.143198
Name: freetime, dtype: float64





In [150]:
from sklearn.metrics import mean_squared_error

def display_scores(scores): 
  """
  Prints Scores array, it's mean value and standard deviation 

  :param socres: Scores array returned by cross validation 
  :type scores: list
  """
  print("Scores: ", scores)
  print("Mean: ", scores.mean())
  print("STD: ", scores.std())

def root_mean_squared_error(data, prediction): 
  """
  Calculates root mean squared error of a prediction set

  :param data: Actual dataset
  :type data: numpy.ndArray

  :param prediction: Predicted values
  :type prediction: numpy.ndArray 

  :returs: Root mean squared errors for each instance
  :rtype: numpy.ndArray
  """
  return np.sqrt(mean_squared_error(data, prediction))

## Model selection 

### Linear regression 

In [151]:
# Train test split 
from sklearn.model_selection import train_test_split

X = students_df.drop(target_features, axis=1)
y = students_df[target_features]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [152]:
y_test.describe()

Unnamed: 0,studytime,freetime
count,99.0,99.0
mean,2.0,3.242424
std,0.892143,1.000927
min,1.0,1.0
25%,1.0,3.0
50%,2.0,3.0
75%,2.0,4.0
max,4.0,5.0


In [153]:
from sklearn.linear_model import LinearRegression 

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Make predictions 
lin_reg_preds = lin_reg.predict(X_test)
# Calculate RMSE 
lin_reg_preds_mse = root_mean_squared_error(y_test, lin_reg_preds)

print("RMSE: ")
lin_reg_preds_mse

RMSE: 


0.942479474770604

### Random Forest regressor

In [154]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': np.arange(50, 150, 10), 
    'max_features': [2, 4, 6, 8], 
    'max_depth': np.arange(1, 11, 1)
}

forest_reg = RandomForestRegressor() 

grid_search = GridSearchCV(
    forest_reg, param_grid, cv = 5, 
    scoring = 'neg_mean_squared_error'
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [155]:
grid_search.best_params_ 

{'max_depth': 1, 'max_features': 8, 'n_estimators': 80}

In [156]:
best_estimator = grid_search.best_estimator_

random_forest_predictions = best_estimator.predict(X_test)
print(root_mean_squared_error(y_test, random_forest_predictions))

0.94728393619429


### Elastic net regression

In [157]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0, 0.2, 0.4, 0.6, 0.8, 1], 
    'normalize': [True, False], 
}

elasticNet = ElasticNet() 

grid_search = GridSearchCV(
    elasticNet, param_grid, cv = 5, 
    scoring = 'neg_mean_squared_error'
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.5, max_iter=1000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0, 0.2, 0.4, 0.6, 0.8, 1],
                         'normalize': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [158]:
grid_search.best_params_

{'alpha': 0.2, 'normalize': True}

In [159]:
elasticNet_preds = grid_search.best_estimator_.predict(X_test)
print(root_mean_squared_error(y_test, elasticNet_preds))

0.9439115821302053


## Export the best model 


In [160]:
from sklearn.ensemble import RandomForestRegressor
import joblib 

best_model = grid_search.best_estimator_

filename = "best_model_1.pkl"

joblib.dump(best_model, filename)

['best_model_1.pkl']

# Model 2 (predict grade) 

## Data preprocessing

In [175]:
# Load dataframe
df = pd.read_csv("student-mat.csv", sep=';')

print(df.shape)
df.head()

(395, 33)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10


In [176]:
cols = ['school', 'sex', 'age', 'address', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
        'paid', 'nursery', 'internet', 'famrel', 'Dalc', 'Walc', 'goout','schoolsup', 'famsup', 'higher',
        'activities', 'traveltime', 'famsize', 'Pstatus', 'health']

df = df.drop(cols, axis=1)
df.tail()

Unnamed: 0,studytime,failures,romantic,freetime,absences,G1,G2,G3
390,2,2,no,5,11,9,9,9
391,1,0,no,4,3,14,16,16
392,1,3,no,5,3,10,8,7
393,1,0,no,4,0,11,12,10
394,1,0,no,2,5,8,9,9


In [172]:
df.dtypes

studytime     int64
failures      int64
romantic     object
freetime      int64
absences      int64
G1            int64
G2            int64
G3            int64
dtype: object

In [177]:
binary_features = ['romantic']

encoder = OneHotEncoder()
encoded_binary_features = encoder.fit_transform(df[binary_features])

cat_cols = encoder.get_feature_names(binary_features)

cat_df = pd.DataFrame(encoded_binary_features.toarray(), columns=cat_cols)

additionalCols = ['romantic_no']
cat_df.drop(additionalCols, axis=1, inplace=True)

df.drop(binary_features, axis=1, inplace=True)
df.drop(['G1', 'G2'], axis=1, inplace=True)

students_df = df.join(cat_df)

students_df['score'] = students_df['G3'] / 2
students_df.drop(['G3'], axis=1, inplace=True)
students_df.head()

Unnamed: 0,studytime,failures,freetime,absences,romantic_yes,score
0,2,0,3,6,0.0,3.0
1,2,0,3,4,0.0,3.0
2,2,3,3,10,0.0,5.0
3,3,0,2,2,1.0,7.5
4,2,0,3,4,0.0,5.0


## Model selection 

In [180]:
target_features = ['score']

X = students_df.drop(target_features, axis=1)
y = students_df[target_features]

X_train, X_test, y_train, y_test = train_test_split(X, y)

### Linear regression

In [181]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Make predictions 
lin_reg_preds = lin_reg.predict(X_test)
# Calculate RMSE 
lin_reg_preds_mse = root_mean_squared_error(y_test, lin_reg_preds)

print("RMSE: ")
lin_reg_preds_mse

RMSE: 


2.22360744251324

In [182]:
param_grid = {
    'n_estimators': np.arange(50, 150, 10), 
    'max_features': [2, 4, 6, 8], 
    'max_depth': np.arange(1, 11, 1)
}

forest_reg = RandomForestRegressor() 

grid_search = GridSearchCV(
    forest_reg, param_grid, cv = 5, 
    scoring = 'neg_mean_squared_error'
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [183]:
grid_search.best_params_ 

{'max_depth': 5, 'max_features': 2, 'n_estimators': 80}

In [184]:
best_estimator = grid_search.best_estimator_

random_forest_predictions = best_estimator.predict(X_test)
print(root_mean_squared_error(y_test, random_forest_predictions))

2.080246884360478


## Export the best model 

In [185]:
best_model = grid_search.best_estimator_

filename = "best_model_2.pkl"

joblib.dump(best_model, filename)

['best_model_2.pkl']