In [None]:
import pandas as pd

# Read the semicolon-separated student-mat.csv into a dataframe
CSV_PATH = "student-mat.csv"
df = pd.read_csv(CSV_PATH, sep=";")

# Show (rows, columns) as a quick sanity check on the load
print(df.shape)

# Preview the first rows; kept as the cell's last expression so it renders as a table
df.head()

(395, 33)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10


In [None]:
from sklearn.model_selection import train_test_split

# Predictors: three columns taken unchanged from the raw frame.
X = df[['studytime', 'failures', 'absences']]

# Target: the G3 column, halved.
# NOTE(review): presumably this rescales the grade range — confirm the intended scale.
y = df['G3'] / 2

# Seed the shuffle so the train/test partition, and every score computed from
# it, is identical on each Restart & Run All. Default split proportions are kept.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Find best fitting RandomForestRegressor model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session. It is
# kept so later cells behave as before; consider narrowing it to one category.
warnings.filterwarnings('ignore')

# max_features may not usefully exceed the number of input columns. Larger
# values are invalid or redundant (older scikit-learn releases reject them at
# fit time, which the grid search would record as NaN scores), so the upper
# bound is derived from the training frame instead of being hard-coded.
n_features = X_train.shape[1]

param_grid = {
    'n_estimators': np.arange(50, 150, 10),        # 50, 60, ..., 140 trees
    'max_features': np.arange(1, n_features + 1),  # 1 .. number of columns
    'max_depth': np.arange(1, 11, 1),              # depths 1 .. 10
}

# Seed the forest so the selected hyper-parameters are reproducible.
forest_reg = RandomForestRegressor(random_state=42)

# Warnings are silenced above, so a failed fit must raise rather than quietly
# turn into a NaN score that nobody sees.
grid_search = GridSearchCV(
    forest_reg, param_grid, cv=5,
    scoring='neg_mean_squared_error',
    error_score='raise',
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [None]:
# Get the best estimator's parameters: the grid combination that scored best
# in cross-validation (scoring is negated MSE, so "best" means lowest MSE)
grid_search.best_params_ 

{'max_depth': 2, 'max_features': 2, 'n_estimators': 100}

In [None]:
from sklearn.metrics import mean_squared_error

def root_mean_squared_error(data, prediction):
  """
  Calculates the root mean squared error (RMSE) of a set of predictions

  :param data: Actual (ground-truth) target values
  :type data: array-like

  :param prediction: Predicted values, aligned element-wise with ``data``
  :type prediction: array-like

  :returns: Square root of the mean squared error, aggregated over all
            instances (one value for the whole set, not one per instance)
  :rtype: float (NumPy scalar)
  """
  mse = mean_squared_error(data, prediction)
  return np.sqrt(mse)

In [None]:
# Score the tuned random forest on the held-out test split.
# The metric is RMSE (an error, so lower is better), not an accuracy.

best_estimator = grid_search.best_estimator_
random_forest_predictions = best_estimator.predict(X_test)

test_rmse = root_mean_squared_error(y_test, random_forest_predictions)
print(test_rmse)

1.7281460013865346


In [None]:
import joblib

# Persist the tuned random forest to disk with joblib
# NOTE(review): the file name says "por" while the data loaded in this notebook
# is student-mat.csv — confirm the name is intentional before relying on it.

filename = "por_grade_pred.pkl"
best_model = grid_search.best_estimator_

# Last expression on purpose: the notebook echoes the list of written files
joblib.dump(best_model, filename)

['por_grade_pred.pkl']

## Gradient Boosting Tree

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Hold out a validation split that drives early stopping. The seed makes the
# partition, and therefore the reported numbers, reproducible across runs.
# NOTE(review): the same split is used both to pick the stopping point and to
# report the error, so the printed MSE is likely optimistic.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

gbrt = GradientBoostingRegressor(max_depth = 2, warm_start = True, random_state = 42)

max_estimators = 120  # upper bound on the ensemble sizes to try
n = 5                 # patience: consecutive non-improving sizes tolerated

min_val_error = float("inf")
best_n_estimators = 0
error_going_up = 0

# Try up to 120 estimators (inclusive). With warm_start each fit() call grows
# the existing ensemble to the new size instead of rebuilding it from scratch.
for n_estimators in range(1, max_estimators + 1):
  gbrt.n_estimators = n_estimators
  gbrt.fit(X_train, y_train)
  y_pred = gbrt.predict(X_val)
  val_error = mean_squared_error(y_val, y_pred)

  if val_error < min_val_error:
    min_val_error = val_error
    best_n_estimators = n_estimators
    error_going_up = 0
  else:
    error_going_up += 1
    if error_going_up == n:
      # Early stopping
      break

# When the last sizes tried did not improve the error, gbrt is larger than the
# size that achieved min_val_error. A warm-started model cannot be shrunk by
# lowering n_estimators, so refit a fresh model at the best size; the model and
# the count printed below then match the reported minimum.
if best_n_estimators and gbrt.n_estimators != best_n_estimators:
  gbrt = GradientBoostingRegressor(
    max_depth = 2, n_estimators = best_n_estimators,
    warm_start = True, random_state = 42
  )
  gbrt.fit(X_train, y_train)

print(gbrt.n_estimators)
print("Minimum validation MSE:", min_val_error)

30
Minimum validation MSE: 4.628900993196632
