In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

In [2]:
# Load every dataset this notebook works with.
# The '*_noID.csv' files are read from the working directory; per their names the
# ID column was presumably removed by the cleaning notebook already (not verifiable here).
cwd = os.getcwd()

# Local locations of the original train/test CSVs.
# NOTE(review): these two paths are not used in this cell -- the originals are
# fetched from S3 below. Confirm whether a later cell still needs them.
orig_train_data_path = os.path.join(cwd, 'wine_data', 'train.csv')
orig_test_data_path = os.path.join(cwd, 'wine_data', 'test.csv')

# Original competition files: download, then discard the 'Id' column right away
orig_test = pd.read_csv(
    'https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/test.csv'
).drop(columns='Id')
orig_train = pd.read_csv(
    'https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/train.csv'
).drop(columns='Id')

# Pre-cleaned datasets (relative paths -> resolved against the current working directory)
trainable_data = pd.read_csv('trainable_data_noID.csv')
red_white_data = pd.read_csv('red_white_data_noID.csv')
white_data = pd.read_csv('white_data_noID.csv')
red_data = pd.read_csv('red_data_noID.csv')

In [3]:
# Full copy of the trainable data.
# NOTE(review): this frame still contains the 'quality' target column -- confirm
# that is intended before using it as model input.
X_test_all = trainable_data.copy()

# Hold out 20% of the rows for validation; features are every column except
# 'quality', which is the target. The fixed seed keeps the split repeatable.
X_train_all, X_val_all, y_train_all, y_val_all = train_test_split(
    trainable_data.drop(columns='quality'),
    trainable_data['quality'].copy(),
    test_size=0.2,
    random_state=42,  # because it is the answer :)
)

In [4]:
from warnings import filterwarnings

# Suppress every warning from here on (this is process-wide, so it also
# affects all later cells).
filterwarnings('ignore')

# Base estimator: a Gradient Boosting *classifier* (quality is predicted as a class).
# The seed is fixed so that the grid search and the metrics reported below are
# reproducible on Restart & Run All; 42 matches the seed used for the split.
all_model = GradientBoostingClassifier(random_state=42)

# Hyperparameter grid to search (3 x 3 x 3 = 27 candidates)
all_param_grid = {
    'n_estimators': [50, 100, 200], # number of weak learning trees
    'learning_rate': [0.01, 0.1, 0.2], # control possibility of overfitting.
    # Strong interaction with n_estimators: smaller learning rates require more n_estimators
    'max_depth': [3, 5, 7] #  size of each tree
}

# Exhaustive grid search with 5-fold cross-validation on the training split.
# No `scoring` is given, so candidates are ranked by the estimator's own
# score method (mean accuracy for classifiers).
all_grid_search = GridSearchCV(all_model, all_param_grid, cv=5, verbose=3)
all_grid_search.fit(X_train_all, y_train_all)

# Best candidate, refit on the whole training split by GridSearchCV
all_best_gb = all_grid_search.best_estimator_

# Predict on the held-out validation split using the best model
y_pred_all = all_best_gb.predict(X_val_all)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.519 total time=   3.3s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.541 total time=   3.2s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.535 total time=   3.2s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.548 total time=   3.2s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.536 total time=   3.2s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.530 total time=   6.3s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.560 total time=   6.6s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.543 total time=   6.8s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.556 total time=   6.5s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.550 to

In [5]:
# Score the first model's validation predictions.
all_accuracy = accuracy_score(y_val_all, y_pred_all)

# The target is multiclass (not binary), so F1 / precision / recall need an
# explicit averaging mode; 'weighted' weights each class by its support.
all_f1, all_precision, all_recall = (
    scorer(y_val_all, y_pred_all, average= 'weighted')
    for scorer in (f1_score, precision_score, recall_score)
)

# Summary: chosen hyperparameters followed by the validation metrics
print(
    f"Best Estimators: {all_best_gb.n_estimators}",
    f"Best Learning Rate: {all_best_gb.learning_rate}",
    f"Best Max Depth: {all_best_gb.max_depth}",
    f"Accuracy: {all_accuracy:.2f}",
    f"F1: {all_f1:.2f}",
    f"Precision: {all_precision:.2f}",
    f"Recall: {all_recall:.2f}",
    sep="\n",
)

Best Estimators: 200
Best Learning Rate: 0.1
Best Max Depth: 7
Accuracy: 0.65
F1: 0.64
Precision: 0.65
Recall: 0.65


In [6]:
# Second search: same procedure as the first one, but over a grid shifted
# towards more and deeper trees.
# Base estimator is a Gradient Boosting *classifier*; the seed is fixed so the
# search and the metrics reported below are reproducible on Restart & Run All
# (42 matches the seed used for the train/validation split).
all_model_2 = GradientBoostingClassifier(random_state=42)

# Hyperparameter grid to search (3 x 3 x 3 = 27 candidates)
all_param_grid_2 = {
    'n_estimators': [100, 200, 400], # number of weak learning trees
    'learning_rate': [0.01, 0.1, 0.2], # control possibility of overfitting.
    # Strong interaction with n_estimators: smaller learning rates require more n_estimators
    'max_depth': [5, 7, 9] #  size of each tree
}

# Exhaustive grid search with 5-fold cross-validation on the training split.
# No `scoring` is given, so candidates are ranked by the estimator's own
# score method (mean accuracy for classifiers).
all_grid_search_2 = GridSearchCV(all_model_2, all_param_grid_2, cv=5, verbose=3)
all_grid_search_2.fit(X_train_all, y_train_all)

# Best candidate, refit on the whole training split by GridSearchCV
all_best_gb_2 = all_grid_search_2.best_estimator_

# Predict on the held-out validation split using the best model
y_pred_all_2 = all_best_gb_2.predict(X_val_all)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.554 total time=  10.2s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.581 total time=  10.4s
[CV 3/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.567 total time=  10.7s
[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.575 total time=  11.0s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.580 total time=  10.2s
[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.571 total time=  20.9s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.607 total time=  20.9s
[CV 3/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.585 total time=  21.4s
[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.586 total time=  21.2s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.5

In [7]:
# Score the second model's validation predictions.
all_accuracy_2 = accuracy_score(y_val_all, y_pred_all_2)

# The target is multiclass (not binary), so F1 / precision / recall need an
# explicit averaging mode; 'weighted' weights each class by its support.
all_f1_2, all_precision_2, all_recall_2 = (
    scorer(y_val_all, y_pred_all_2, average= 'weighted')
    for scorer in (f1_score, precision_score, recall_score)
)

# Summary: chosen hyperparameters followed by the validation metrics
print(
    f"Best Estimators: {all_best_gb_2.n_estimators}",
    f"Best Learning Rate: {all_best_gb_2.learning_rate}",
    f"Best Max Depth: {all_best_gb_2.max_depth}",
    f"Accuracy: {all_accuracy_2:.2f}",
    f"F1: {all_f1_2:.2f}",
    f"Precision: {all_precision_2:.2f}",
    f"Recall: {all_recall_2:.2f}",
    sep="\n",
)

Best Estimators: 400
Best Learning Rate: 0.01
Best Max Depth: 9
Accuracy: 0.66
F1: 0.65
Precision: 0.66
Recall: 0.66


In [8]:
# Detailed report for the first model: winning hyperparameters, confusion
# matrix and per-class precision / recall / F1 on the validation split.
all_best_hp = all_grid_search.best_params_
print(
    f"Best hp for 1st model: {all_best_hp}",
    "---------------------------------",
    "Confusion Matrix:",
    sep="\n",
)
# Rows are true classes, columns are predicted classes
display(confusion_matrix(y_val_all, y_pred_all))
print(
    "---------------------------------",
    classification_report(y_val_all,y_pred_all),
    sep="\n",
)

Best hp for 1st model: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
---------------------------------
Confusion Matrix:


array([[  0,   0,   4,   1,   0,   0,   0],
       [  3,   7,  30,  14,   1,   0,   0],
       [  4,   3, 418, 150,   7,   0,   0],
       [  2,   2, 154, 529,  49,   1,   0],
       [  0,   0,  15, 129, 131,   3,   1],
       [  0,   0,   1,  20,  12,  20,   0],
       [  0,   0,   0,   0,   0,   0,   0]])

---------------------------------
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.58      0.13      0.21        55
           5       0.67      0.72      0.69       582
           6       0.63      0.72      0.67       737
           7       0.66      0.47      0.55       279
           8       0.83      0.38      0.52        53
           9       0.00      0.00      0.00         0

    accuracy                           0.65      1711
   macro avg       0.48      0.34      0.38      1711
weighted avg       0.65      0.65      0.64      1711



In [9]:
# Detailed report for the second model: winning hyperparameters, confusion
# matrix and per-class precision / recall / F1 on the validation split.
all_best_hp_2 = all_grid_search_2.best_params_
print(
    f"Best hp for 2nd model: {all_best_hp_2}",
    "---------------------------------",
    "Confusion Matrix:",
    sep="\n",
)
# Rows are true classes, columns are predicted classes
display(confusion_matrix(y_val_all, y_pred_all_2))
print(
    "---------------------------------",
    classification_report(y_val_all,y_pred_all_2),
    sep="\n",
)

Best hp for 2nd model: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 400}
---------------------------------
Confusion Matrix:


array([[  0,   0,   3,   2,   0,   0,   0],
       [  0,   8,  32,  15,   0,   0,   0],
       [  1,   3, 430, 142,   6,   0,   0],
       [  0,   2, 151, 527,  55,   2,   0],
       [  0,   0,  11, 126, 138,   3,   1],
       [  0,   0,   1,  18,  13,  21,   0],
       [  0,   0,   0,   0,   0,   0,   0]])

---------------------------------
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.62      0.15      0.24        55
           5       0.68      0.74      0.71       582
           6       0.63      0.72      0.67       737
           7       0.65      0.49      0.56       279
           8       0.81      0.40      0.53        53
           9       0.00      0.00      0.00         0

    accuracy                           0.66      1711
   macro avg       0.48      0.36      0.39      1711
weighted avg       0.66      0.66      0.65      1711

