In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

In [2]:
# ID feature is already dropped from cleaning notebook
# Define the paths for all source files
cwd = os.getcwd()
orig_test_data_path = os.path.join(cwd,'wine_data','test.csv')
orig_train_data_path = os.path.join(cwd,'wine_data','train.csv')

# Open all source files as a pd dataframe
orig_test = pd.read_csv('https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/test.csv')
orig_train = pd.read_csv('https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/train.csv')
red_data = pd.read_csv('red_data_noID.csv')
white_data = pd.read_csv('white_data_noID.csv')
red_white_data = pd.read_csv('red_white_data_noID.csv')
trainable_data = pd.read_csv('trainable_data_noID.csv')

# Drop 'ID' columns
orig_test = orig_test.drop(columns='Id')
orig_train = orig_train.drop(columns='Id')

In [3]:
# Split the data into training and testing sets
X_train_white = white_data.drop('quality', axis=1)
y_train_white = white_data['quality'].copy()
X_test_white = white_data.copy()
# y_test_white -= 3
X_train_white, X_val_white, y_train_white, y_val_white = train_test_split(X_train_white, 
                                                  y_train_white, 
                                                  test_size = 0.2, 
                                                  random_state = 42) #because it is the answer :)

In [4]:
from warnings import filterwarnings
filterwarnings('ignore')

# Create a Gradient Boosting Regressor
white_model = GradientBoostingClassifier()

# Define the hyperparameter grid to search
white_param_grid = {
    'n_estimators': [50, 100, 200], # number of weak learning trees
    'learning_rate': [0.01, 0.1, 0.2], # control possibility of overfitting.
    # Strong interaction with n_estimators: smaller learning rates require more n_estimators
    'max_depth': [3, 5, 7] #  size of each tree
}

# Perform grid search with cross-validation
white_grid_search = GridSearchCV(white_model, white_param_grid, cv=5, verbose=3)
white_grid_search.fit(X_train_white, y_train_white)

# Get the best model from grid search
white_best_gb = white_grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred_white = white_best_gb.predict(X_val_white)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.533 total time=   2.1s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.519 total time=   2.0s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.538 total time=   1.9s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.519 total time=   1.9s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.513 total time=   1.9s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.551 total time=   3.8s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.529 total time=   3.8s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.560 total time=   3.9s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.554 total time=   4.0s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.527 to

In [5]:
# Calculate metrics
white_accuracy = accuracy_score(y_val_white, y_pred_white)
# the following metrics need to include the average type since the validation is ordinal and not binary
white_f1 = f1_score(y_val_white, y_pred_white, average= 'weighted')
white_precision = precision_score(y_val_white, y_pred_white, average= 'weighted')
white_recall = recall_score(y_val_white, y_pred_white, average= 'weighted')

print(f"Best Estimators: {white_best_gb.n_estimators}")
print(f"Best Learning Rate: {white_best_gb.learning_rate}")
print(f"Best Max Depth: {white_best_gb.max_depth}")
print(f"Accuracy: {white_accuracy:.2f}")
print(f"F1: {white_f1:.2f}")
print(f"Precision: {white_precision:.2f}")
print(f"Recall: {white_recall:.2f}")

Best Estimators: 100
Best Learning Rate: 0.2
Best Max Depth: 5
Accuracy: 0.67
F1: 0.66
Precision: 0.67
Recall: 0.67


In [7]:
# Create a Gradient Boosting Regressor
white_model_2 = GradientBoostingClassifier()

# Define the hyperparameter grid to search
white_param_grid_2 = {
    'n_estimators': [50, 100, 200], # number of weak learning trees
    'learning_rate': [0.1, 0.2, 0.4], # control possibility of overfitting.
    # Strong interaction with n_estimators: smaller learning rates require more n_estimators
    'max_depth': [3, 5, 7] #  size of each tree
}

# Perform grid search with cross-validation
white_grid_search_2 = GridSearchCV(white_model_2, white_param_grid_2, cv=5, verbose=3)
white_grid_search_2.fit(X_train_white, y_train_white)

# Get the best model from grid search
white_best_gb_2 = white_grid_search_2.best_estimator_

# Make predictions on the test data using the best model
y_pred_white_2 = white_best_gb_2.predict(X_val_white)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.1, max_depth=3, n_estimators=50;, score=0.571 total time=   2.0s
[CV 2/5] END learning_rate=0.1, max_depth=3, n_estimators=50;, score=0.585 total time=   1.9s
[CV 3/5] END learning_rate=0.1, max_depth=3, n_estimators=50;, score=0.597 total time=   1.9s
[CV 4/5] END learning_rate=0.1, max_depth=3, n_estimators=50;, score=0.593 total time=   1.9s
[CV 5/5] END learning_rate=0.1, max_depth=3, n_estimators=50;, score=0.561 total time=   1.9s
[CV 1/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.587 total time=   4.0s
[CV 2/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.593 total time=   3.8s
[CV 3/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.597 total time=   3.8s
[CV 4/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.613 total time=   3.9s
[CV 5/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.581 total time= 

In [8]:
# Calculate metrics
white_accuracy_2 = accuracy_score(y_val_white, y_pred_white_2)
# the following metrics need to include the average type since the validation is ordinal and not binary
white_f1_2 = f1_score(y_val_white, y_pred_white_2, average= 'weighted')
white_precision_2 = precision_score(y_val_white, y_pred_white_2, average= 'weighted')
white_recall_2 = recall_score(y_val_white, y_pred_white_2, average= 'weighted')

print(f"Best Estimators: {white_best_gb_2.n_estimators}")
print(f"Best Learning Rate: {white_best_gb_2.learning_rate}")
print(f"Best Max Depth: {white_best_gb_2.max_depth}")
print(f"Accuracy: {white_accuracy_2:.2f}")
print(f"F1: {white_f1_2:.2f}")
print(f"Precision: {white_precision_2:.2f}")
print(f"Recall: {white_recall_2:.2f}")

Best Estimators: 200
Best Learning Rate: 0.2
Best Max Depth: 7
Accuracy: 0.68
F1: 0.68
Precision: 0.69
Recall: 0.68


In [9]:
white_best_hp = white_grid_search.best_params_
print(f"Best hp for 1st model: {white_best_hp}")
print("---------------------------------")
print("Confusion Matrix:")
display(confusion_matrix(y_val_white, y_pred_white))
print("---------------------------------")
print(classification_report(y_val_white,y_pred_white))

Best hp for 1st model: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100}
---------------------------------
Confusion Matrix:


array([[  1,   0,   3,   1,   0,   0],
       [  1,   7,  11,   6,   0,   0],
       [  0,   5, 198,  84,   3,   1],
       [  2,   1,  66, 325,  34,   4],
       [  0,   1,   5,  72, 109,   5],
       [  0,   0,   1,   9,  10,  15]])

---------------------------------
              precision    recall  f1-score   support

           3       0.25      0.20      0.22         5
           4       0.50      0.28      0.36        25
           5       0.70      0.68      0.69       291
           6       0.65      0.75      0.70       432
           7       0.70      0.57      0.63       192
           8       0.60      0.43      0.50        35

    accuracy                           0.67       980
   macro avg       0.57      0.48      0.52       980
weighted avg       0.67      0.67      0.66       980



In [10]:
white_best_hp_2 = white_grid_search_2.best_params_
print(f"Best hp for 2nd model: {white_best_hp_2}")
print("---------------------------------")
print("Confusion Matrix:")
display(confusion_matrix(y_val_white, y_pred_white_2))
print("---------------------------------")
print(classification_report(y_val_white,y_pred_white_2))

Best hp for 2nd model: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}
---------------------------------
Confusion Matrix:


array([[  0,   0,   4,   1,   0,   0,   0],
       [  2,   9,   9,   5,   0,   0,   0],
       [  0,   6, 204,  77,   4,   0,   0],
       [  3,   0,  68, 326,  34,   0,   1],
       [  0,   1,   2,  71, 117,   1,   0],
       [  0,   0,   1,  11,   8,  15,   0],
       [  0,   0,   0,   0,   0,   0,   0]])

---------------------------------
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.56      0.36      0.44        25
           5       0.71      0.70      0.70       291
           6       0.66      0.75      0.71       432
           7       0.72      0.61      0.66       192
           8       0.94      0.43      0.59        35
           9       0.00      0.00      0.00         0

    accuracy                           0.68       980
   macro avg       0.51      0.41      0.44       980
weighted avg       0.69      0.68      0.68       980

