In [1]:
import os
from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Local locations of the original competition files.
# NOTE(review): these paths are not used within this cell -- the originals are
# read from the S3 bucket below instead.
cwd = os.getcwd()
orig_test_data_path = os.path.join(cwd, 'wine_data', 'test.csv')
orig_train_data_path = os.path.join(cwd, 'wine_data', 'train.csv')

# Original train/test sets: fetch from the project bucket and discard the
# 'Id' column as part of the same expression.
orig_test = pd.read_csv(
    'https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/test.csv'
).drop(columns='Id')
orig_train = pd.read_csv(
    'https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/train.csv'
).drop(columns='Id')

# Cleaned datasets written by the cleaning notebook; the ID feature was
# already dropped there, so no further column removal is needed.
red_data = pd.read_csv('red_data_noID.csv')
white_data = pd.read_csv('white_data_noID.csv')
red_white_data = pd.read_csv('red_white_data_noID.csv')
trainable_data = pd.read_csv('trainable_data_noID.csv')

In [3]:
# NOTE(review): despite its name, X_test_red is a full copy of red_data and
# still contains the 'quality' target column -- it is not a held-out test set.
X_test_red = red_data.copy()

# Separate the features from the 'quality' target and hold out 20% of the
# rows for validation. The fixed seed keeps the split reproducible.
X_train_red, X_val_red, y_train_red, y_val_red = train_test_split(
    red_data.drop(columns='quality'),
    red_data['quality'].copy(),
    test_size=0.2,
    random_state=42,
)

In [5]:
# Silence library warnings so the verbose grid-search log stays readable.
# NOTE(review): this is a blanket, kernel-wide filter -- it also hides any
# warnings raised by every later cell.
filterwarnings('ignore')

# Create a Gradient Boosting classifier. The fixed seed (same value as the
# train/validation split) makes the search reproducible across kernel restarts.
red_model = GradientBoostingClassifier(random_state=42)

# Define the hyperparameter grid to search
red_param_grid = {
    'n_estimators': [50, 100, 200], # number of weak learning trees
    'learning_rate': [0.01, 0.1, 0.2], # control possibility of overfitting.
    # Strong interaction with n_estimators: smaller learning rates require more n_estimators
    'max_depth': [3, 5, 7] #  size of each tree
}

# Perform an exhaustive grid search with 5-fold cross-validation
# (3 x 3 x 3 = 27 candidates, 135 fits); verbose=3 logs every fold's score.
red_grid_search = GridSearchCV(red_model, red_param_grid, cv=5, verbose=3)
red_grid_search.fit(X_train_red, y_train_red)

# Best model from the grid search (GridSearchCV refits it on the whole
# training split by default)
red_best_gb = red_grid_search.best_estimator_

# Make predictions on the held-out validation split using the best model
y_pred_red = red_best_gb.predict(X_val_red)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.582 total time=   0.7s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.551 total time=   0.7s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.590 total time=   0.7s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.582 total time=   0.7s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.647 total time=   0.7s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.605 total time=   1.5s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.582 total time=   1.5s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.598 total time=   1.4s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.621 total time=   1.5s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.671 to

In [6]:
# Score the first tuned model on the validation split.
red_accuracy = accuracy_score(y_val_red, y_pred_red)

# 'quality' is a multiclass target, so F1 / precision / recall need an explicit
# averaging mode; 'weighted' averages the per-class scores by class support.
red_f1, red_precision, red_recall = (
    scorer(y_val_red, y_pred_red, average= 'weighted')
    for scorer in (f1_score, precision_score, recall_score)
)

# Report the selected hyperparameters followed by the validation metrics.
print("\n".join([
    f"Best Estimators: {red_best_gb.n_estimators}",
    f"Best Learning Rate: {red_best_gb.learning_rate}",
    f"Best Max Depth: {red_best_gb.max_depth}",
    f"Accuracy: {red_accuracy:.2f}",
    f"F1: {red_f1:.2f}",
    f"Precision: {red_precision:.2f}",
    f"Recall: {red_recall:.2f}",
]))

Best Estimators: 50
Best Learning Rate: 0.1
Best Max Depth: 7
Accuracy: 0.62
F1: 0.61
Precision: 0.61
Recall: 0.62


In [7]:
# Second search: same setup as the first, but with the max_depth range shifted
# to deeper trees (5-9 instead of 3-7).
# Create a Gradient Boosting classifier. The fixed seed (same value as the
# train/validation split) makes the search reproducible across kernel restarts.
red_model_2 = GradientBoostingClassifier(random_state=42)

# Define the hyperparameter grid to search
red_param_grid_2 = {
    'n_estimators': [50, 100, 200], # number of weak learning trees
    'learning_rate': [0.01, 0.1, 0.2], # control possibility of overfitting.
    # Strong interaction with n_estimators: smaller learning rates require more n_estimators
    'max_depth': [5, 7, 9] #  size of each tree
}

# Perform an exhaustive grid search with 5-fold cross-validation
# (3 x 3 x 3 = 27 candidates, 135 fits); verbose=3 logs every fold's score.
red_grid_search_2 = GridSearchCV(red_model_2, red_param_grid_2, cv=5, verbose=3)
red_grid_search_2.fit(X_train_red, y_train_red)

# Best model from the grid search (GridSearchCV refits it on the whole
# training split by default)
red_best_gb_2 = red_grid_search_2.best_estimator_

# Make predictions on the held-out validation split using the best model
y_pred_red_2 = red_best_gb_2.predict(X_val_red)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.621 total time=   1.3s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.609 total time=   1.2s
[CV 3/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.598 total time=   1.3s
[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.629 total time=   1.3s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.690 total time=   1.2s
[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.637 total time=   2.6s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.648 total time=   2.5s
[CV 3/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.637 total time=   2.5s
[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.660 total time=   2.6s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.690 to

In [8]:
# Score the second tuned model on the validation split.
red_accuracy_2 = accuracy_score(y_val_red, y_pred_red_2)

# 'quality' is a multiclass target, so F1 / precision / recall need an explicit
# averaging mode; 'weighted' averages the per-class scores by class support.
red_f1_2, red_precision_2, red_recall_2 = (
    scorer(y_val_red, y_pred_red_2, average= 'weighted')
    for scorer in (f1_score, precision_score, recall_score)
)

# Report the selected hyperparameters followed by the validation metrics.
print("\n".join([
    f"Best Estimators: {red_best_gb_2.n_estimators}",
    f"Best Learning Rate: {red_best_gb_2.learning_rate}",
    f"Best Max Depth: {red_best_gb_2.max_depth}",
    f"Accuracy: {red_accuracy_2:.2f}",
    f"F1: {red_f1_2:.2f}",
    f"Precision: {red_precision_2:.2f}",
    f"Recall: {red_recall_2:.2f}",
]))

Best Estimators: 200
Best Learning Rate: 0.1
Best Max Depth: 9
Accuracy: 0.63
F1: 0.63
Precision: 0.63
Recall: 0.63


In [18]:
# Detailed validation report for the first grid search: the winning
# hyperparameters, the confusion matrix (rich display), and the per-class
# precision / recall / F1 table.
red_best_hp = red_grid_search.best_params_
print(
    f"Best hp for 1st model: {red_best_hp}",
    "---------------------------------",
    "Confusion Matrix:",
    sep="\n",
)
display(confusion_matrix(y_val_red, y_pred_red))

print(
    "---------------------------------",
    classification_report(y_val_red,y_pred_red),
    sep="\n",
)

Best hp for 1st model: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50}
---------------------------------
Confusion Matrix:


array([[ 0,  0,  1,  0,  0,  0],
       [ 1,  1,  4,  4,  0,  0],
       [ 1,  1, 88, 38,  2,  0],
       [ 1,  0, 30, 90, 10,  1],
       [ 0,  0,  2, 20, 18,  2],
       [ 0,  0,  0,  3,  2,  0]])

---------------------------------
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.50      0.10      0.17        10
           5       0.70      0.68      0.69       130
           6       0.58      0.68      0.63       132
           7       0.56      0.43      0.49        42
           8       0.00      0.00      0.00         5

    accuracy                           0.62       320
   macro avg       0.39      0.31      0.33       320
weighted avg       0.61      0.62      0.61       320



In [19]:
# Detailed validation report for the second grid search: the winning
# hyperparameters, the confusion matrix, and the per-class metrics table.
red_best_hp_2 = red_grid_search_2.best_params_
# Label fixed: this cell reports the 2nd model (the text was copied from the
# 1st-model cell and still said "1st").
print(f"Best hp for 2nd model: {red_best_hp_2}")
print("---------------------------------")
print("Confusion Matrix:")
display(confusion_matrix(y_val_red, y_pred_red_2))
print("---------------------------------")
print(classification_report(y_val_red,y_pred_red_2))

Best hp for 1st model: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 200}
---------------------------------
Confusion Matrix:


array([[ 0,  0,  1,  0,  0,  0],
       [ 0,  1,  6,  3,  0,  0],
       [ 0,  0, 94, 36,  0,  0],
       [ 0,  1, 34, 87,  7,  3],
       [ 0,  0,  1, 19, 21,  1],
       [ 0,  0,  0,  3,  2,  0]])

---------------------------------
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.50      0.10      0.17        10
           5       0.69      0.72      0.71       130
           6       0.59      0.66      0.62       132
           7       0.70      0.50      0.58        42
           8       0.00      0.00      0.00         5

    accuracy                           0.63       320
   macro avg       0.41      0.33      0.35       320
weighted avg       0.63      0.63      0.63       320

