In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

In [2]:
# Local locations of the original Kaggle-style train/test CSVs.
# NOTE(review): these paths are built but not read in this cell -- the
# original frames below are fetched from the S3 bucket instead. Confirm which
# source is intended.
cwd = os.getcwd()
orig_test_data_path, orig_train_data_path = (
    os.path.join(cwd, 'wine_data', csv_name)
    for csv_name in ('test.csv', 'train.csv')
)

# Original train/test frames, with the 'Id' column removed right after loading.
orig_test = pd.read_csv(
    'https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/test.csv'
).drop(columns=['Id'])
orig_train = pd.read_csv(
    'https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/train.csv'
).drop(columns=['Id'])

# Cleaned datasets; per the cleaning notebook these files already have the ID
# feature dropped, so they are loaded as-is.
red_data, white_data, red_white_data, trainable_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        'red_data_noID.csv',
        'white_data_noID.csv',
        'red_white_data_noID.csv',
        'trainable_data_noID.csv',
    )
)

In [3]:
# Train/validation split for the combined red+white data: every column except
# 'quality' is a feature, 'quality' is the target, and 20% of the rows are
# held out for validation.
X_train_redwhite, X_val_redwhite, y_train_redwhite, y_val_redwhite = train_test_split(
    red_white_data.drop(columns='quality'),
    red_white_data['quality'].copy(),
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

# NOTE(review): this "test" frame is a full copy of red_white_data and still
# contains the 'quality' column; it is not used in this cell -- confirm intent.
X_test_redwhite = red_white_data.copy()

In [4]:
from warnings import filterwarnings
# NOTE(review): this installs a process-wide "ignore everything" filter, so
# warnings stay silenced for every later cell as well. Consider narrowing it
# to the specific warning categories that are expected here.
filterwarnings('ignore')

# Create a Gradient Boosting *classifier* ('quality' is predicted as a class).
# random_state pins the estimator's internal randomness so the grid search and
# the selected model are reproducible on Restart & Run All (same seed as the
# train/validation split).
redwhite_model = GradientBoostingClassifier(random_state=42)

# Hyperparameter grid to search (3 x 3 x 3 = 27 candidates)
redwhite_param_grid = {
    'n_estimators': [50, 100, 200],  # number of boosting stages (weak learner trees)
    # Shrinks the contribution of each tree; lower values reduce overfitting.
    # Strong interaction with n_estimators: smaller learning rates require more n_estimators
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],  # maximum depth of each individual tree
}

# Grid search with 5-fold cross-validation on the training split only.
# scoring is left at its default, i.e. the classifier's own score (mean accuracy).
redwhite_grid_search = GridSearchCV(redwhite_model, redwhite_param_grid, cv=5, verbose=3)
redwhite_grid_search.fit(X_train_redwhite, y_train_redwhite)

# Best candidate, refit by GridSearchCV on the whole training split
redwhite_best_gb = redwhite_grid_search.best_estimator_

# Predict on the held-out validation split (not a separate test set)
y_pred_redwhite = redwhite_best_gb.predict(X_val_redwhite)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.541 total time=   2.5s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.532 total time=   2.5s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.505 total time=   2.6s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.535 total time=   2.5s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.537 total time=   2.8s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.550 total time=   5.1s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.540 total time=   5.1s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.513 total time=   5.1s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.557 total time=   5.1s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.552 to

In [5]:
# Validation metrics for the first tuned model.
redwhite_accuracy = accuracy_score(y_val_redwhite, y_pred_redwhite)

# F1 / precision / recall need an explicit averaging strategy because the
# target is multiclass rather than binary; 'weighted' averages the per-class
# scores weighted by each class's support.
redwhite_f1, redwhite_precision, redwhite_recall = (
    metric(y_val_redwhite, y_pred_redwhite, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)

# Selected hyperparameters followed by the validation scores, one per line.
print(
    f"Best Estimators: {redwhite_best_gb.n_estimators}",
    f"Best Learning Rate: {redwhite_best_gb.learning_rate}",
    f"Best Max Depth: {redwhite_best_gb.max_depth}",
    f"Accuracy: {redwhite_accuracy:.2f}",
    f"F1: {redwhite_f1:.2f}",
    f"Precision: {redwhite_precision:.2f}",
    f"Recall: {redwhite_recall:.2f}",
    sep="\n",
)

Best Estimators: 100
Best Learning Rate: 0.1
Best Max Depth: 7
Accuracy: 0.67
F1: 0.67
Precision: 0.67
Recall: 0.67


In [6]:
# Second search: same setup as the first one, but with the max_depth range
# shifted deeper (5/7/9 instead of 3/5/7).
# Create a Gradient Boosting *classifier*; random_state pins the estimator's
# internal randomness so the search result is reproducible on Restart & Run All
# (same seed as the train/validation split).
redwhite_model_2 = GradientBoostingClassifier(random_state=42)

# Hyperparameter grid to search (3 x 3 x 3 = 27 candidates).
# NOTE(review): the max_depth 5 and 7 candidates (18 of 27) repeat fits already
# done by the first grid; only max_depth=9 is new.
redwhite_param_grid_2 = {
    'n_estimators': [50, 100, 200],  # number of boosting stages (weak learner trees)
    # Shrinks the contribution of each tree; lower values reduce overfitting.
    # Strong interaction with n_estimators: smaller learning rates require more n_estimators
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [5, 7, 9],  # maximum depth of each individual tree
}

# Grid search with 5-fold cross-validation on the training split only.
# scoring is left at its default, i.e. the classifier's own score (mean accuracy).
redwhite_grid_search_2 = GridSearchCV(redwhite_model_2, redwhite_param_grid_2, cv=5, verbose=3)
redwhite_grid_search_2.fit(X_train_redwhite, y_train_redwhite)

# Best candidate, refit by GridSearchCV on the whole training split
redwhite_best_gb_2 = redwhite_grid_search_2.best_estimator_

# Predict on the held-out validation split (not a separate test set)
y_pred_redwhite_2 = redwhite_best_gb_2.predict(X_val_redwhite)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.558 total time=   4.3s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.552 total time=   4.2s
[CV 3/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.520 total time=   4.2s
[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.543 total time=   4.5s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.558 total time=   4.3s
[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.567 total time=   8.6s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.590 total time=   8.3s
[CV 3/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.556 total time=   8.6s
[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.573 total time=   8.4s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.583 to

In [7]:
# Validation metrics for the second tuned model.
redwhite_accuracy_2 = accuracy_score(y_val_redwhite, y_pred_redwhite_2)

# F1 / precision / recall need an explicit averaging strategy because the
# target is multiclass rather than binary; 'weighted' averages the per-class
# scores weighted by each class's support.
redwhite_f1_2, redwhite_precision_2, redwhite_recall_2 = (
    metric(y_val_redwhite, y_pred_redwhite_2, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)

# Selected hyperparameters followed by the validation scores, one per line.
print(
    f"Best Estimators: {redwhite_best_gb_2.n_estimators}",
    f"Best Learning Rate: {redwhite_best_gb_2.learning_rate}",
    f"Best Max Depth: {redwhite_best_gb_2.max_depth}",
    f"Accuracy: {redwhite_accuracy_2:.2f}",
    f"F1: {redwhite_f1_2:.2f}",
    f"Precision: {redwhite_precision_2:.2f}",
    f"Recall: {redwhite_recall_2:.2f}",
    sep="\n",
)

Best Estimators: 200
Best Learning Rate: 0.2
Best Max Depth: 9
Accuracy: 0.67
F1: 0.67
Precision: 0.68
Recall: 0.67


In [8]:
# Winning hyperparameter combination from the first grid search.
redwhite_best_hp = redwhite_grid_search.best_params_

# Header lines: chosen hyperparameters, a separator, then the matrix title.
print(
    f"Best hp for 1st model: {redwhite_best_hp}",
    "---------------------------------",
    "Confusion Matrix:",
    sep="\n",
)

# Rows are the true validation labels, columns the predicted labels.
display(confusion_matrix(y_val_redwhite, y_pred_redwhite))

# Per-class precision / recall / F1 on the validation split.
print("---------------------------------")
print(classification_report(y_val_redwhite, y_pred_redwhite))

Best hp for 1st model: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
---------------------------------
Confusion Matrix:


array([[  1,   0,   2,   2,   1,   0,   0],
       [  1,   9,  24,   9,   0,   0,   0],
       [  1,   4, 288, 103,   6,   0,   0],
       [  2,   4, 109, 444,  35,   3,   0],
       [  0,   0,   6,  85, 120,   3,   1],
       [  0,   0,   0,  11,  12,  13,   0],
       [  0,   0,   0,   1,   0,   0,   0]])

---------------------------------
              precision    recall  f1-score   support

           3       0.20      0.17      0.18         6
           4       0.53      0.21      0.30        43
           5       0.67      0.72      0.69       402
           6       0.68      0.74      0.71       597
           7       0.69      0.56      0.62       215
           8       0.68      0.36      0.47        36
           9       0.00      0.00      0.00         1

    accuracy                           0.67      1300
   macro avg       0.49      0.39      0.42      1300
weighted avg       0.67      0.67      0.67      1300



In [9]:
# Winning hyperparameter combination from the second grid search.
redwhite_best_hp_2 = redwhite_grid_search_2.best_params_

# Header lines: chosen hyperparameters, a separator, then the matrix title.
print(
    f"Best hp for 2nd model: {redwhite_best_hp_2}",
    "---------------------------------",
    "Confusion Matrix:",
    sep="\n",
)

# Rows are the true validation labels, columns the predicted labels.
display(confusion_matrix(y_val_redwhite, y_pred_redwhite_2))

# Per-class precision / recall / F1 on the validation split.
print("---------------------------------")
print(classification_report(y_val_redwhite, y_pred_redwhite_2))

Best hp for 2nd model: {'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 200}
---------------------------------
Confusion Matrix:


array([[  0,   0,   2,   2,   1,   0,   1],
       [  1,   9,  24,   9,   0,   0,   0],
       [  2,   2, 289, 103,   6,   0,   0],
       [  3,   4, 106, 441,  41,   1,   1],
       [  0,   0,   3,  86, 124,   1,   1],
       [  0,   0,   0,  12,  11,  13,   0],
       [  0,   0,   0,   1,   0,   0,   0]])

---------------------------------
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         6
           4       0.60      0.21      0.31        43
           5       0.68      0.72      0.70       402
           6       0.67      0.74      0.71       597
           7       0.68      0.58      0.62       215
           8       0.87      0.36      0.51        36
           9       0.00      0.00      0.00         1

    accuracy                           0.67      1300
   macro avg       0.50      0.37      0.41      1300
weighted avg       0.68      0.67      0.67      1300

