# Gradient Boosting Classifier Model
---
The model in this notebook was strongly influenced by Natural Chan.  The following gradient boosted classifier will be used in an ordinal regression dataset.

In [15]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

In [2]:
# Save the following code for later...will remove upon final revision

# kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic') 
# grid_search = GridSearchCV(xgb, param_grid, cv=5, verbose = 3, scoring=kappa_scorer)

In [3]:
# ID feature is already dropped from cleaning notebook
# Define the paths for all source files
cwd = os.getcwd()
orig_test_data_path = os.path.join(cwd,'wine_data','test.csv')
orig_train_data_path = os.path.join(cwd,'wine_data','train.csv')

# Open all source files as a pd dataframe
orig_test = pd.read_csv('https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/test.csv')
orig_train = pd.read_csv('https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/train.csv')
red_data = pd.read_csv('red_data_noID.csv')
white_data = pd.read_csv('white_data_noID.csv')
red_white_data = pd.read_csv('red_white_data_noID.csv')
trainable_data = pd.read_csv('trainable_data_noID.csv')

# Drop 'ID' columns
orig_test = orig_test.drop(columns='Id')
orig_train = orig_train.drop(columns='Id')

In [4]:
orig_train.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [5]:
# Split the data into training and testing sets
X_train = orig_train.drop('quality', axis=1)
y_train = orig_train['quality'].copy()
X_test = orig_test.copy()
# y_test -= 3
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42) #because it is the answer :)

In [8]:
from warnings import filterwarnings
filterwarnings('ignore')

# Create a Gradient Boosting Regressor
model = GradientBoostingClassifier()

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200], # number of weak learning trees
    'learning_rate': [0.01, 0.1, 0.2], # control possibility of overfitting.
    # Strong interaction with n_estimators: smaller learning rates require more n_estimators
    'max_depth': [3, 5, 7] #  size of each tree
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_gb = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_gb.predict(X_val)



In [10]:
# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
# the following metrics need to include the average type since the validation is ordinal and not binary
f1 = f1_score(y_val, y_pred, average= 'weighted')
precision = precision_score(y_val, y_pred, average= 'weighted')
recall = recall_score(y_val, y_pred, average= 'weighted')

print(f"Best Estimators: {best_gb.n_estimators}")
print(f"Best Learning Rate: {best_gb.learning_rate}")
print(f"Best Max Depth: {best_gb.max_depth}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Best Estimators: 200
Best Learning Rate: 0.01
Best Max Depth: 3
Accuracy: 0.59
F1: 0.57
Precision: 0.59
Recall: 0.59


In [11]:
# Change the number of weak learner trees
# Create a Gradient Boosting Regressor
model = GradientBoostingClassifier(n_classes = len(y_train.unique()))

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [100, 200, 400],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_gb = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_gb.predict(X_val)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.556 total time=   3.5s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.574 total time=   3.3s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.611 total time=   3.3s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.599 total time=   3.3s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.601 total time=   3.3s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=400;, score=0.578 total time=   6.6s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=400;, score=0.568 total time=   6.7s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=400;, score=0.605 total time=   6.9s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=400;, score=0.614 total time=   7.6s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=400;, score=0.5

In [12]:
# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
# the following metrics need to include the average type since the validation is ordinal and not binary
f1 = f1_score(y_val, y_pred, average= 'weighted')
precision = precision_score(y_val, y_pred, average= 'weighted')
recall = recall_score(y_val, y_pred, average= 'weighted')

print(f"Best Estimators: {best_gb.n_estimators}")
print(f"Best Learning Rate: {best_gb.learning_rate}")
print(f"Best Max Depth: {best_gb.max_depth}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Best Estimators: 400
Best Learning Rate: 0.01
Best Max Depth: 3
Accuracy: 0.59
F1: 0.57
Precision: 0.58
Recall: 0.59


In [13]:
best_hp = grid_search.best_params_
best_hp

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 400}

In [14]:

confusion_matrix(y_val, y_pred)

array([[  0,   0,   1,   1,   0,   0],
       [  0,   1,   4,   2,   1,   0],
       [  0,   0, 123,  43,   2,   1],
       [  0,   0,  50,  94,  14,   0],
       [  0,   0,   8,  37,  24,   0],
       [  0,   0,   2,   3,   1,   0]])

In [17]:
class_min = y_train.min()
class_min

3

In [13]:
# Change the number of weak learner trees
# Create a Gradient Boosting Regressor
n_classes = len(y_train.unique())
model = GradientBoostingClassifier('n_classes_' = n_classes)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [100, 200, 400],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_gb = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_gb.predict(X_val)

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (3989848082.py, line 4)