In [6]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [7]:
# ID feature is already dropped from cleaning notebook
# Define the paths for all source files
cwd = os.getcwd()
orig_test_data_path = os.path.join(cwd,'wine_data','test.csv')
orig_train_data_path = os.path.join(cwd,'wine_data','train.csv')

# Open all source files as a pd dataframe
orig_test = pd.read_csv('https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/test.csv')
orig_train = pd.read_csv('https://project4-wine-quality-2023.s3.us-west-2.amazonaws.com/train.csv')
red_data = pd.read_csv('red_data_noID.csv')
white_data = pd.read_csv('white_data_noID.csv')
red_white_data = pd.read_csv('red_white_data_noID.csv')
trainable_data = pd.read_csv('trainable_data_noID.csv')

# Drop 'ID' columns
orig_test = orig_test.drop(columns='Id')
orig_train = orig_train.drop(columns='Id')

In [8]:
# Split the data into training and testing sets
X_train = orig_train.drop('quality', axis=1)
y_train = orig_train['quality'].copy()
X_test = orig_test.copy()
# y_test -= 3
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42) #because it is the answer :)

In [9]:
from warnings import filterwarnings
filterwarnings('ignore')

# Create a Gradient Boosting Regressor
model = GradientBoostingRegressor(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
    'loss': ['ls', 'lad', 'huber']
}

# Perform grid search cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3)
grid_search.fit(X_train, y_train)





Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV 1/5] END learning_rate=0.01, loss=ls, max_depth=3, n_estimators=50, subsample=0.8;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.01, loss=ls, max_depth=3, n_estimators=50, subsample=0.8;, score=nan total time=   0.0s
[CV 3/5] END learning_rate=0.01, loss=ls, max_depth=3, n_estimators=50, subsample=0.8;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.01, loss=ls, max_depth=3, n_estimators=50, subsample=0.8;, score=nan total time=   0.0s
[CV 5/5] END learning_rate=0.01, loss=ls, max_depth=3, n_estimators=50, subsample=0.8;, score=nan total time=   0.0s
[CV 1/5] END learning_rate=0.01, loss=ls, max_depth=3, n_estimators=50, subsample=1.0;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.01, loss=ls, max_depth=3, n_estimators=50, subsample=1.0;, score=nan total time=   0.0s
[CV 3/5] END learning_rate=0.01, loss=ls, max_depth=3, n_estimators=50, subsample=1.0;, score=nan total time=   0.0s
[

In [10]:
# Get the best parameters and the best model from the grid search 
best_params = grid_search.best_params_

# Train a model using the best parameters
best_model = GradientBoostingRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)

In [11]:
print(f"Best Parameters: {best_params}")

# Make predictions on the test set
y_pred = best_model.predict(X_val)

print(f"Training R-Squared: {best_model.score(X_train, y_train)}")
print(f"Testing R-Squared: {best_model.score(X_val, y_val)}")
print("Mean Squared Error:", mean_squared_error(y_val, y_val))

Best Parameters: {'learning_rate': 0.1, 'loss': 'huber', 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
Training R-Squared: 0.47472582044135037
Testing R-Squared: 0.2998169938754842
Mean Squared Error: 0.0
