In [1]:
# Import Dependencies
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load Dataset
dataset = pd.read_csv("winequality-white.csv", sep=";")
dataset['target'] = np.where(dataset['quality']<=5,0,1)

# Assign X and y
X = dataset.drop(["quality", "target"], axis=1)
y= dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
# Pre-processing PowerTransformer Scaler
X_scaler= PowerTransformer().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_train_scaled, y_train)

# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_

# List Feature Importances, sorted
feature_importances = pd.DataFrame(rf.feature_importances_,  index = X_train.columns, columns = ['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
alcohol,0.14589
volatile acidity,0.121581
density,0.102013
free sulfur dioxide,0.096532
total sulfur dioxide,0.088359
chlorides,0.082691
residual sugar,0.082024
citric acid,0.075858
pH,0.075033
fixed acidity,0.066072


In [5]:
predictions = rf.predict(X_test_scaled)
print(predictions)
print(rf.classes_)

[1 1 1 ... 1 1 1]
[0 1]


In [7]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8326530612244898


In [9]:
from sklearn.metrics import classification_report

In [10]:
print(classification_report(y_test, predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.77      0.69      0.73       397
           1       0.86      0.90      0.88       828

    accuracy                           0.83      1225
   macro avg       0.81      0.80      0.80      1225
weighted avg       0.83      0.83      0.83      1225



In [12]:
X2 = dataset.drop(["quality", "target", 'chlorides', 'residual sugar', 'citric acid', 'pH', 'fixed acidity', 'sulphates'], axis=1)
y2= dataset.target
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=42)

In [13]:
from sklearn.model_selection import GridSearchCV

In [20]:
# Pre-processing PowerTransformer Scaler
X_scaler2= PowerTransformer().fit(X_train2)
X_train_scaled2 = X_scaler2.transform(X_train2)
X_test_scaled2 = X_scaler2.transform(X_test2)

# Create a random forest classifier
rf2 = RandomForestClassifier()

param_grid = {'n_estimators':[50, 100, 250, 500],
              'criterion': ['entropy', 'gini'],
              'max_depth': [125, 150, 175]}

grid = GridSearchCV(rf, param_grid, verbose =3)

In [21]:
grid.fit(X_train_scaled2, y_train2)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.803, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.820, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.800, total=   0.2s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............
[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.819, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............
[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.796, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.811, total=   0.6s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.819, total=   0.5s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.808, total=   0.5s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.8

[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.805, total=   2.4s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.824, total=   2.3s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.812, total=   2.6s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.815, total=   3.0s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.804, total=   2.9s
[CV] criterion=gini, max_depth=125, n_estimators=50 ..................
[CV]  criterion=gini, max_depth=125, n_estimators=50, score=0.816, total=   0.3s
[CV] criterion=gini, max_depth=125, n_estimators=50 ..................
[CV]  criterion=gini, max_depth=125, n_estimators=50, score=0.839, t

[CV]  criterion=gini, max_depth=175, n_estimators=100, score=0.800, total=   0.5s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.811, total=   1.0s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.819, total=   1.0s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.812, total=   1.0s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.811, total=   1.2s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.792, total=   1.0s
[CV] criterion=gini, max_depth=175, n_estimators=500 .................
[CV]  criterion=gini, max_depth=175, n_estimators=500, score=0.803, total=   2.0s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  2.2min finished


GridSearchCV(estimator=RandomForestClassifier(n_estimators=200),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [125, 150, 175],
                         'n_estimators': [50, 100, 250, 500]},
             verbose=3)

In [22]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'gini', 'max_depth': 125, 'n_estimators': 50}
0.8151305862944633


In [24]:
rf3 = RandomForestClassifier(criterion= 'gini', max_depth= 125, n_estimators=50)

In [25]:
rf3.fit(X_train_scaled2, y_train2)

RandomForestClassifier(max_depth=125, n_estimators=50)

In [29]:
print(f"Training Data Score: {rf3.score(X_train_scaled2, y_train2)}")
print(f"Testing Data Score: {rf3.score(X_test_scaled2, y_test2)}")

Training Data Score: 0.999455485978764
Testing Data Score: 0.8163265306122449


In [27]:
predictions = rf3.predict(X_test_scaled2)
print(predictions)
print(rf3.classes_)

[1 1 1 ... 1 1 1]
[0 1]


In [28]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.74      0.68      0.70       397
           1       0.85      0.88      0.87       828

    accuracy                           0.82      1225
   macro avg       0.79      0.78      0.79      1225
weighted avg       0.81      0.82      0.81      1225

