In [1]:
# Import Dependencies
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load Dataset
dataset = pd.read_csv("winequality-red.csv", sep=";")
dataset['target'] = np.where(dataset['quality']<=5,0,1)

# Assign X and y
X = dataset.drop(["quality", "target"], axis=1)
y= dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
# Standard Scaler

In [4]:
# Pre-processing Standard Scaler
X_scaler= StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier
rf = RandomForestClassifier()
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_train_scaled, y_train)

# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_

# List Feature Importances, sorted
feature_importances = pd.DataFrame(rf.feature_importances_,  index = X_train.columns, columns = ['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
alcohol,0.197253
sulphates,0.128814
volatile acidity,0.106487
total sulfur dioxide,0.101907
density,0.091352
chlorides,0.074006
pH,0.063391
fixed acidity,0.063324
citric acid,0.061997
free sulfur dioxide,0.057015


In [5]:
predictions = rf.predict(X_test_scaled)
print(predictions)
print(rf.classes_)

[0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 1
 1 0 1 0 0 1 0 1 1 1 1 0 1 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 1 1
 1 0 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 1 1 1 0 1 0 1
 0 1 0 0 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1
 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0
 1 1 1 1 0 0 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1
 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 1 1 1 1 0 1 1 0 1 0 0 0 1 1
 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 1 0 0 1
 0 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 0 1 1 0 0 1 0 1 1 1 0 0 0 1 1 0 1 1 1 0
 1 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 1 1 1 0 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0
 0 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 0 1 1 1 1 0 0 1]
[0 1]


In [6]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8075


In [8]:
from sklearn.metrics import classification_report

In [9]:
print(classification_report(y_test, predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78       178
           1       0.83      0.83      0.83       222

    accuracy                           0.81       400
   macro avg       0.81      0.80      0.81       400
weighted avg       0.81      0.81      0.81       400



In [10]:
# picked top 5 based on feature importance

In [11]:
X2 = dataset.drop(["quality", "target", 'chlorides', 'residual sugar', 'citric acid', 'pH', 'free sulfur dioxide', 'fixed acidity'], axis=1)
y2= dataset.target
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=42)

In [12]:
from sklearn.model_selection import GridSearchCV

In [15]:
# Pre-processing PowerTransformer Scaler
X_scaler2= StandardScaler().fit(X_train2)
X_train_scaled2 = X_scaler2.transform(X_train2)
X_test_scaled2 = X_scaler2.transform(X_test2)

# Create a random forest classifier
rf2 = RandomForestClassifier()

param_grid = {'n_estimators':[50, 100, 250, 500],
              'criterion': ['entropy', 'gini'],
              'max_depth': [125, 150, 175]}

grid = GridSearchCV(rf2, param_grid, verbose =3)

In [16]:
grid.fit(X_train_scaled2, y_train2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............
[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.779, total=   0.1s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............
[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.754, total=   0.2s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............
[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.825, total=   0.1s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.787, total=   0.1s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............
[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.862, total=   0.1s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.787, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.729, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.825, total=   0.2s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.812, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.

[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.738, total=   1.2s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.825, total=   1.3s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.808, total=   1.3s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.854, total=   1.3s
[CV] criterion=gini, max_depth=125, n_estimators=50 ..................
[CV]  criterion=gini, max_depth=125, n_estimators=50, score=0.783, total=   0.1s
[CV] criterion=gini, max_depth=125, n_estimators=50 ..................
[CV]  criterion=gini, max_depth=125, n_estimators=50, score=0.758, total=   0.1s
[CV] criterion=gini, max_depth=125, n_estimators=50 ..................
[CV]  criterion=gini, max_depth=125, n_estimators=50, score=0.825, total

[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.783, total=   0.6s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.738, total=   0.7s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.829, total=   0.7s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.812, total=   0.7s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.858, total=   0.6s
[CV] criterion=gini, max_depth=175, n_estimators=500 .................
[CV]  criterion=gini, max_depth=175, n_estimators=500, score=0.779, total=   1.4s
[CV] criterion=gini, max_depth=175, n_estimators=500 .................
[CV]  criterion=gini, max_depth=175, n_estimators=500, score=0.742, total=   1.2s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.1min finished


GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [125, 150, 175],
                         'n_estimators': [50, 100, 250, 500]},
             verbose=3)

In [17]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'gini', 'max_depth': 125, 'n_estimators': 50}
0.8082287308228728


In [18]:
rf3 = RandomForestClassifier(criterion= 'gini', max_depth= 125, n_estimators=50)

In [19]:
rf3.fit(X_train_scaled2, y_train2)

RandomForestClassifier(max_depth=125, n_estimators=50)

In [20]:
print(f"Training Data Score: {rf3.score(X_train_scaled2, y_train2)}")
print(f"Testing Data Score: {rf3.score(X_test_scaled2, y_test2)}")

Training Data Score: 0.9991659716430359
Testing Data Score: 0.775


In [21]:
predictions = rf3.predict(X_test_scaled2)
print(predictions)
print(rf3.classes_)

[0 0 1 0 1 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 0 1 0 1 1 0 1 0 1
 1 1 1 0 0 1 0 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 0 1 1 1 0 1 0 1 1
 1 0 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1
 0 1 0 0 1 0 0 0 1 1 1 0 0 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1
 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 0 1 1 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0
 1 1 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 1 1 0 1 0 1 0 0 1
 1 1 1 0 0 1 0 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 1 0 1 1 1
 1 1 1 0 1 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1
 0 1 1 1 0 0 1 1 1 0 1 1 0 1 0 0 1 1 0 1 0 0 1 0 1 1 1 0 0 0 1 1 0 1 1 1 0
 1 1 0 1 1 0 0 1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0
 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0 1 1 1 1 0 1]
[0 1]


In [22]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.74      0.75      0.75       178
           1       0.80      0.79      0.80       222

    accuracy                           0.78       400
   macro avg       0.77      0.77      0.77       400
weighted avg       0.78      0.78      0.78       400

