In [1]:
# Import Dependencies
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load Dataset
dataset = pd.read_csv("winequality-red.csv", sep=";")
dataset['target'] = np.where(dataset['quality']<=5,0,1)

# Assign X and y
X = dataset.drop(["quality", "target"], axis=1)
y= dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
# Standard Scaler

In [4]:
# Pre-processing Standard Scaler
X_scaler= StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier
rf = RandomForestClassifier()
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_train_scaled, y_train)

# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_

# List Feature Importances, sorted
feature_importances = pd.DataFrame(rf.feature_importances_,  index = X_train.columns, columns = ['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
alcohol,0.192708
sulphates,0.128387
volatile acidity,0.105322
total sulfur dioxide,0.104593
density,0.092455
chlorides,0.074051
fixed acidity,0.066087
pH,0.063401
citric acid,0.06253
free sulfur dioxide,0.055298


In [5]:
predictions = rf.predict(X_test_scaled)
print(predictions)
print(rf.classes_)

[0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 1 1 1 0 0 1
 1 0 1 0 0 1 0 1 1 1 1 0 1 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 1 1
 1 0 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1
 0 1 0 0 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1
 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 0 1 1 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0
 1 1 1 1 0 0 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1
 1 1 1 0 0 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1
 1 1 1 0 1 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 1 0 0 1
 0 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 0 0 1 1 0 1 1 1 0
 1 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 1 1 1 0 1 1 1 0 0 0 0 1 0 0 1 0 1 1 0 0 0
 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 0 1 1 1 1 0 0 1]
[0 1]


In [6]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8


In [7]:
from sklearn.metrics import classification_report

In [8]:
print(classification_report(y_test, predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77       178
           1       0.81      0.83      0.82       222

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400



In [9]:
# picked top 5 based on feature importance

In [10]:
X2 = dataset.drop(["quality", "target", 'chlorides', 'residual sugar', 'citric acid', 'pH', 'free sulfur dioxide', 'fixed acidity'], axis=1)
y2= dataset.target
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=42)

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
# Pre-processing PowerTransformer Scaler
X_scaler2= StandardScaler().fit(X_train2)
X_train_scaled2 = X_scaler2.transform(X_train2)
X_test_scaled2 = X_scaler2.transform(X_test2)

# Create a random forest classifier
rf2 = RandomForestClassifier()

param_grid = {'n_estimators':[50, 100, 250, 500],
              'criterion': ['entropy', 'gini'],
              'max_depth': [125, 150, 175]}

grid = GridSearchCV(rf2, param_grid, verbose =3)

In [13]:
grid.fit(X_train_scaled2, y_train2)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............
[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.796, total=   0.2s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.733, total=   0.2s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.838, total=   0.2s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............
[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.804, total=   0.1s
[CV] criterion=entropy, max_depth=125, n_estimators=50 ...............
[CV]  criterion=entropy, max_depth=125, n_estimators=50, score=0.845, total=   0.2s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.775, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.746, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.833, total=   0.3s
[CV] criterion=entropy, max_depth=125, n_estimators=100 ..............
[CV]  criterion=entropy, max_depth=125, n_estimators=100, score=0.8

[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.787, total=   1.1s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.738, total=   1.2s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.833, total=   1.3s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.808, total=   1.2s
[CV] criterion=entropy, max_depth=175, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=175, n_estimators=500, score=0.854, total=   1.2s
[CV] criterion=gini, max_depth=125, n_estimators=50 ..................
[CV]  criterion=gini, max_depth=125, n_estimators=50, score=0.771, total=   0.1s
[CV] criterion=gini, max_depth=125, n_estimators=50 ..................
[CV]  criterion=gini, max_depth=125, n_estimators=50, score=0.729, t

[CV]  criterion=gini, max_depth=175, n_estimators=100, score=0.858, total=   0.2s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.796, total=   0.5s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.738, total=   0.5s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.838, total=   0.5s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.800, total=   0.5s
[CV] criterion=gini, max_depth=175, n_estimators=250 .................
[CV]  criterion=gini, max_depth=175, n_estimators=250, score=0.858, total=   0.5s
[CV] criterion=gini, max_depth=175, n_estimators=500 .................
[CV]  criterion=gini, max_depth=175, n_estimators=500, score=0.787, total=   1.0s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.3min finished


GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [125, 150, 175],
                         'n_estimators': [50, 100, 250, 500]},
             verbose=3)

In [14]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'entropy', 'max_depth': 150, 'n_estimators': 50}
0.811544630404463


In [15]:
rf3 = RandomForestClassifier(criterion= 'gini', max_depth= 125, n_estimators=50)

In [16]:
rf3.fit(X_train_scaled2, y_train2)

RandomForestClassifier(max_depth=125, n_estimators=50)

In [17]:
print(f"Training Data Score: {rf3.score(X_train_scaled2, y_train2)}")
print(f"Testing Data Score: {rf3.score(X_test_scaled2, y_test2)}")

Training Data Score: 1.0
Testing Data Score: 0.78


In [18]:
predictions = rf3.predict(X_test_scaled2)
print(predictions)
print(rf3.classes_)

[0 0 1 0 1 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 0 1 0 1 1 0 0 0 1
 1 1 1 0 0 1 0 0 1 1 1 0 1 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1
 1 0 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 1 1 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1
 0 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1
 1 0 1 1 0 0 1 0 0 1 0 1 1 1 1 0 1 1 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 0 1 0
 1 1 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 1 1 0 1 0 1 1 0 1
 1 1 1 0 0 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 0 0 1 1
 1 1 0 0 1 1 1 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1
 0 1 1 1 0 0 1 0 1 0 1 1 0 1 0 0 0 1 0 1 0 0 1 0 1 1 1 0 0 0 1 1 0 1 1 1 0
 1 1 0 1 1 0 0 1 1 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0
 0 0 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0 1 1 1 0 0 1]
[0 1]


In [19]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.74      0.78      0.76       178
           1       0.81      0.78      0.80       222

    accuracy                           0.78       400
   macro avg       0.78      0.78      0.78       400
weighted avg       0.78      0.78      0.78       400



In [20]:
import joblib

In [21]:
filename = 'RandomForest_Red.sav'
joblib.dump(rf3, filename)

['RandomForest_Red.sav']

In [22]:
loaded_model= joblib.load(filename)

result=loaded_model.score(X_test_scaled2, y_test2)

print(result)

0.78
