In [11]:
# library for data analysis
import pandas as pd

# library to clean (pre-porcess data)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt


# library to create model (classification algorithm)
from sklearn.ensemble import RandomForestClassifier

# library to evaluate models
from sklearn.metrics import  classification_report, accuracy_score


from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
wine = pd.read_csv('winequality-red.csv')

In [3]:
# The quality of a wine is based on a note from 3 to 7

wine.quality.value_counts()

quality
5    681
6    638
7    199
4     53
8     18
3     10
Name: count, dtype: int64

In [4]:
# CLEAN DATA
# Make quality colum just good or bad wines 
# More than 6 is a good wine

new_bins = (2, 6, 8) # 2 bins , split at 6 and goes until 8
category =['BAD','GOOD']

wine['quality']= pd.cut(wine['quality'], bins= new_bins, labels = category)
wine['quality'].unique()

['BAD', 'GOOD']
Categories (2, object): ['BAD' < 'GOOD']

In [5]:
# Make BAD=0  GOOD=1

label_quality = LabelEncoder()

wine['quality']=label_quality.fit_transform(wine['quality'])

In [6]:
# Split data into train and test

X = wine.drop('quality', axis=1)
y = wine['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size= 0.2)

In [7]:
# Scaling to improve the result

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)


In [9]:
# Create and train model

# Random Forest Algorithm,  Good for medium size data 

rfc_model = RandomForestClassifier(n_estimators=252) # 200 trees in the forest: 88.75 , 255 = 89.37

# Fit the model = train the model

rfc_model.fit(X_train,y_train)

# Generate Predictions

pred_rfc = rfc_model.predict(X_test)

In [10]:
# How accurate is our model ? 

print(accuracy_score(y_test,pred_rfc)* 100)

89.6875


In [12]:
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize the RandomForestClassifier
rfc = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

print("Best parameters found: ", best_params)

# Use the best parameters to initialize the model
rfc_best = RandomForestClassifier(**best_params)

# Fit the model
rfc_best.fit(X_train, y_train)

# Evaluate the model
accuracy = rfc_best.score(X_test, y_test)
print("Accuracy with best parameters: ", accuracy)

Fitting 5 folds for each of 810 candidates, totalling 4050 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=

1350 fits failed out of a total of 4050.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1075 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.frame

Best parameters found:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Accuracy with best parameters:  0.890625


In [26]:
best_params = {
    'max_depth': 20,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 400
}

# Create a new RandomForestClassifier with the best parameters
rfc_model = RandomForestClassifier(**best_params)

# Fit the model with your training data
rfc_model.fit(X_train, y_train)

# Evaluate the model with your test data
accuracy = rfc_model.score(X_test, y_test)
print(accuracy *100)


90.625
