In [3]:
# First import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

print("Libraries imported successfully!")

# Now download the datasets
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv

print("\nDatasets downloaded successfully!")

Libraries imported successfully!
--2025-07-09 11:22:56--  https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘winequality-red.csv.1’

winequality-red.csv     [ <=>                ]  82.23K  --.-KB/s    in 0.1s    

2025-07-09 11:22:57 (560 KB/s) - ‘winequality-red.csv.1’ saved [84199]

--2025-07-09 11:22:57--  https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘winequality-white.csv.1’

winequality-white.c     [ <=>                ] 258.2

In [4]:
# Read both CSV files; their fields are separated by semicolons rather than commas
df_red = pd.read_csv('winequality-red.csv', sep=';')
df_white = pd.read_csv('winequality-white.csv', sep=';')

# Report the (rows, columns) shape of each frame
print("Red wine dataset shape:", df_red.shape)
print("White wine dataset shape:", df_white.shape)

# Continue with the red wines only (build from df_white instead to analyse white wines).
# The binary target quality_class is 1 when quality is above 6 and 0 otherwise.
# assign() returns a new frame, so df_red itself is left unmodified.
df = df_red.assign(quality_class=df_red['quality'].gt(6).astype(int))

# Count how many rows fall into each class
print("\nClass distribution:")
class_counts = df['quality_class'].value_counts()
print(class_counts)

# Target is the derived label; features are every column except the raw
# quality score and the label derived from it
y = df['quality_class']
X = df.drop(columns=['quality', 'quality_class'])

Red wine dataset shape: (1599, 12)
White wine dataset shape: (4898, 12)

Class distribution:
quality_class
0    1382
1     217
Name: count, dtype: int64


In [5]:
# Hold out 30% of the rows for testing. The split is stratified on y and uses a
# fixed seed so that it is repeatable from run to run.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the partition sizes and the class proportions within the training partition
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)
print("\nClass distribution in training set:")
train_class_share = y_train.value_counts(normalize=True)
print(train_class_share)

Training set size: (1119, 11)
Testing set size: (480, 11)

Class distribution in training set:
quality_class
0    0.864164
1    0.135836
Name: proportion, dtype: float64


In [6]:
# Build a 200-tree Random Forest with a fixed seed and fit it on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Predict the test split, then compute overall accuracy and the per-class report
y_pred_rf = rf_model.predict(X_test)
rf_accuracy_pct = accuracy_score(y_test, y_pred_rf) * 100
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest Accuracy: {:.2f}%".format(rf_accuracy_pct))
print("\nClassification Report:\n", rf_report)

Random Forest Accuracy: 93.12%

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96       415
           1       0.92      0.54      0.68        65

    accuracy                           0.93       480
   macro avg       0.93      0.77      0.82       480
weighted avg       0.93      0.93      0.92       480



In [7]:
# Build a Gradient Boosting classifier (200 stages, learning rate 0.1, fixed seed)
# and fit it on the training split; fit() returns the estimator itself
gb_settings = dict(n_estimators=200, learning_rate=0.1, random_state=42)
gb_model = GradientBoostingClassifier(**gb_settings).fit(X_train, y_train)

# Predict the test split, then compute overall accuracy and the per-class report
y_pred_gb = gb_model.predict(X_test)
gb_accuracy_pct = accuracy_score(y_test, y_pred_gb) * 100
gb_report = classification_report(y_test, y_pred_gb)

print("Gradient Boosting Accuracy: {:.2f}%".format(gb_accuracy_pct))
print("\nClassification Report:\n", gb_report)

Gradient Boosting Accuracy: 92.50%

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       415
           1       0.81      0.58      0.68        65

    accuracy                           0.93       480
   macro avg       0.87      0.78      0.82       480
weighted avg       0.92      0.93      0.92       480



In [9]:
# Feature importance: pair each feature name with the fitted Random Forest's
# importance score and order the table from most to least important
importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("Top 5 Important Features:")
display(importances.head(5))

# Sample prediction: compare the model's answer with the true label for one test row
sample_idx = 10  # Try different indices (negative values count from the end)

# Select the row with a list indexer so the result is always a one-row DataFrame.
# A slice such as sample_idx:sample_idx+1 would be empty for sample_idx = -1
# (it turns into -1:0), handing predict() zero rows even though
# y_test.iloc[-1] is a valid label.
sample = X_test.iloc[[sample_idx]]
true_label = "Good" if y_test.iloc[sample_idx] == 1 else "Average"
pred_label = "Good" if rf_model.predict(sample)[0] == 1 else "Average"

print("\nSample Prediction:")
print("True quality class:", true_label)
print("Predicted quality class:", pred_label)
print("\nSample features:")
display(sample)

Top 5 Important Features:


Unnamed: 0,Feature,Importance
10,alcohol,0.173436
9,sulphates,0.107883
1,volatile acidity,0.104493
7,density,0.097152
2,citric acid,0.092555



Sample Prediction:
True quality class: Average
Predicted quality class: Average

Sample features:


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1215,8.8,0.27,0.46,2.1,0.095,20.0,29.0,0.99488,3.26,0.56,11.3



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

