In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
data = pd.read_csv('wine+quality/winequality-white.csv', sep=';')

In [4]:
# Define quality classes based on the quality rating
data['quality_label'] = pd.cut(data['quality'], bins=[2, 5, 6, 8], labels=['low', 'medium', 'high'])

In [5]:
# Remove rows with missing values in 'quality_label'
data_cleaned = data.dropna(subset=['quality_label'])

In [6]:
# Prepare the data by separating features and the target label
X = data_cleaned.drop(['quality', 'quality_label'], axis=1)
y = data_cleaned['quality_label']

In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Standardize the features to normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Initialize and train the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

In [10]:
# Predict and evaluate the model performance
y_pred = rf_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.78      0.57      0.66       206
         low       0.75      0.74      0.74       319
      medium       0.68      0.78      0.73       454

    accuracy                           0.72       979
   macro avg       0.74      0.69      0.71       979
weighted avg       0.73      0.72      0.72       979

[[118   8  80]
 [  2 235  82]
 [ 31  71 352]]


In [11]:
# Extract and print feature importances
feature_importances = pd.DataFrame(rf_model.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)

                      importance
alcohol                 0.125554
density                 0.109070
volatile acidity        0.103936
free sulfur dioxide     0.096721
residual sugar          0.089469
total sulfur dioxide    0.088168
chlorides               0.081294
pH                      0.080525
citric acid             0.079304
sulphates               0.074732
fixed acidity           0.071226
