In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler

# NOTE: this cell performs the same preprocessing as the following cell (In [3]);
# the only difference is that In [3] also defines the target variable `y`.

# Read the two semicolon-separated CSVs and tag each frame with its wine type
# (0 = red, 1 = white) as a new trailing column.
red_wine = pd.read_csv('winequality-red.csv', delimiter=';').assign(wine_type=0)
white_wine = pd.read_csv('winequality-white.csv', delimiter=';').assign(wine_type=1)

# Stack red on top of white and renumber the rows 0..n-1
wine_data = pd.concat([red_wine, white_wine], ignore_index=True)

# Strip leading/trailing whitespace from every column name
wine_data.columns = wine_data.columns.str.strip()

# Collapse the quality score into three classes:
#   0 (Bad)     -> score <= 3
#   1 (Average) -> 3 < score <= 5.5
#   2 (Good)    -> everything else
wine_data['quality'] = wine_data['quality'].map(
    lambda score: 0 if score <= 3 else (1 if score <= 5.5 else 2)
)

# The six feature columns used by the models
top_6_features = [
    'volatile acidity',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'sulphates',
    'alcohol',
    'residual sugar',
]
X = wine_data.loc[:, top_6_features]

# Fit the scaler on the selected features, then apply it to the same data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Persist the fitted scaler so it can be reloaded later
joblib.dump(scaler, 'scaler.pkl')

print("Data preprocessing completed. Scaler saved as 'scaler.pkl'.")


Data preprocessing completed. Scaler saved as 'scaler.pkl'.


In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler

# Load the datasets (both files are semicolon-separated)
red_wine = pd.read_csv('winequality-red.csv', delimiter=';')
white_wine = pd.read_csv('winequality-white.csv', delimiter=';')

# Add a column for wine type
red_wine['wine_type'] = 0  # 0 for Red wine
white_wine['wine_type'] = 1  # 1 for White wine

# Combine datasets; ignore_index gives the combined frame a fresh 0..n-1 index
wine_data = pd.concat([red_wine, white_wine], ignore_index=True)

# Clean column names (remove leading/trailing spaces)
wine_data.columns = wine_data.columns.str.strip()

# Refuse to bin missing scores: a chained `<=` comparison is False for NaN on
# every branch, which would silently label a missing score as 2 (Good).
if wine_data['quality'].isna().any():
    raise ValueError("'quality' contains missing values; clean or drop them before binning.")

# Transform quality into categories (Bad=0, Average=1, Good=2):
#   (-inf, 3]   -> 0
#   (3, 5.5]    -> 1
#   (5.5, +inf) -> 2
# pd.cut uses right-closed intervals by default and, with labels=False,
# returns the integer index of the bin each score falls into.
wine_data['quality'] = pd.cut(
    wine_data['quality'],
    bins=[-np.inf, 3, 5.5, np.inf],
    labels=False,
).astype('int64')

# Select top 6 important features
top_6_features = ['volatile acidity', 'free sulfur dioxide', 'total sulfur dioxide', 'sulphates', 'alcohol', 'residual sugar']
X = wine_data[top_6_features]
y = wine_data['quality']  # Target variable

# Standardize features.
# NOTE(review): the scaler is fit on the full dataset, i.e. before any
# train/test split is made from X_scaled — confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler for later use
joblib.dump(scaler, 'scaler.pkl')

print("Data preprocessing completed. Scaler saved as 'scaler.pkl'.")


✅ Data preprocessing completed. Scaler saved as 'scaler.pkl'.


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Split data into training and testing sets (80% train, 20% test).
# stratify=y keeps the class proportions of the target in both splits.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Define RandomForest model (fixed seed for reproducible trees)
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Hyperparameter tuning using GridSearchCV: 3 * 4 * 3 * 3 = 108 candidates
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best model from GridSearch. With the default refit=True, GridSearchCV has
# already refit this estimator on the whole training split, so fitting it
# again would only repeat the same work.
best_rf = grid_search.best_estimator_

# Save the trained model
joblib.dump(best_rf, 'wine_quality_model.pkl')

# Model Evaluation on the held-out test split
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f" Model trained successfully with Accuracy: {accuracy:.4f}")
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an undefined-metric warning for each affected metric.
# NOTE(review): in the recorded run class 0 has only 6 test rows and zero
# recall — consider class weighting or a macro-averaged scoring metric.
print(" Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
✅ Model trained successfully with Accuracy: 0.8292
📜 Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.79      0.73      0.76       471
           2       0.85      0.89      0.87       823

    accuracy                           0.83      1300
   macro avg       0.55      0.54      0.54      1300
weighted avg       0.82      0.83      0.83      1300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
# Target for the wine-type model; the features are the same scaled matrix
# (X_scaled) used for the quality model.
y_type = wine_data['wine_type']  # 0 for Red, 1 for White

# Split data into training and testing sets (80% train, 20% test).
# stratify=y_type keeps the red/white ratio the same in both splits.
X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(X_scaled, y_type, test_size=0.2, random_state=42, stratify=y_type)

# Define RandomForest model for Wine Type (fixed seed for reproducible trees)
rf_model_type = RandomForestClassifier(random_state=42, n_jobs=-1)

# Hyperparameter tuning using GridSearchCV: 3 * 4 * 3 * 3 = 108 candidates
param_grid_type = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_type = GridSearchCV(rf_model_type, param_grid_type, cv=5, n_jobs=-1, verbose=1)
grid_search_type.fit(X_train_type, y_train_type)

# Best model from GridSearch for Wine Type. With the default refit=True,
# GridSearchCV has already refit this estimator on the whole training split,
# so a second .fit() call is unnecessary.
best_rf_type = grid_search_type.best_estimator_

# Save the trained wine type model
joblib.dump(best_rf_type, 'wine_type_model.pkl')

# Model Evaluation for Wine Type on the held-out test split
y_pred_type = best_rf_type.predict(X_test_type)
accuracy_type = accuracy_score(y_test_type, y_pred_type)

print(f" Wine Type Model trained successfully with Accuracy: {accuracy_type:.4f}")
print(" Classification Report for Wine Type:\n", classification_report(y_test_type, y_pred_type))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
✅ Wine Type Model trained successfully with Accuracy: 0.9908
📜 Classification Report for Wine Type:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       320
           1       1.00      0.99      0.99       980

    accuracy                           0.99      1300
   macro avg       0.99      0.99      0.99      1300
weighted avg       0.99      0.99      0.99      1300

