<span style="font-size: 40px; color: red">Red Wine Quality Prediction using regression model</span>

<span style="font-size: 40px; color: orange">Objective:</span>
* Develop a machine learning model that predicts the quality of wine based on its chemical attributes.


<span style="font-size: 30px; color: green">Import Libraries</span>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.feature_selection import RFECV

import joblib

<span style="font-size: 30px; color: green">Load the Dataset</span>

In [2]:
data = pd.read_csv("winequality-red.csv", sep=";")

In [3]:
# Remove spaces and rename columns
data.columns = data.columns.str.replace(' ', '_')

In [4]:
data.columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [6]:
data.min()

fixed_acidity           4.60000
volatile_acidity        0.12000
citric_acid             0.00000
residual_sugar          0.90000
chlorides               0.01200
free_sulfur_dioxide     1.00000
total_sulfur_dioxide    6.00000
density                 0.99007
pH                      2.74000
sulphates               0.33000
alcohol                 8.40000
quality                 3.00000
dtype: float64

In [7]:
data.max()

fixed_acidity            15.90000
volatile_acidity          1.58000
citric_acid               1.00000
residual_sugar           15.50000
chlorides                 0.61100
free_sulfur_dioxide      72.00000
total_sulfur_dioxide    289.00000
density                   1.00369
pH                        4.01000
sulphates                 2.00000
alcohol                  14.90000
quality                   8.00000
dtype: float64

<span style="font-size: 30px; color: green">Data Preprocessing</span>

<span style="font-size: 20px; color: blue">Handling Missing data</span>

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1599 non-null   float64
 1   volatile_acidity      1599 non-null   float64
 2   citric_acid           1599 non-null   float64
 3   residual_sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free_sulfur_dioxide   1599 non-null   float64
 6   total_sulfur_dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1599 non-null   float64
 1   volatile_acidity      1599 non-null   float64
 2   citric_acid           1599 non-null   float64
 3   residual_sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free_sulfur_dioxide   1599 non-null   float64
 6   total_sulfur_dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


<span style="font-size: 20px; color: blue">Feature Scaling</span>

In [6]:
scaler = StandardScaler()

data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

In [7]:
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,-0.787823
1,-0.298547,1.967442,-1.391472,0.043416,0.223875,0.872638,0.624363,0.028261,-0.719933,0.12895,-0.584777,-0.787823
2,-0.298547,1.297065,-1.18607,-0.169427,0.096353,-0.083669,0.229047,0.134264,-0.331177,-0.048089,-0.584777,-0.787823
3,1.654856,-1.384443,1.484154,-0.453218,-0.26496,0.107592,0.4115,0.664277,-0.979104,-0.46118,-0.584777,0.450848
4,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,-0.787823


<span style="font-size: 30px; color: green">Data Splitting</span>

In [8]:
X = data.drop(["quality"], axis=1)
y = data["quality"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<span style="font-size: 30px; color: green">Model Training</span>

In [9]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "support Vector Regression": SVR(),
    "Decision Tree Regression": DecisionTreeRegressor(),
    "Random Forest Regression": RandomForestRegressor()
}

def evaluate_model(model, features, target):
    predictions = model.predict(features)
    mse = mean_squared_error(target, predictions)
    rmse = mse ** 0.5
    mae = mean_absolute_error(target, predictions)
    r2 = r2_score(target, predictions)
    return mse, rmse, mae, r2

for name, model in models.items():
    model.fit(X_train, y_train)
    mse, rmse, mae, r2 = evaluate_model(model, X_test, y_test)
    
    print(f"{name}:")
    print(f"   R^2 Score: {r2:.4f}")
    print(f"   MSE: {mse:.4f}")
    print(f"   RMSE: {rmse:.4f}")
    print(f"   MAE: {mae:.4f}")

# MSE closer to 0 means perfect prediction
# MAE Closer to 0 means perfect prediction of the model
# R Squared when score is 1, it indicates that the model's predictions perfectly match the actual values in the dataset

Linear Regression:
   R^2 Score: 0.4032
   MSE: 0.5984
   RMSE: 0.7736
   MAE: 0.6237
Ridge Regression:
   R^2 Score: 0.4032
   MSE: 0.5984
   RMSE: 0.7736
   MAE: 0.6237
Lasso Regression:
   R^2 Score: -0.0056
   MSE: 1.0083
   RMSE: 1.0041
   MAE: 0.8488
support Vector Regression:
   R^2 Score: 0.4584
   MSE: 0.5430
   RMSE: 0.7369
   MAE: 0.5640
Decision Tree Regression:
   R^2 Score: 0.0054
   MSE: 0.9973
   RMSE: 0.9986
   MAE: 0.5806
Random Forest Regression:
   R^2 Score: 0.5182
   MSE: 0.4831
   RMSE: 0.6951
   MAE: 0.5378


<span style="font-size: 30px; color: green">Model Tuning</span>

In [10]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the Random Forest Regressor
rf_model = RandomForestRegressor()

# Grid search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the best model
best_rf_model = grid_search.best_estimator_
best_rf_model_score = best_rf_model.score(X_test, y_test)
print("Best Model R^2 Score:", best_rf_model_score)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Model R^2 Score: 0.5178847263109134


<span style="font-size: 30px; color: green">Model Evaluation</span>

In [11]:
best_model = RandomForestRegressor(n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_depth=None)

# Fit the model
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate evaluation scores
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 0.5461131609013352
Mean Squared Error: 0.49347043668225954
Root Mean Squared Error: 0.702474509631673
R-squared: 0.5078474759045328


<span style="font-size: 30px; color: green">Saving the model</span>

In [12]:
joblib.dump(best_model, "red_best_model.pkl")

['red_best_model.pkl']