<span style="font-size: 40px; color: red">Wine Quality Prediction using regression model</span>

<span style="font-size: 40px; color: orange">Objective:</span>
* Develop a machine learning model that predicts the quality of wine based on its chemical attributes.

* Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal.
* The goal is to model wine quality based on physicochemical tests
(see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).



<span style="font-size: 30px; color: green">Import Libraries</span>

In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.feature_selection import RFECV

<span style="font-size: 30px; color: green">Load the Datasets</span>

In [2]:
# Red wine dataset
red_df = pd.read_csv("winequality-red.csv", sep=";")

# white Wine dataset
white_df = pd.read_csv("winequality-white.csv", sep=";")

<span style="font-size: 30px; color: green">Data Preprocessing</span>

<span style="font-size: 20px; color: blue">Handling Missing data</span>

In [3]:
red_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
white_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


<span style="font-size: 20px; color: blue">Feature Scaling</span>

In [5]:
scaler = StandardScaler()
# red wine
red_df = pd.DataFrame(scaler.fit_transform(red_df), columns=red_df.columns)
# white wine
white_df = pd.DataFrame(scaler.fit_transform(white_df), columns=white_df.columns)

<span style="font-size: 30px; color: green">Data Splitting</span>

In [6]:
red_df.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,-0.787823
1,-0.298547,1.967442,-1.391472,0.043416,0.223875,0.872638,0.624363,0.028261,-0.719933,0.12895,-0.584777,-0.787823
2,-0.298547,1.297065,-1.18607,-0.169427,0.096353,-0.083669,0.229047,0.134264,-0.331177,-0.048089,-0.584777,-0.787823


In [7]:
# Red wine
X_red = red_df.drop(["quality"], axis=1)
y_red = red_df["quality"]
X_trainr, X_testr, y_trainr, y_testr = train_test_split(X_red, y_red, test_size=0.2, random_state=42)

In [8]:
# White wine
X_wh = white_df.drop(["quality"], axis=1)
y_wh = white_df["quality"]
X_trainw, X_testw, y_trainw, y_testw = train_test_split(X_wh, y_wh, test_size=0.2, random_state=42)

<span style="font-size: 30px; color: green">Model Training and Evaluation</span>

In [9]:
# Red wine
r_models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "support Vector Regression": SVR(),
    "Decision Tree Regression": DecisionTreeRegressor(),
    "Random Forest Regression": RandomForestRegressor()
}

def evaluate_model(model, features, target):
    predictions = model.predict(features)
    mse = mean_squared_error(target, predictions)
    rmse = mse ** 0.5
    mae = mean_absolute_error(target, predictions)
    r2 = r2_score(target, predictions)
    return mse, rmse, mae, r2

for name, model in r_models.items():
    model.fit(X_trainr, y_trainr)
    mse, rmse, mae, r2 = evaluate_model(model, X_testr, y_testr)
    
    print(f"{name}:")
    print(f"   R^2 Score: {r2:.4f}")
    print(f"   MSE: {mse:.4f}")
    print(f"   RMSE: {rmse:.4f}")
    print(f"   MAE: {mae:.4f}")

# MSE closer to 0 means perfect prediction
# MAE Closer to 0 means perfect prediction of the model
# R Squared when score is 1, it indicates that the model's predictions perfectly match the actual values in the dataset

Linear Regression:
   R^2 Score: 0.4032
   MSE: 0.5984
   RMSE: 0.7736
   MAE: 0.6237
Ridge Regression:
   R^2 Score: 0.4032
   MSE: 0.5984
   RMSE: 0.7736
   MAE: 0.6237
Lasso Regression:
   R^2 Score: -0.0056
   MSE: 1.0083
   RMSE: 1.0041
   MAE: 0.8488
support Vector Regression:
   R^2 Score: 0.4584
   MSE: 0.5430
   RMSE: 0.7369
   MAE: 0.5640
Decision Tree Regression:
   R^2 Score: 0.0532
   MSE: 0.9494
   RMSE: 0.9743
   MAE: 0.5729
Random Forest Regression:
   R^2 Score: 0.5289
   MSE: 0.4724
   RMSE: 0.6873
   MAE: 0.5328


In [10]:
# White wine
w_models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "support Vector Regression": SVR(),
    "Decision Tree Regression": DecisionTreeRegressor(),
    "Random Forest Regression": RandomForestRegressor()
}

def evaluate_model(model, features, target):
    predictions = model.predict(features)
    mse = mean_squared_error(target, predictions)
    rmse = mse ** 0.5
    mae = mean_absolute_error(target, predictions)
    r2 = r2_score(target, predictions)
    return mse, rmse, mae, r2

for name, model in w_models.items():
    model.fit(X_trainw, y_trainw)
    mse, rmse, mae, r2 = evaluate_model(model, X_testw, y_testw)
    
    print(f"{name}:")
    print(f"   R^2 Score: {r2:.4f}")
    print(f"   MSE: {mse:.4f}")
    print(f"   RMSE: {rmse:.4f}")
    print(f"   MAE: {mae:.4f}")

Linear Regression:
   R^2 Score: 0.2653
   MSE: 0.7256
   RMSE: 0.8518
   MAE: 0.6620
Ridge Regression:
   R^2 Score: 0.2652
   MSE: 0.7257
   RMSE: 0.8519
   MAE: 0.6621
Lasso Regression:
   R^2 Score: -0.0014
   MSE: 0.9890
   RMSE: 0.9945
   MAE: 0.7622
support Vector Regression:
   R^2 Score: 0.3900
   MSE: 0.6024
   RMSE: 0.7761
   MAE: 0.5832
Decision Tree Regression:
   R^2 Score: 0.1396
   MSE: 0.8497
   RMSE: 0.9218
   MAE: 0.5519
Random Forest Regression:
   R^2 Score: 0.5534
   MSE: 0.4411
   RMSE: 0.6641
   MAE: 0.4698


<span style="font-size: 30px; color: green">Model Tuning</span>

In [11]:
# Random Forest Regression

# Grid Search for Red Wine

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the Random Forest Regressor
rf_model = RandomForestRegressor()

# Grid search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_trainr, y_trainr)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the best model
best_rf_model = grid_search.best_estimator_
best_rf_model_score = best_rf_model.score(X_testr, y_testr)
print("Best Model R^2 Score:", best_rf_model_score)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Model R^2 Score: 0.530580433060865


In [13]:
# Random Forest Regression

# Randomised Search for White wine

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)

# Randomized Search with cross-validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, cv=5, scoring='neg_mean_squared_error', n_iter=10, random_state=42)
random_search.fit(X_trainw, y_trainw)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the best model
best_rf_model = random_search.best_estimator_
best_rf_model_score = best_rf_model.score(X_testw, y_testw)
print("Best Model R^2 Score:", best_rf_model_score)

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
Best Model R^2 Score: 0.5520738958112622


<span style="font-size: 20px; color: blue">Feature Selection</span>

In [16]:
# Instantiate the best model for Red Wine

best_red_model = RandomForestRegressor(n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_depth=None)

selector = RFECV(best_red_model, step=1, cv=5, scoring='accuracy')
selector.fit(X_trainr, y_trainr)

# Get the selected features and their ranks
red_sfeatures = X_trainr.columns[selctor.support_]
feature_ranks = selector.ranking_

print(f"Selected features: {red_sfeatures}")
print(f"Feature ranks: {feature_ranks}")

ValueError: continuous is not supported

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score

# Instantiate the best model from Step 4 (e.g., Random Forests)
best_model = RandomForestClassifier(n_estimators=200, max_depth=30, min_samples_split=10, min_samples_leaf=2)

# Create the RFECV object and fit it to the training data
selector = RFECV(best_model, step=1, cv=5, scoring='accuracy')
selector.fit(X_train, y_train)

# Get the selected features and their ranks
selected_features = X_train.columns[selector.support_]
feature_ranks = selector.ranking_

print(f"Selected features: {selected_features}")
print(f"Feature ranks: {feature_ranks}")

<span style="font-size: 30px; color: green">Saving the model</span>