<span style="font-size: 40px; color: red">White Wine Quality Prediction using regression model</span>

<span style="font-size: 40px; color: orange">Objective:</span>
* Develop a machine learning model that predicts the quality of wine based on its chemical attributes.


<span style="font-size: 30px; color: green">Import Libraries</span>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.feature_selection import RFECV

import joblib

<span style="font-size: 30px; color: green">Load the Dataset</span>

In [2]:
data = pd.read_csv("winequality-white.csv", sep=";")

<span style="font-size: 30px; color: green">Data Preprocessing</span>

<span style="font-size: 20px; color: blue">Handling Missing data</span>

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


<span style="font-size: 20px; color: blue">Feature Scaling</span>

In [4]:
scaler = StandardScaler()

data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

In [5]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.172097,-0.08177,0.21328,2.821349,-0.035355,0.569932,0.744565,2.331512,-1.246921,-0.349184,-1.393152,0.13787
1,-0.657501,0.215896,0.048001,-0.944765,0.147747,-1.253019,-0.149685,-0.009154,0.740029,0.001342,-0.824276,0.13787
2,1.475751,0.017452,0.543838,0.100282,0.193523,-0.312141,-0.973336,0.358665,0.475102,-0.436816,-0.336667,0.13787
3,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.01148,-0.787342,-0.499203,0.13787
4,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.01148,-0.787342,-0.499203,0.13787


<span style="font-size: 30px; color: green">Data Splitting</span>

In [6]:
X = data.drop(["quality"], axis=1)
y = data["quality"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<span style="font-size: 30px; color: green">Model Training</span>

In [7]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "support Vector Regression": SVR(),
    "Decision Tree Regression": DecisionTreeRegressor(),
    "Random Forest Regression": RandomForestRegressor()
}

def evaluate_model(model, features, target):
    predictions = model.predict(features)
    mse = mean_squared_error(target, predictions)
    rmse = mse ** 0.5
    mae = mean_absolute_error(target, predictions)
    r2 = r2_score(target, predictions)
    return mse, rmse, mae, r2

for name, model in models.items():
    model.fit(X_train, y_train)
    mse, rmse, mae, r2 = evaluate_model(model, X_test, y_test)
    
    print(f"{name}:")
    print(f"   R^2 Score: {r2:.4f}")
    print(f"   MSE: {mse:.4f}")
    print(f"   RMSE: {rmse:.4f}")
    print(f"   MAE: {mae:.4f}")

Linear Regression:
   R^2 Score: 0.2653
   MSE: 0.7256
   RMSE: 0.8518
   MAE: 0.6620
Ridge Regression:
   R^2 Score: 0.2652
   MSE: 0.7257
   RMSE: 0.8519
   MAE: 0.6621
Lasso Regression:
   R^2 Score: -0.0014
   MSE: 0.9890
   RMSE: 0.9945
   MAE: 0.7622
support Vector Regression:
   R^2 Score: 0.3900
   MSE: 0.6024
   RMSE: 0.7761
   MAE: 0.5832
Decision Tree Regression:
   R^2 Score: 0.0777
   MSE: 0.9109
   RMSE: 0.9544
   MAE: 0.5646
Random Forest Regression:
   R^2 Score: 0.5563
   MSE: 0.4382
   RMSE: 0.6620
   MAE: 0.4695


<span style="font-size: 30px; color: green">Model Tuning</span>

In [8]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the Random Forest Regressor
rf_model = RandomForestRegressor()

# Grid search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the best model
best_rf_model = grid_search.best_estimator_
best_rf_model_score = best_rf_model.score(X_test, y_test)
print("Best Model R^2 Score:", best_rf_model_score)

Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Model R^2 Score: 0.5583264520980464


<span style="font-size: 30px; color: green">Model Evaluation</span>

In [9]:
best_model = RandomForestRegressor(n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_depth=20)

# Fit the model
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate evaluation scores
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 0.4737882718192688
Mean Squared Error: 0.44034798959644994
Root Mean Squared Error: 0.663587213255688
R-squared: 0.5541240662030628


<span style="font-size: 30px; color: green">Saving the model</span>

In [10]:
joblib.dump(best_model, "white_best_model.pkl")

['white_best_model.pkl']