# ***Multiple Regression Analysis***

---
# Red Wine Quality Prediction by Zee

# **Import Library**

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split



# **Read Dataset**

In [5]:
# Source CSVs for the wine-quality data. This notebook analyses the red-wine file:
# the title, the recorded results and the comments further down all refer to red wine.
RED_WINE_URL = 'https://raw.githubusercontent.com/Lixxxuan/winequality/main/winequality-red.csv'
WHITE_WINE_URL = 'https://raw.githubusercontent.com/Lixxxuan/winequality/main/winequality-white.csv'
url = RED_WINE_URL  # switch to WHITE_WINE_URL to analyse the white-wine file instead
df = pd.read_csv(url)
# Peek at one randomly chosen row to confirm the file parsed into separate columns
df.sample()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
204,5.8,0.28,0.35,2.3,0.053,36.0,114.0,0.9924,3.28,0.5,10.2,4


# **Rename Columns**

In [None]:
# Title-case the column labels for plots and tables; pH keeps its usual spelling
column_labels = {
    'fixed acidity': 'Fixed Acidity',
    'volatile acidity': 'Volatile Acidity',
    'citric acid': 'Citric Acid',
    'residual sugar': 'Residual Sugar',
    'chlorides': 'Chlorides',
    'free sulfur dioxide': 'Free Sulfur Dioxide',
    'total sulfur dioxide': 'Total Sulfur Dioxide',
    'density': 'Density',
    'pH': 'pH',
    'sulphates': 'Sulphates',
    'alcohol': 'Alcohol',
    'quality': 'Quality',
}
df = df.rename(columns=column_labels)
df

# **Checking the Dataset for Null Values**

In [None]:
# Column dtypes and non-null counts
df.info()
# One flag per column: True where at least one value is missing
df.isna().any()

# **Exploratory Data Analysis (EDA)**

In [None]:
# Univariate analysis: summary statistics for each numeric column
summary_stats = df.describe()
summary_stats

In [None]:
# Box plot of every column, drawn on one shared axis
fig, ax = plt.subplots(figsize=(30, 15))
sns.boxplot(data=df, ax=ax)

In [None]:
# Second box plot that leaves out the two sulfur-dioxide columns
# (presumably because their larger values dominate the shared axis above)
zoom_columns = [
    'Fixed Acidity', 'Volatile Acidity', 'Citric Acid', 'Residual Sugar', 'Chlorides',
    'Density', 'pH', 'Sulphates', 'Alcohol', 'Quality',
]
zoom = df[zoom_columns]
fig, ax = plt.subplots(figsize=(30, 15))
sns.boxplot(data=zoom, ax=ax)

In [None]:
# Multivariate analysis: pairwise plots of all columns, coloured by Quality.
# pairplot is a figure-level function that builds its own figure, so no plt.figure()
# call is made first; it would only leave an empty extra figure in the cell output.
sns.pairplot(data=df, hue='Quality')
plt.show()

# **Heatmap**

In [None]:
# Correlation heatmap of every column pair, with the colour scale pinned to [-1, 1]
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_matrix, vmin=-1, vmax=1, cmap="coolwarm", annot=True, ax=ax)
plt.show()

In [None]:
# Alternative heatmap: the upper triangle (diagonal included) is masked so each pair appears once
corr_matrix = df.corr()
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))
# The Quality row shows how each feature correlates with the target
plt.figure(figsize=(20, 15))
sns.heatmap(corr_matrix, mask=upper_triangle, cmap="coolwarm", annot=True, annot_kws={'size': 14})
sns.set_style('whitegrid')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()
# Ranking recorded by the author, highest correlation with Quality first:
# Alcohol, Volatile Acidity, Sulphates, Citric Acid, Total Sulfur Dioxide, Density, Chlorides, Fixed Acidity, pH, Free Sulfur Dioxide, Residual Sugar
# NOTE(review): re-read this ranking from the heatmap whenever the dataset changes

# **Independent Variables & Dependent Variables**

In [None]:
# Predictors are selected by name, so neither the column order nor an extra column
# in the CSV can change what the model is trained on. The order below is also the
# order in which regr.predict() expects its inputs.
feature_columns = [
    'Fixed Acidity', 'Volatile Acidity', 'Citric Acid', 'Residual Sugar', 'Chlorides',
    'Free Sulfur Dioxide', 'Total Sulfur Dioxide', 'Density', 'pH', 'Sulphates', 'Alcohol',
]
# Plain NumPy arrays: x is 2-D (rows x 11 predictors), y is the 1-D Quality target
x = df[feature_columns].values
y = df['Quality'].values
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
# Fit a multiple linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
regr = LinearRegression().fit(x_train, y_train)
regr

# **Prediction**

In [None]:
# Predict the quality of one hand-written sample. Values are listed in this order:
# Fixed Acidity, Volatile Acidity, Citric Acid, Residual Sugar, Chlorides, Free Sulfur Dioxide,
# Total Sulfur Dioxide, Density, pH, Sulphates, Alcohol
sample_wine = [[15, 0.01, 0, 5, 0.001, 30, 50, 0.95, 3, 0.9, 15]]
predicted_quality = regr.predict(sample_wine)
print(predicted_quality)
# Result recorded by the author: Quality 9.82988592 (9 out of 10)

# **Backward Elimination (optional: run only if insignificant predictors should be removed)**

In [None]:
# Building the optimal model using Backward Elimination.
# sm.OLS does not add an intercept on its own, so a column of ones is prepended to x.
# sm.add_constant sizes that column from x itself (no hard-coded row count) and, with its
# default settings, leaves x unchanged when a constant column is already present, so
# re-running this cell does not stack a second intercept column.
x = sm.add_constant(x)
# Choose a significance level (usually 0.05). After each fit, drop the predictor with the
# highest p-value if that p-value is above the level, then refit.
# Start from the full model: column 0 is the intercept, every other column is a predictor.
x_opt = x[:, list(range(x.shape[1]))]
ols = sm.OLS(endog = y, exog = x_opt).fit()
ols.summary()

In [None]:
# Drop column 3 of x, the term reported as x3 in the previous summary (recorded p-value: 0.755).
# Column 0 is the intercept, so with the predictors in dataset order column 3 is the third
# predictor, Citric Acid (not Residual Sugar, which is column 4).
# Column 11 (Alcohol) stays in the model: it has not been eliminated.
# NOTE(review): the recorded p-value comes from an earlier run; confirm it against the summary above.
x_opt = x[:, [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11]]
ols = sm.OLS(endog = y, exog = x_opt).fit()
ols.summary()

In [None]:
# Also drop column 6 of x, the term reported as x5 in the previous summary (recorded p-value: 0.125).
# With column 3 already removed, the fifth remaining predictor is column 6 of x: with the
# predictors in dataset order that is Free Sulfur Dioxide (Total Sulfur Dioxide is column 7).
# Column 11 (Alcohol) stays in the model: it has not been eliminated.
# NOTE(review): the recorded p-value comes from an earlier run; confirm it against the summary above.
x_opt = x[:, [0, 1, 2, 4, 5, 7, 8, 9, 10, 11]]
ols = sm.OLS(endog = y, exog = x_opt).fit()
ols.summary()
# Predictors kept in this final model: Fixed Acidity, Volatile Acidity, Residual Sugar, Chlorides,
# Total Sulfur Dioxide, Density, pH, Sulphates & Alcohol.
# A p-value below the significance level marks a predictor as statistically significant for
# wine quality; read the size of its effect from the coefficient column of the summary.

# **Evaluation**

In [None]:
# Ordinary Least Squares (OLS) on the training split, fitted for the statsmodels summary table.
# The design matrix gets its own name so that x, which the backward-elimination cells
# index into, is not overwritten with training-only rows.
x_train_const = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train_const).fit()
print(model.summary())

In [None]:
# Score the fitted regression on the held-out test rows with mean squared error
y_pred = regr.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE): ", mse)

In [None]:
# List the distinct Quality scores that occur in the dataset
quality_levels = df['Quality'].unique()
print(quality_levels)

In [None]:
# MAPE
def mape(actual, pred):
    """Return the mean absolute percentage error between actual and pred.

    Both arguments are converted to NumPy arrays. Each error is taken
    relative to the matching actual value, and the mean of the absolute
    relative errors is returned as a percentage.
    """
    actual_arr = np.array(actual)
    pred_arr = np.array(pred)
    relative_error = (actual_arr - pred_arr) / actual_arr
    return np.abs(relative_error).mean() * 100

In [None]:
# MAPE of the test-set predictions, in percent
mape_percent = mape(y_test, y_pred)
mape_percent
# Result recorded by the author: an error of approximately 8-9% (Excellent)
# Rule of thumb used to read a MAPE value:
# 1. Below 10% = Excellent
# 2. 10% - 20% = Good
# 3. 20% - 50% = Reasonable
# 4. Above 50% = Inaccurate