In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [2]:
red = pd.read_csv('data/red_cleaned_final.csv')
white = pd.read_csv('data/white_cleaned_final.csv')

In [3]:
red = red.drop(['Unnamed: 0'], axis = 1)
white = white.drop(['Unnamed: 0'], axis = 1)

# X - y Split

In [4]:
X_red = red.drop('quality', axis=1)
y_red = red.quality

X_white = white.drop('quality', axis=1)
y_white = white.quality

# Train - Test Split

In [5]:
X_red_train, X_red_test, y_red_train, y_red_test = train_test_split(X_red, y_red, test_size=0.3, random_state=100)
X_red_train = pd.DataFrame(X_red_train)
X_red_test = pd.DataFrame(X_red_test)

X_white_train, X_white_test, y_white_train, y_white_test = train_test_split(X_white, y_white, test_size=0.3, random_state=100)
X_white_train = pd.DataFrame(X_white_train)
X_white_test = pd.DataFrame(X_white_test)

# Transform and scale

## Powertransformer and Standard Scaler

## X_train

In [6]:
pt = PowerTransformer(method='yeo-johnson', standardize=True)
ss = StandardScaler()

In [7]:
# transform X_train
X_red_train_pt_f_t = pt.fit_transform(X_red_train)
X_red_train_ss_f_t = ss.fit_transform(X_red_train)

X_white_train_pt_f_t = pt.fit_transform(X_white_train)

In [8]:
# replace columns X_train for red wine
X_red_train_pt_f_t[6] = X_red_train_ss_f_t[6]
X_red_train_pt_f_t[7] = X_red_train_ss_f_t[7]

In [9]:
# make a dataframe for X_train after replacing columns
X_red_train = pd.DataFrame(X_red_train_pt_f_t)

X_white_train = pd.DataFrame(X_white_train_pt_f_t)

## X_test

In [10]:
# transform X_test
X_red_test_pt_t = pt.transform(X_red_test)
X_red_test_ss_t = ss.transform(X_red_test)

X_white_test_pt_t = pt.transform(X_white_test)

Feature names must be in the same order as they were in fit.



In [11]:
# replace columns X_test
X_red_test_pt_t[6] = X_red_test_ss_t[6]
X_red_test_pt_t[7] = X_red_test_ss_t[7]

In [12]:
# make a dataframe for X_train after replacing columns
X_red_test = pd.DataFrame(X_red_test_pt_t)

X_white_test = pd.DataFrame(X_white_test_pt_t)

In [13]:
# rename the columns
def rename_columns(x):
    x.rename(columns={0:'fixed_acidity',1:'volatile_acidity', 2:"citric_acid", 3:'residual_sugar', 4: 'chlorides', 5:"free_sulfur_dioxide", 6:"total_sulfur_dioxide", 7:'density', 8:'ph', 9:'sulphates', 10:'alcohol'}, inplace=True )
    return x


rename_columns(X_red_train)
rename_columns(X_red_test)


rename_columns(X_white_train)
rename_columns(X_white_test)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
0,0.254240,-0.666072,-0.129363,0.396020,0.351910,-0.236444,-0.638189,4.857226e-17,0.478087,0.237044,0.279754
1,-1.018817,-0.813096,0.056662,-1.369254,-0.283638,0.709513,-0.196987,-6.730727e-16,0.789431,-0.250138,0.879700
2,-0.882033,-2.131060,-0.711345,-0.292686,-0.188313,-1.747916,-1.686359,-5.551115e-17,2.063334,0.975378,-0.326867
3,0.254240,0.624005,-0.319275,0.676918,-1.797673,0.139925,-0.244964,-4.336809e-16,-0.256638,1.386898,1.632489
4,1.143198,0.216040,1.419946,-1.228239,0.351910,-0.369272,0.178485,-1.387779e-17,-0.688708,0.975378,-0.234900
...,...,...,...,...,...,...,...,...,...,...,...
1134,-1.018817,-2.672051,-0.812026,-1.369254,-0.094543,-1.278558,-1.803421,-2.012279e-16,2.405434,1.118957,-0.144787
1135,-1.739520,-1.291650,-0.913813,-0.434807,-0.283638,-1.747916,-0.638189,-2.567391e-16,0.604241,0.237044,-0.420744
1136,0.711333,-0.012120,0.933013,0.396020,-2.170552,-0.721350,-0.220945,-1.804112e-16,-0.398084,0.667008,0.809487
1137,1.651249,-0.813096,1.419946,-0.982829,-0.188313,0.079268,0.782793,2.775558e-17,-0.990044,-1.174412,-0.326867


In [14]:
X_red_train.round(decimals=3)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
0,-0.171,1.532,0.644,-0.260,2.157,1.334,-0.236,0.469,-0.035,-1.459,0.540
1,-0.374,0.076,0.027,-1.205,-0.130,-0.841,-1.034,-0.339,0.134,0.725,1.256
2,-0.237,0.620,1.678,0.306,1.027,0.304,0.626,0.204,-0.938,-1.257,-0.962
3,0.637,-0.280,0.644,0.657,0.493,1.099,1.394,-1.115,-1.176,-0.122,-1.117
4,2.047,0.594,2.362,0.253,-0.894,0.204,3.000,-1.703,0.783,1.804,-0.283
...,...,...,...,...,...,...,...,...,...,...,...
884,-0.042,0.410,-0.375,1.786,-1.923,-2.194,-1.143,-0.617,-0.405,-0.513,0.860
885,-0.374,0.076,-0.607,0.359,-0.130,-0.059,0.084,0.469,-0.035,-0.513,-0.962
886,2.769,0.944,1.620,1.058,-1.098,0.238,3.537,-2.623,0.444,2.166,0.860
887,-0.899,1.359,1.090,1.140,-0.712,-0.649,-0.078,0.923,-0.608,-1.528,0.452


In [16]:
X_red_test.round(decimals=3)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
0,4.877,0.893,10.329,-2.442,5.919,1.883,-4.412,0.0,-100.408,0.144,-0.056
1,-0.231,2.247,9.294,-2.410,5.919,1.610,-4.412,0.0,-113.336,-9.973,0.738
2,2.917,1.420,9.008,-2.450,5.919,-1.193,-4.412,0.0,-102.138,-0.148,1.338
3,4.349,1.548,9.008,-2.473,5.919,-1.279,-4.412,0.0,-115.349,1.322,-0.235
4,0.136,0.976,9.832,-2.413,5.919,-1.193,-4.412,0.0,-130.779,-3.008,-0.327
...,...,...,...,...,...,...,...,...,...,...,...
376,0.822,1.420,10.563,-2.461,5.919,-0.506,-4.412,0.0,-177.286,-1.863,-0.056
377,1.350,2.951,8.709,-2.440,5.919,1.610,-4.412,0.0,-163.891,-2.656,-0.614
378,0.486,1.210,8.068,-2.464,5.919,1.468,-4.412,0.0,-95.431,-1.863,-0.517
379,3.162,1.781,10.563,-2.407,5.919,-0.044,-4.412,0.0,-140.778,-3.191,-0.421


# Model

In [17]:
X_red_train_ct = sm.add_constant(X_red_train.to_numpy()) # adding a constant 
X_red_test_ct = sm.add_constant(X_red_test.to_numpy())

In [21]:
model_red = sm.OLS(y_red_train, X_red_train_ct).fit()

In [22]:
pred_red_train = model_red.predict(X_red_train_ct)

In [23]:
pred_red_test = model_red.predict(X_red_test_ct)

In [24]:
print_model = model_red.summary()

In [25]:
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                quality   R-squared:                       0.384
Model:                            OLS   Adj. R-squared:                  0.377
Method:                 Least Squares   F-statistic:                     49.76
Date:                Tue, 18 Apr 2023   Prob (F-statistic):           9.10e-85
Time:                        16:35:02   Log-Likelihood:                -881.87
No. Observations:                 889   AIC:                             1788.
Df Residuals:                     877   BIC:                             1845.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.6321      0.022    255.624      0.0

In [20]:
lr = LinearRegression()
lr.fit(X_red_train, y_red_train)

LinearRegression()

In [26]:
lr.coef_

array([ 0.0971995 , -0.1930177 ,  0.03206899, -0.05098525,  0.00625452,
       -0.0338283 , -0.1304828 , -0.06962065,  0.19202102, -0.08048368,
        0.26665231])

In [27]:
lr.intercept_

5.6321457225721225