In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [2]:
# Load the cleaned red and white wine tables (paths are relative to the notebook).
red, white = (
    pd.read_csv('data/red_cleaned_final.csv'),
    pd.read_csv('data/white_cleaned_final.csv'),
)

In [3]:
# Drop the 'Unnamed: 0' column (presumably the index saved with the CSV — confirm).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
red = red.drop(columns=['Unnamed: 0'], errors='ignore')
white = white.drop(columns=['Unnamed: 0'], errors='ignore')

# X - y Split

In [4]:
# Separate the target column ('quality') from the predictors for each wine type.
y_red, y_white = red['quality'], white['quality']

X_red = red.drop(columns='quality')
X_white = white.drop(columns='quality')

# Train - Test Split

In [5]:
# 70/30 split of the red-wine data; the fixed random_state keeps it reproducible.
X_red_train, X_red_test, y_red_train, y_red_test = train_test_split(
    X_red, y_red, test_size=0.3, random_state=100
)
# Wrap both feature splits in DataFrames.
X_red_train, X_red_test = pd.DataFrame(X_red_train), pd.DataFrame(X_red_test)

In [None]:
# 70/30 split of the white-wine data; the fixed random_state keeps it reproducible.
X_white_train, X_white_test, y_white_train, y_white_test = train_test_split(
    X_white, y_white, test_size=0.3, random_state=100
)
# Wrap both feature splits in DataFrames.
X_white_train, X_white_test = pd.DataFrame(X_white_train), pd.DataFrame(X_white_test)

# Transform and scale

## PowerTransformer and StandardScaler

## X_train

In [6]:
pt = PowerTransformer(method='yeo-johnson', standardize=True)
ss = StandardScaler()

In [7]:
# transform X_train
X_red_train_pt_f_t = pt.fit_transform(X_red_train)

In [None]:
X_white_train_pt_f_t = pt.fit_transform(X_white_train)

In [8]:
# make a dataframe for X_train after replacing columns
X_red_train = pd.DataFrame(X_red_train_pt_f_t)

In [None]:
X_white_train = pd.DataFrame(X_white_train_pt_f_t)

# y_train

In [9]:
y_red_train = pd.DataFrame(y_red_train)

In [10]:
y_red_train_pt_f_t = pt.fit_transform(y_red_train)

In [24]:
y_red_train = pd.DataFrame(y_red_train_pt_f_t)

In [25]:
y_red_train

Unnamed: 0,quality
0,-0.748486
1,0.465427
2,-0.748486
3,-0.748486
4,1.602111
...,...
884,0.465427
885,1.602111
886,1.602111
887,-0.748486


## X_test

In [None]:
# transform X_test
X_red_test_pt_t = pt.transform(X_red_test)

In [None]:
X_white_test_pt_t = pt.transform(X_white_test)

In [None]:
# make a dataframe for X_train after replacing columns
X_red_test = pd.DataFrame(X_red_test_pt_t)

In [None]:
X_white_test = pd.DataFrame(X_white_test_pt_t)

# y_test

In [None]:
y_red_test = pd.DataFrame(y_red_test)

In [None]:
y_red_test_pt_f_t = pt.fit_transform(y_red_test)

## Rename columns

In [11]:
def rename_columns(x):
    """Label the positional columns 0-10 of ``x`` with the wine feature names.

    The rename is applied in place and the same DataFrame object is returned,
    so the call works both as a statement and as a cell's displayed value.
    Columns whose labels are not in 0-10 are left untouched.
    """
    feature_names = [
        'fixed_acidity',
        'volatile_acidity',
        'citric_acid',
        'residual_sugar',
        'chlorides',
        'free_sulfur_dioxide',
        'total_sulfur_dioxide',
        'density',
        'ph',
        'sulphates',
        'alcohol',
    ]
    # Position in the list is the integer column label being replaced.
    x.rename(columns=dict(enumerate(feature_names)), inplace=True)
    return x

In [19]:
def rename_columns_y(x):
    """Rename the column labelled 0 of ``x`` to 'quality'.

    The rename is applied in place and the same DataFrame object is returned.
    A frame without a column labelled 0 is returned unchanged.
    """
    target_label = {0: 'quality'}
    x.rename(columns=target_label, inplace=True)
    return x

In [12]:
# Put the feature names back on the transformed red training frame.
# rename_columns works in place; its return value is what this cell displays.
rename_columns(X_red_train)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
0,-0.171108,1.532116,0.644498,-0.259667,2.156996,1.334259,-0.235780,0.469226,-0.034549,-1.458802,0.540050
1,-0.373911,0.076375,0.027364,-1.205047,-0.130326,-0.840992,-1.034269,-0.338509,0.134460,0.725316,1.256374
2,-0.237370,0.620289,1.677730,0.306365,1.026776,0.303805,0.625935,0.204339,-0.937600,-1.256574,-0.961858
3,0.637484,-0.280198,0.644498,0.656507,0.493257,1.098564,1.393805,-1.114835,-1.175525,-0.121991,-1.116999
4,2.047002,0.594459,2.362246,0.253390,-0.894472,0.203698,3.000014,-1.702509,0.783085,1.804242,-0.282828
...,...,...,...,...,...,...,...,...,...,...,...
884,-0.042397,0.409761,-0.374677,1.785561,-1.923265,-2.194028,-1.143042,-0.616698,-0.404953,-0.513463,0.860492
885,-0.373911,0.076375,-0.606628,0.358579,-0.130326,-0.058551,0.084213,0.469226,-0.034549,-0.513463,-0.961858
886,2.768638,0.944159,1.619879,1.057887,-1.098232,0.237787,3.537427,-2.623072,0.443773,2.165553,0.860492
887,-0.898842,1.359026,1.090177,1.140283,-0.712476,-0.648968,-0.078461,0.922703,-0.608019,-1.527844,0.451551


In [26]:
# Label the transformed red training target as 'quality' (renamed in place).
rename_columns_y(y_red_train)

Unnamed: 0,quality
0,-0.748486
1,0.465427
2,-0.748486
3,-0.748486
4,1.602111
...,...
884,0.465427
885,1.602111
886,1.602111
887,-0.748486


In [None]:
# Put the feature names back on the red test frame (renamed in place).
rename_columns(X_red_test)

In [None]:
# Put the feature names back on both white-wine frames (renamed in place).
# Only the last call's return value is displayed by the cell.
rename_columns(X_white_train)
rename_columns(X_white_test)

# Round

In [13]:
# Display only: round() returns a new frame and the result is not assigned,
# so X_red_train itself keeps its full precision.
X_red_train.round(3)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
0,-0.171,1.532,0.644,-0.260,2.157,1.334,-0.236,0.469,-0.035,-1.459,0.540
1,-0.374,0.076,0.027,-1.205,-0.130,-0.841,-1.034,-0.339,0.134,0.725,1.256
2,-0.237,0.620,1.678,0.306,1.027,0.304,0.626,0.204,-0.938,-1.257,-0.962
3,0.637,-0.280,0.644,0.657,0.493,1.099,1.394,-1.115,-1.176,-0.122,-1.117
4,2.047,0.594,2.362,0.253,-0.894,0.204,3.000,-1.703,0.783,1.804,-0.283
...,...,...,...,...,...,...,...,...,...,...,...
884,-0.042,0.410,-0.375,1.786,-1.923,-2.194,-1.143,-0.617,-0.405,-0.513,0.860
885,-0.374,0.076,-0.607,0.359,-0.130,-0.059,0.084,0.469,-0.035,-0.513,-0.962
886,2.769,0.944,1.620,1.058,-1.098,0.238,3.537,-2.623,0.444,2.166,0.860
887,-0.899,1.359,1.090,1.140,-0.712,-0.649,-0.078,0.923,-0.608,-1.528,0.452


In [27]:
# Display only: round() returns a new frame and the result is not assigned,
# so y_red_train itself keeps its full precision.
y_red_train.round(3)

Unnamed: 0,quality
0,-0.748
1,0.465
2,-0.748
3,-0.748
4,1.602
...,...
884,0.465
885,1.602
886,1.602
887,-0.748


In [None]:
# Display only: round() returns a new frame and the result is not assigned,
# so X_red_test itself keeps its full precision.
X_red_test.round(3)

# Model

In [28]:
# Build the design matrices for statsmodels: convert each feature frame to a
# NumPy array and add a constant (intercept) column to it.
X_red_train_ct, X_red_test_ct = (
    sm.add_constant(features.to_numpy())
    for features in (X_red_train, X_red_test)
)

In [29]:
# Ordinary least squares: transformed quality regressed on the constant-augmented features.
model_red = sm.OLS(endog=y_red_train, exog=X_red_train_ct).fit()

In [30]:
# In-sample predictions (on the transformed quality scale).
pred_red_train = model_red.predict(exog=X_red_train_ct)

In [31]:
# Predictions for the red test design matrix (on the transformed quality scale).
pred_red_test = model_red.predict(exog=X_red_test_ct)

In [32]:
# Build the regression summary table for the fitted red-wine model.
print_model = model_red.summary()

In [33]:
# Print the plain-text rendering of the summary.
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                quality   R-squared:                       0.377
Model:                            OLS   Adj. R-squared:                  0.370
Method:                 Least Squares   F-statistic:                     48.35
Date:                Tue, 18 Apr 2023   Prob (F-statistic):           1.04e-82
Time:                        17:32:08   Log-Likelihood:                -1050.7
No. Observations:                 889   AIC:                             2125.
Df Residuals:                     877   BIC:                             2183.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -8.041e-14      0.027  -3.02e-12      1.0

In [None]:
# Fit scikit-learn's LinearRegression on the same transformed red training data
# (presumably as a cross-check of the statsmodels fit — confirm).
lr = LinearRegression()
lr.fit(X_red_train, y_red_train)

In [None]:
# Fitted coefficients of the scikit-learn model.
lr.coef_

In [None]:
# Fitted intercept of the scikit-learn model.
lr.intercept_