In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [2]:
red = pd.read_csv('data/red_cleaned_final.csv')
white = pd.read_csv('data/white_cleaned_final.csv')

In [3]:
red = red.drop(['Unnamed: 0'], axis = 1)
white = white.drop(['Unnamed: 0'], axis = 1)

# X - y Split

In [4]:
X_red = red.drop('quality', axis=1)
y_red = red.quality

X_white = white.drop('quality', axis=1)
y_white = white.quality

# Train - Test Split

In [5]:
X_red_train, X_red_test, y_red_train, y_red_test = train_test_split(X_red, y_red, test_size=0.3, random_state=100)
X_red_train = pd.DataFrame(X_red_train)
X_red_test = pd.DataFrame(X_red_test)

X_white_train, X_white_test, y_white_train, y_white_test = train_test_split(X_white, y_white, test_size=0.3, random_state=100)
X_white_train = pd.DataFrame(X_white_train)
X_white_test = pd.DataFrame(X_white_test)

# Transform and scale

## Powertransformer and Standard Scaler

## X_train

In [6]:
pt = PowerTransformer(method='yeo-johnson', standardize=True)
ss = StandardScaler()

In [7]:
# transform X_train
X_red_train_pt_f_t = pt.fit_transform(X_red_train)
X_red_train_ss_f_t = ss.fit_transform(X_red_train)

X_white_train_pt_f_t = pt.fit_transform(X_white_train)

In [8]:
# replace columns X_train for red wine
X_red_train_pt_f_t[6] = X_red_train_ss_f_t[6]
X_red_train_pt_f_t[7] = X_red_train_ss_f_t[7]

In [9]:
# make a dataframe for X_train after replacing columns
X_red_train = pd.DataFrame(X_red_train_pt_f_t)

X_white_train = pd.DataFrame(X_white_train_pt_f_t)

## X_test

In [10]:
# transform X_test
X_red_test_pt_t = pt.transform(X_red_test)
X_red_test_ss_t = ss.transform(X_red_test)

X_white_test_pt_t = pt.transform(X_white_test)

Feature names must be in the same order as they were in fit.



In [11]:
# replace columns X_test
X_red_test_pt_t[6] = X_red_test_ss_t[6]
X_red_test_pt_t[7] = X_red_test_ss_t[7]

In [12]:
# make a dataframe for X_train after replacing columns
X_red_test = pd.DataFrame(X_red_test_pt_t)

X_white_test = pd.DataFrame(X_white_test_pt_t)

In [13]:
# rename the columns
def rename_columns(x):
    x.rename(columns={0:'fixed_acidity',1:'volatile_acidity', 2:"citric_acid", 3:'residual_sugar', 4: 'chlorides', 5:"free_sulfur_dioxide", 6:"total_sulfur_dioxide", 7:'density', 8:'ph', 9:'sulphates', 10:'alcohol'}, inplace=True )
    return x


rename_columns(X_red_train)
rename_columns(X_red_test)


rename_columns(X_white_train)
rename_columns(X_white_test)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
0,0.254240,-0.666072,-0.129363,0.396020,0.351910,-0.236444,-0.638189,4.857226e-17,0.478087,0.237044,0.279754
1,-1.018817,-0.813096,0.056662,-1.369254,-0.283638,0.709513,-0.196987,-6.730727e-16,0.789431,-0.250138,0.879700
2,-0.882033,-2.131060,-0.711345,-0.292686,-0.188313,-1.747916,-1.686359,-5.551115e-17,2.063334,0.975378,-0.326867
3,0.254240,0.624005,-0.319275,0.676918,-1.797673,0.139925,-0.244964,-4.336809e-16,-0.256638,1.386898,1.632489
4,1.143198,0.216040,1.419946,-1.228239,0.351910,-0.369272,0.178485,-1.387779e-17,-0.688708,0.975378,-0.234900
...,...,...,...,...,...,...,...,...,...,...,...
1134,-1.018817,-2.672051,-0.812026,-1.369254,-0.094543,-1.278558,-1.803421,-2.012279e-16,2.405434,1.118957,-0.144787
1135,-1.739520,-1.291650,-0.913813,-0.434807,-0.283638,-1.747916,-0.638189,-2.567391e-16,0.604241,0.237044,-0.420744
1136,0.711333,-0.012120,0.933013,0.396020,-2.170552,-0.721350,-0.220945,-1.804112e-16,-0.398084,0.667008,0.809487
1137,1.651249,-0.813096,1.419946,-0.982829,-0.188313,0.079268,0.782793,2.775558e-17,-0.990044,-1.174412,-0.326867
