### Import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, LogisticRegression
from IPython.display import display, Markdown
from sklearn import metrics 
import statsmodels.api as sm
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

### Get the dataset

In [2]:
dataset = pd.read_csv('wine.csv', sep=';')

### Describe data

In [3]:
dataset.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0
mean,8.62225,0.519317,0.293108,2.563333,0.089226,15.243333,46.883333,0.997055,3.299175,0.665675,10.384833,5.665
std,1.784124,0.179246,0.196851,1.264527,0.04831,10.206171,33.935027,0.001882,0.157441,0.175861,1.093145,0.809313
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.3,0.39,0.12,1.9,0.071,7.0,21.0,0.996,3.1975,0.56,9.5,5.0
50%,8.3,0.5,0.285,2.2,0.08,13.0,38.0,0.997015,3.3,0.62,10.0,6.0
75%,9.6,0.63,0.45,2.7,0.092,21.0,63.0,0.998173,3.39,0.7325,11.0,6.0
max,15.9,1.33,1.0,15.5,0.611,68.0,289.0,1.0032,4.01,2.0,14.9,8.0


## 1.a Linear Regression

### Get the X and y

In [4]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

### Split the training and testing sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### Fitting model

In [6]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

### Predict

In [7]:
train_predict = regressor.predict(X_train)
test_predict = regressor.predict(X_test)

### Print out intercept coefficient and coefficient

In [8]:
print('\nThe linear regression equation: y = ')
print(round(regressor.intercept_, 4), end=' ')
co = list(regressor.coef_)
for i in range(len(co)):
    print(f'+ {round(co[i], 4)} * {dataset.columns[i]}', end=' ')


The linear regression equation: y = 
32.184 + 0.0638 * fixed acidity + -1.1851 * volatile acidity + -0.4266 * citric acid + 0.0326 * residual sugar + -1.4253 * chlorides + 0.0025 * free sulfur dioxide + -0.0032 * total sulfur dioxide + -29.523 * density + -0.0397 * pH + 0.7318 * sulphates + 0.2799 * alcohol 

In [9]:
print('CVScore:', cross_val_score(regressor, X, y, cv=10).mean()*100.0)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, test_predict))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, test_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, test_predict)))

X_addC = sm.add_constant(X)
result = sm.OLS(y, X_addC).fit()
print('RSquared:', result.rsquared)
print('Adj RSquared:', result.rsquared_adj)
print('RSquared - Adj RSquared:', result.rsquared - result.rsquared_adj)

CVScore: 25.023743283113376
Mean Absolute Error: 0.5294533578492465
Root Mean Squared Error: 0.6671890058224141
RSquared: 0.3783638631118801
Adj RSquared: 0.3726079729555086
RSquared - Adj RSquared: 0.005755890156371457


## 1.b Cross Validation

In [10]:
from sklearn.model_selection import KFold 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score 
from sklearn import metrics 
import statsmodels.api as sm

### Get the intercept coeffiicient and coefficient

The cross_val_score() function will be used to perform the evaluation, taking the dataset and cross-validation configuration and returning a list of scores calculated for each fold.

So I use it to define which column will be the best to be a model.

In [11]:
CVS_scores = []

for col in range(11):
    print('\nCol:', dataset.columns[col])
    X = np.array(dataset.iloc[:, col]).reshape(-1, 1)
    y = dataset.iloc[:, -1]
    
#     scaler = MinMaxScaler(feature_range=(0, 1))
#     X = scaler.fit_transform(X)
    
    model = LinearRegression()
    CV = KFold(n_splits=10, random_state=42, shuffle=True)
    
    for train_idx, test_idx in CV.split(X):
        X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
        model.fit(X_train, y_train)
    
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    print('CVScore:', cross_val_score(model, X, y, cv = 10).mean()*100.0)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, test_pred))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, test_pred)))
    
    X_addC = sm.add_constant(X)
    result = sm.OLS(y, X_addC).fit()
    print('RSquared:', result.rsquared)
    print('Adj RSquared:', result.rsquared_adj)
    
    CVS_scores.append((dataset.columns[col], cross_val_score(model, X, y, cv = 10).mean()*100.0, model.intercept_, model.coef_[0]))
    
# first element: name of column
# second element: CVscore
# third element: intercept coefficient of model
# fouth element: coefficient of model

CVS_scores.sort(key=lambda tup: tup[1], reverse=True)
print()
print('The column we need for this question:')
display(Markdown(f'$$ {CVS_scores[0][0]} $$'))
print('The linear regression equation:')
display(Markdown(f'$$ {round(CVS_scores[0][2], 4)} + {round(CVS_scores[0][3], 4)} * {CVS_scores[0][0]} $$'))


Col: fixed acidity
CVScore: -14.226511972310002
Mean Absolute Error: 0.6250531809397132
Root Mean Squared Error: 0.7646037309998936
RSquared: 0.021924336795397514
Adj RSquared: 0.02110791303646209

Col: volatile acidity
CVScore: 0.9751672117924538
Mean Absolute Error: 0.5754427943009268
Root Mean Squared Error: 0.7247186617306015
RSquared: 0.14378690189385024
Adj RSquared: 0.14307219980861974

Col: citric acid
CVScore: -10.482005234790941
Mean Absolute Error: 0.6119285595377607
Root Mean Squared Error: 0.752539523414577
RSquared: 0.05173961897926804
Adj RSquared: 0.050948082768065395

Col: residual sugar
CVScore: -16.4323483298996
Mean Absolute Error: 0.6502538141809695
Root Mean Squared Error: 0.7743427792497886
RSquared: 0.002363951618359561
Adj RSquared: 0.001531200325887383

Col: chlorides
CVScore: -14.876868994712886
Mean Absolute Error: 0.6394283987542668
Root Mean Squared Error: 0.772407042272651
RSquared: 0.015563058669082475
Adj RSquared: 0.014741324995183547

Col: free sulfu

$$ alcohol $$

The linear regression equation:


$$ 1.8412 + 0.3689 * alcohol $$

### 1.c Build your own model

### Correlation
Taking features with correlation more than 0.05 as input x and quality as target variable y
From the correlation, it is clear that __alcohol__ in the top which means it affects the most with the quality of wine.

In [12]:
correlations = dataset.corr()['quality'].drop('quality')

def get_features(correlation_threshold):
    abs_corrs = correlations.abs()
    high_correlations = abs_corrs
    list((abs_corrs > correlation_threshold).index.values)
    return high_correlations

features = get_features(0.05).sort_values(0, False)
get_features_for_dataset = [x for index, x in enumerate(list(features.keys())) if index < 5]

for x in get_features_for_dataset:
    print(x, features[x])

alcohol 0.5052109662303698
volatile acidity 0.3791924338562807
citric acid 0.22746344536928917
total sulfur dioxide 0.22687625315570425
sulphates 0.21120072840790807


### Build model from correlation

In [13]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

#### Get the X and y, scaling data

In [14]:
XCorr = dataset[get_features_for_dataset]
yCorr = dataset.iloc[:, -1]

scalerCorr = MinMaxScaler(feature_range=(0, 1))
XCorr = scalerCorr.fit_transform(XCorr)

#### Build Model 1

In [15]:
modelCorr = LinearRegression()
CVCorr = KFold(n_splits= 10, random_state=42, shuffle=True)

for train_idx, test_idx in CVCorr.split(XCorr):
    XCorr_train, XCorr_test, yCorr_train, yCorr_test = XCorr[train_idx], XCorr[test_idx], yCorr[train_idx], yCorr[test_idx]
    modelCorr.fit(XCorr_train, yCorr_train)
    
trainCorr_pred = modelCorr.predict(XCorr_train)
testCorr_pred = modelCorr.predict(XCorr_test)

print('CVScore:', cross_val_score(modelCorr, XCorr, yCorr, cv=10).mean()*100.0)
print('Mean Absolute Error:', metrics.mean_absolute_error(yCorr_test, testCorr_pred))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, test_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(yCorr_test, testCorr_pred)))

X_addC = sm.add_constant(XCorr)
result = sm.OLS(yCorr, X_addC).fit()
print('RSquared:', result.rsquared)
print('Adj RSquared:', result.rsquared_adj)
print('RSquared - Adj RSquared:', result.rsquared - result.rsquared_adj)

print('\nThe linear regression equation: y = ')
print(round(modelCorr.intercept_, 4), end=' ')
co = list(modelCorr.coef_)
# print(co)
# print(get_features_for_dataset)
for i in range(len(co)):
    print(f'+ {round(co[i], 4)} * {get_features_for_dataset[i]}', end=' ')

CVScore: 24.320568221972387
Mean Absolute Error: 0.444628085300261
Root Mean Squared Error: 0.5788243593022208
RSquared: 0.3618527333960505
Adj RSquared: 0.35918042490943436
RSquared - Adj RSquared: 0.0026723084866161484

The linear regression equation: y = 
5.4879 + 1.9735 * alcohol + -1.4188 * volatile acidity + -0.048 * citric acid + -0.8897 * total sulfur dioxide + 0.9546 * sulphates 

### Build a model from 1.b
Furthermore, these columns are in the top 5 of the suitable one to build a model. So I decide to build a model and compare many values.

In [16]:
get_features_for_dataset = [x for index, x in enumerate(CVS_scores) if index < 5]
get_features_for_dataset = list(map(lambda x: x[0], get_features_for_dataset))

In [17]:
X1b = dataset[get_features_for_dataset]
y1b = dataset.iloc[:, -1]

scaler1b = MinMaxScaler(feature_range=(0, 1))
X1b = scaler1b.fit_transform(X1b)

####  Build model 2

In [18]:
model1b = LinearRegression()
CV1b = KFold(n_splits= 10, random_state=42, shuffle=True)

for train_idx, test_idx in CV1b.split(X1b):
    X1b_train, X1b_test, y1b_train, y1b_test = X1b[train_idx], X1b[test_idx], y1b[train_idx], y1b[test_idx]
    model1b.fit(X1b_train, y1b_train)
    
train1b_pred = model1b.predict(X1b_train)
test1b_pred = model1b.predict(X1b_test)

print('CVScore:', cross_val_score(model1b, X1b, y1b, cv=10).mean()*100.0)
print('Mean Absolute Error:', metrics.mean_absolute_error(y1b_test, test1b_pred))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, test_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y1b_test, test1b_pred)))

X_addC = sm.add_constant(X1b)
result = sm.OLS(y1b, X_addC).fit()
print('RSquared:', result.rsquared)
print('Adj RSquared:', result.rsquared_adj)
print('RSquared - Adj RSquared:', result.rsquared - result.rsquared_adj)

print('\nThe linear regression equation: y = ')
print(round(model1b.intercept_, 4), end=' ')
co = list(model1b.coef_)
# print(co)
# print(get_features_for_dataset)
for i in range(len(co)):
    print(f'+ {round(co[i], 4)} * {get_features_for_dataset[i]}', end=' ')

CVScore: 24.276187813358778
Mean Absolute Error: 0.4590994324974596
Root Mean Squared Error: 0.5895053935566336
RSquared: 0.35314046827743495
Adj RSquared: 0.35043167626854643
RSquared - Adj RSquared: 0.0027087920088885165

The linear regression equation: y = 
5.5524 + 2.0755 * alcohol + -1.5991 * volatile acidity + -0.6942 * total sulfur dioxide + -0.2936 * citric acid + 0.5628 * fixed acidity 

### Build model 3

In [19]:
get_features_for_dataset = [list(dataset.columns)[10], list(dataset.columns)[1], list(dataset.columns)[6], list(dataset.columns)[9]]

In [20]:
X4 = dataset[get_features_for_dataset]
y4 = dataset.iloc[:, -1]

scaler4 = MinMaxScaler(feature_range=(0, 1))
X4 = scaler4.fit_transform(X4)

In [21]:
model4 = LinearRegression()
CV4 = KFold(n_splits= 10, random_state=42, shuffle=True)

for train_idx, test_idx in CV4.split(X4):
    X4_train, X4_test, y4_train, y4_test = X4[train_idx], X4[test_idx], y4[train_idx], y4[test_idx]
    model4.fit(X4_train, y4_train)
    
train4_pred = model4.predict(X4_train)
test4_pred = model4.predict(X4_test)

print('CVScore:', cross_val_score(model4, X4, y4, cv=10).mean()*100.0)
print('Mean Absolute Error:', metrics.mean_absolute_error(y4_test, test4_pred))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, test_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y4_test, test4_pred)))

X_addC = sm.add_constant(X4)
result = sm.OLS(y4, X_addC).fit()
print('RSquared:', result.rsquared)
print('Adj RSquared:', result.rsquared_adj)
print('RSquared - Adj RSquared:', result.rsquared - result.rsquared_adj)

print('\nThe linear regression equation: y = ')
print(round(model4.intercept_, 4), end=' ')
co = list(model4.coef_)
# print(co)
# print(get_features_for_dataset)
for i in range(len(co)):
    print(f'+ {round(co[i], 4)} * {get_features_for_dataset[i]}', end=' ')

CVScore: 24.387365326606474
Mean Absolute Error: 0.44497654218663407
Root Mean Squared Error: 0.5796540951524313
RSquared: 0.3615747763884387
Adj RSquared: 0.35943778819224936
RSquared - Adj RSquared: 0.002136988196189349

The linear regression equation: y = 
5.468 + 1.9712 * alcohol + -1.3871 * volatile acidity + -0.8947 * total sulfur dioxide + 0.9393 * sulphates 