**Import Libraries**

In [None]:
import pandas as pd

from plotly.offline import plot, iplot, init_notebook_mode
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
init_notebook_mode(connected=True)
import plotly.io as pio
import plotly.figure_factory as ff

import matplotlib.pyplot as plt
import numpy as np

from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
import seaborn as sns

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.svm import SVR

In [None]:
from matplotlib.colors import ListedColormap
from matplotlib.lines import Line2D
def PlotBoundary(clf,X,y,X_v,y_v,meta):
    """Draw a classifier's decision regions over two features with matplotlib.

    The classifier is evaluated on a regular grid spanning the training points
    (plus a margin); the grid is painted with light colours and the training
    and test/validation points are scattered on top in bold colours.

    Parameters
    ----------
    clf : object with a ``predict`` method, called on the (n_points, 2) grid.
    X, y : training points (first two columns are used) and their classes.
    X_v, y_v : test/validation points and their classes (drawn with a black edge).
    meta : value shown as the plot title.

    NOTE: a later cell of this notebook defines another ``PlotBoundary`` with a
    different signature; once that cell has run, this version is shadowed.
    """
    region_cmap = ListedColormap(['#FFAAAA', '#ABE3FF', '#ccffcc'])
    point_cmap = ListedColormap(['#ff575d','#57b1ff','#57ffa5'])

    step = 0.03
    margin_x, margin_y = 0.3, 0.5

    # Grid covering the training data plus a margin on each side.
    first, second = X[:, 0], X[:, 1]
    grid_x = np.arange(first.min() - margin_x, first.max() + margin_x, step)
    grid_y = np.arange(second.min() - margin_y, second.max() + margin_y, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    regions = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    plt.figure()
    plt.subplot(111)
    plt.pcolormesh(xx, yy, regions, cmap=region_cmap)

    marker_area = 40
    plt.scatter(first, second, s=marker_area, c=y, cmap=point_cmap, label='Train')
    plt.scatter(X_v[:, 0], X_v[:, 1], s=marker_area, c=y_v, cmap=point_cmap,
                edgecolors='black', linewidth=0.7, label='Test/Validation')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    # Hand-built legend entries: the scatter colours encode the class, so the
    # legend only distinguishes train from test/validation markers.
    train_handle = Line2D([0], [0], marker='o', color='w', label='Train',
                          markerfacecolor='#00FF00', markersize=10)
    test_handle = Line2D([0], [0], marker='o', color='w', label='Test/Validation',
                         markerfacecolor='#00FF00', markersize=8,
                         markeredgecolor='black', markeredgewidth=1)

    plt.legend(loc=(1.04,0.8), fontsize=12, handles=[train_handle, test_handle])
    plt.title(f"{meta}")

    plt.show()

**Load Datasets**

In [None]:
# Both wine-quality CSV files use ';' as the field separator.
white_df, red_df = (
    pd.read_csv(path, sep=';')
    for path in ('winequality-white.csv', 'winequality-red.csv')
)

**Data Exploration**

In [None]:
# (rows, columns) of each raw frame.
print('Length:', 'White:', white_df.shape, '--- --- ---', 'Red:', red_df.shape, sep='\n')

In [None]:
# Summary statistics of every column, white wines first.
print('Description:', 'White:', white_df.describe(), '--- --- ---',
      'Red:', red_df.describe(), sep='\n')

In [None]:
# Number of missing values per column.
print('Null Values:', 'White:', white_df.isnull().sum(), '--- --- ---',
      'Red:', red_df.isnull().sum(), sep='\n')

In [None]:
#An example of duplicates
# Rows at positions 3 and 4 of the white-wine frame, printed one after the other.
print(white_df.iloc[3], white_df.iloc[4], sep='\n')

In [None]:
# Report how many fully duplicated rows each frame holds, then drop them in
# place, keeping the first occurrence of each.
print('Duplicates:', 'White:', white_df.duplicated().sum(), '--- --- ---',
      'Red:', red_df.duplicated().sum(), sep='\n')
for frame in (white_df, red_df):
    frame.drop_duplicates(keep='first', inplace=True)

In [None]:
# Plot colour for each integer quality score from 0 to 10.
colors = {0:'#a9a9a9', 1:'#f032e6', 2:'#911eb4', 3:'#4363d8', 4:'#42d4f4', 5:'#3cb44b',6:'#bfef45',7:'#ffe119',8:'#f58231',9:'#e6194B', 10:'#469990'}
# Plot colour for each named quality category.
colors_cat = {'low':'red', 'medium':'green', 'high':'blue'}

def PieBarChart(df,col='quality',meta=''):
    """Show a pie chart and a bar chart of the value counts of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarise.
    col : str, default 'quality'
        Column to count. 'quality' is coloured with the module-level ``colors``
        palette; any other column is coloured with ``colors_cat``.
    meta : str, default ''
        Figure title and stem of the exported HTML file. When empty, ``col`` is
        used as the file stem so the export is not named just ``.html``.

    Side effects
    ------------
    Displays the figure and writes ``Images/<stem>.html`` (the ``Images``
    directory is created if it does not exist).
    """
    import os  # local import keeps this cell runnable on its own

    fallback_color = '#a9a9a9'

    counts = df[col].value_counts()
    # Categorical columns report unused categories with a count of 0; drop them.
    present = {label: n for label, n in counts.items() if n > 0}

    if col == 'quality':
        palette = colors
        labels = sorted(present)
    else:
        palette = colors_cat
        # Known categories keep the palette's order; anything else follows.
        labels = [label for label in colors_cat if label in present]
        labels += [label for label in present if label not in colors_cat]

    values = [present[label] for label in labels]
    # Look each colour up by its label. Pairing an alphabetically sorted label
    # list with the palette's own order would colour 'high' as 'low', etc.
    label_colors = [palette.get(label, fallback_color) for label in labels]

    fig = make_subplots(rows=1, cols=2, subplot_titles=['Pie Chart','Bar Chart'],
                        specs=[[{'type':'domain'}, {'type':'bar'}]])

    fig.add_trace(
        go.Pie(labels=labels, values=values, marker_colors=label_colors, sort=True,
               showlegend=False, hoverinfo='value', textinfo='label+percent'),
        row=1, col=1)
    fig.add_trace(
        go.Bar(x=labels, y=values, text=values, textposition='auto',
               marker_color=label_colors, showlegend=False),
        row=1, col=2)

    if col != 'quality':
        # Keep the categories in their natural order on the bar chart's x axis.
        fig.update_layout(barmode='stack',
                          xaxis={'categoryorder':'array', 'categoryarray':list(colors_cat)})

    fig.update_xaxes(title_text="Quality", row=1, col=2)
    fig.update_yaxes(title_text="# of Samples", row=1, col=2)

    fig.update_layout(title_text=meta, showlegend=True, legend_title_text='Quality')

    fig.show()

    os.makedirs('Images', exist_ok=True)
    pio.write_html(fig, file='Images/' + (meta or str(col)) + '.html', auto_open=False)

In [None]:
# Quality distribution of the white wines.
PieBarChart(df=white_df, meta='White Wine')

In [None]:
# Quality distribution of the red wines.
PieBarChart(df=red_df, meta='Red Wine')

In [None]:
#Wine Statistics
# Tag each frame with its wine type (added in place as a new last column),
# then stack the two frames under a fresh 0..n-1 index.
white_df['type'] = 'white'
red_df['type'] = 'red'
wines = pd.concat([white_df, red_df], ignore_index=True)

attributes = wines.columns
rs = wines.loc[wines.type == 'red', attributes].describe().round(2)
ws = wines.loc[wines.type == 'white', attributes].describe().round(2)
# Side-by-side summary: one row per attribute, one column group per wine type.
pd.concat([rs, ws], axis=0, keys=['Red Wine Statistics', 'White Wine Statistics']).T

In [None]:
#Classification [low,medium,high]
from sklearn import preprocessing
# Quality scores in (2, 4] are 'low', (4, 6] 'medium' and (6, 10] 'high'.
bins = (2,4,6,10)
cat_names = ['low', 'medium', 'high']
wines['labels'] = pd.cut(wines['quality'], bins=bins, labels=cat_names)

subset_attributes = ['alcohol','pH']
# One rounded describe() table per category, in the order of cat_names.
ls, ms, hs = (
    wines.loc[wines['labels'] == level, subset_attributes].describe().round(2)
    for level in cat_names
)
pd.concat([ls, ms, hs], axis=0, keys=['Low Quality Wine', 'Medium Quality Wine', 'High Quality Wine']).T



In [None]:
def BoxPlot(df):
    """Show one plotly box trace per column of ``df`` in a single figure."""
    boxes = [go.Box(y=df[column], name=column) for column in df.columns]
    figure = go.Figure()
    figure.add_traces(boxes)
    figure.show()

In [None]:
# Plot every column except the last two. The 'Wine Statistics' cell appended
# 'type' as the last column; presumably 'quality' is the one before it — verify.
BoxPlot(white_df.iloc[:,:-2])

In [None]:
def Normalize(df):
    """Min-max scale the numeric feature columns of ``df`` to [0, 1].

    The 'quality' target is left unscaled and non-numeric columns (for example
    the 'type' tag) are dropped. Columns are selected by name and dtype, so the
    result does not depend on where 'quality' or any helper column sits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'quality' column.

    Returns
    -------
    pandas.DataFrame
        The scaled feature columns followed by the original 'quality' values,
        on a fresh 0..n-1 index. ``df`` itself is not modified.
    """
    target = df['quality']
    features = df.drop(columns=['quality']).select_dtypes(include='number')

    min_max_scaler = preprocessing.MinMaxScaler() #StandardScaler
    scaled = min_max_scaler.fit_transform(features.values)

    new_df = pd.DataFrame(scaled, columns=features.columns)
    # .values: the new frame has a fresh index, so assign by position.
    new_df['quality'] = target.values
    return new_df

In [None]:
# Scaled copy of the white-wine frame.
normalized_df = Normalize(df=white_df)

In [None]:
# Normalize() returns the features followed by 'quality' only (no 'type'),
# so drop just the last column; ':-2' would also hide the last feature.
BoxPlot(normalized_df.iloc[:,:-1])

In [None]:
# One bar chart per column: distinct values on x, their frequency on y.
for column in white_df.columns:
    frequency = white_df[column].value_counts()

    fig = go.Figure(go.Bar(x=frequency.index.tolist(), y=frequency.tolist()))
    fig.update_layout(title_text=column)
    fig.show()

In [None]:
# Pairwise scatter matrix of the white-wine frame.
sns.pairplot(data=white_df)
plt.show()

In [None]:
def RemoveOutliers(df):
    """Drop rows holding an outlier in any feature column (1.5 * IQR rule).

    Every column except the last is treated as a feature. Columns are processed
    in order, and each column's quartiles are computed on the rows that survived
    the previous columns. A row is kept when its value lies within
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` for every feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Features in all but the last column, plus a 'quality' column.

    Returns
    -------
    pandas.DataFrame
        The surviving feature rows followed by their 'quality' values, on a
        fresh 0..n-1 index. ``df`` itself is not modified.

    Also prints the old shape, the new shape and the fraction of rows removed.
    """
    features = df.iloc[:, :-1]
    quality = df['quality']

    for column in features.columns:
        q1 = features[column].quantile(0.25)
        q3 = features[column].quantile(0.75)
        reach = 1.5 * (q3 - q1)  # 1.5 times the interquartile range
        features = features[features[column].between(q1 - reach, q3 + reach)]

    # Copy before adding a column so pandas never sees a write to a slice, and
    # pick the targets by index label instead of looping row by row.
    new_df = features.copy()
    new_df['quality'] = quality.loc[new_df.index]
    new_df = new_df.reset_index(drop=True)

    removed = df.shape[0] - new_df.shape[0]
    ratio = removed / df.shape[0] if df.shape[0] else 0.0  # empty input: nothing removed
    print('Old ', df.shape)
    print('New ', new_df.shape)
    print('Ratio {:.4}'.format(ratio))

    return new_df

In [None]:
# Normalized white wines with the IQR outliers removed.
filtered_df = RemoveOutliers(df=normalized_df)

In [None]:
# Unfiltered distribution again, for comparison with the filtered one below.
PieBarChart(df=white_df, meta='White Wine')

In [None]:
# Quality distribution after outlier removal.
PieBarChart(df=filtered_df, col='quality', meta='White Wine Filtered')

In [None]:
# Box plots of every column except the last one ('quality', which
# RemoveOutliers appends after the feature columns).
BoxPlot(filtered_df.iloc[:,:-1])

In [None]:
# Rank the features of the filtered frame against the quality target.
df_ = filtered_df

data = df_.iloc[:,0:-1]
target = df_.iloc[:,-1]

# ANOVA F-score of every feature; scores_ holds all of them regardless of k.
bestfeatures = SelectKBest(score_func=f_classif, k=2)
fit = bestfeatures.fit(data,np.ravel(target))
featureScores = pd.DataFrame({'Specs': data.columns, 'Score': fit.scores_})
print(featureScores.nlargest(10,'Score'))

#Using Pearson Correlation
corrmat = data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(10,10))
#plot heat map (reuse the matrix computed above instead of recomputing it)
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")
plt.show()

In [None]:
def ScatterPlot(df_,items=()):
    """Scatter the first two columns of ``df_``, one coloured trace per class.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Column 0 is plotted on x, column 1 on y, and the last column holds the
        class of each row.
    items : iterable, default ()
        Class values to draw, in legend order. Values with no matching rows are
        skipped. Integer scores are coloured from ``colors``, named categories
        from ``colors_cat``; anything else gets plotly's default colour.
    """
    fig = go.Figure()
    classes = df_.iloc[:, -1]

    for item in items:
        subset = df_[classes == item]
        if subset.empty:
            continue

        # Membership lookup also works for numpy integers, unlike isinstance(.., int).
        if item in colors:
            color = colors[item]
        elif item in colors_cat:
            color = colors_cat[item]
        else:
            color = None

        fig.add_trace(go.Scattergl(x=subset.iloc[:, 0], y=subset.iloc[:, 1],
                                   mode='markers', name=str(item),
                                   marker=dict(color=color)))

    # Name the axes after the plotted columns so each figure stands on its own.
    fig.update_layout(xaxis_title=str(df_.columns[0]), yaxis_title=str(df_.columns[1]))
    fig.show()

In [None]:
from itertools import combinations

# Scatter every pair of feature columns once, coloured by quality score.
# Columns are chosen by name so that helper columns added by other cells
# ('type', 'labels') cannot shift which column is used as the class, and
# combinations() skips the feature-vs-itself and mirrored plots.
feature_names = [name for name in white_df.columns
                 if name not in ('quality', 'type', 'labels')]
for x_name, y_name in combinations(feature_names, 2):
    ScatterPlot(white_df[[x_name, y_name, 'quality']], range(0,11))

In [None]:
# Columns 7 and 10 of the filtered frame, coloured by the last column.
ScatterPlot(filtered_df.iloc[:, [7, 10, -1]], items=range(11))

In [None]:
import plotly.express as px
def ScatterPlot3D(df_,xname,yname,zname):
    """Show a 3-D scatter of three columns of ``df_`` coloured by 'quality'.

    ``xname``, ``yname`` and ``zname`` are the column names plotted on the
    three axes. Every marker is given the same size value (1).
    """
    uniform_size = [1] * len(df_)
    figure = px.scatter_3d(df_, x=xname, y=yname, z=zname,
                           color='quality', size=uniform_size)
    figure.show()


In [None]:
# Empty strings are not column names, so plot real columns instead: the three
# positions (4, 7, 10) that the 2-D scatter plots and the models use.
feature_cols = filtered_df.columns
ScatterPlot3D(filtered_df, feature_cols[4], feature_cols[7], feature_cols[10])

In [None]:
# Project the filtered features onto their first n principal components.
df_ = filtered_df
data, target = df_.iloc[:, :-1], df_['quality']

n = 3
pca = PCA(n_components=n)
principalComponents = pca.fit_transform(data)

columns = ['principal component' + str(rank) for rank in range(1, n + 1)]
principalDf = pd.DataFrame(data=principalComponents, columns=columns)

print ( "Components = ",pca.n_components_, "; Total explained variance =", round(pca.explained_variance_ratio_.sum(),5)  )
# Attach the target so the components can be coloured by quality.
principalDf['quality'] = target

In [None]:
# The three principal components, coloured by quality.
ScatterPlot3D(principalDf, xname=columns[0], yname=columns[1], zname=columns[2])

In [None]:
def SplitDataset(df,columns=[], size=0.2):
    """Split selected feature columns and the last column into train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        The last column is used as the target.
    columns : list of int
        Positions of the feature columns.
    size : float, default 0.2
        Fraction of rows placed in the test set.

    Returns
    -------
    X_train, X_test : numpy arrays of the selected features.
    y_train, y_test : lists of target values.

    The split uses a fixed ``random_state=2``.
    """
    features = np.array(df.iloc[:, columns])
    labels = df.iloc[:, -1]
    parts = train_test_split(features, labels, test_size=size, random_state=2)
    X_train, X_test = parts[0], parts[1]
    return X_train, X_test, list(parts[2]), list(parts[3])

In [None]:
# Train/test split of the filtered frame on feature columns 7 and 10.
X_train, X_test, y_train, y_test = SplitDataset(filtered_df, columns=[7, 10])

In [None]:
def Train(model):
    """Fit ``model`` on the current train split and score it on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, predictions)`` for the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions), predictions

In [None]:
def SVM(best_c=-1,best_g=-1):
    """Grid-search or evaluate an RBF-kernel SVC on the current split.

    Parameters
    ----------
    best_c, best_g : number, default -1
        ``C`` and ``gamma`` of the model to evaluate. When either is left at
        -1, a 5 x 5 grid over C and gamma is trained instead.

    Grid mode prints the accuracy of every (C, gamma) pair and shows them as a
    heatmap. Single-model mode prints the accuracy and the confusion matrix.
    Training and scoring go through ``Train``. Nothing is returned.
    """
    if best_c == -1 or best_g == -1:
        C = [0.01, 0.1, 1, 10, 100]
        gamma = [0.01, 0.1, 1, 10, 100]

        rows = []  # one list of accuracies per C value, ordered like gamma
        for c in C:
            row = []
            for g in gamma:
                print(c,g)
                accuracy, _ = Train(SVC(kernel='rbf', C=c, gamma=g, random_state=0))
                row.append(accuracy)
                print(f"#Accuracy: {accuracy:.9f}")
                print()
            rows.append(row)

        pd_scores = pd.DataFrame(rows, index=C, columns=gamma)
        sns.heatmap(pd_scores, vmin=0, vmax=1, linewidths=.1, annot=True,xticklabels='auto', yticklabels='auto')
        plt.xlabel("Gamma")
        plt.ylabel("C")
        plt.show()

    else:
        RBFsvm = SVC(kernel='rbf', C=best_c, gamma=best_g, random_state=0)

        accuracy, y_pred = Train(RBFsvm)

        print(f"#Accuracy: {accuracy:.5f}")
        print(confusion_matrix(y_test,y_pred))

In [None]:
# No arguments: run the C/gamma grid search and draw the accuracy heatmap.
SVM()

In [None]:
# C and gamma used by the single-model SVM run and by the SVR below
# (presumably picked from the grid-search heatmap — confirm).
best_c = 1
best_g = 10

In [None]:
# Evaluate the single model with the chosen parameters.
SVM(best_c=best_c, best_g=best_g)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Regress the quality score with an RBF SVR, then turn each prediction into a
# class: the true score when the prediction is within T of it, otherwise the
# rounded prediction.
svr = SVR(kernel='rbf', C=best_c, gamma=best_g)
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)

T = 0
new_y = []
count = 0  # predictions that fell within T of the true score

for predicted, actual in zip(y_pred, y_test):
    diff = abs(predicted - actual)
    if diff <= T:
        count += 1
        new_y.append(actual)
    else:
        new_y.append(round(predicted))

MAD = mean_absolute_error(y_test, new_y)
r2 = r2_score(y_test, new_y)
values = confusion_matrix(y_test, new_y)

print(f"#Accuracy: {accuracy_score(y_test, new_y):.5f}")
print(values)
for item in (set(new_y), set(y_test), count, MAD, r2):
    print(item)

pd_scores = pd.DataFrame(values)
_ = sns.heatmap(pd_scores, vmin=0, vmax=50, linewidths=.1, annot=True,xticklabels='auto', yticklabels='auto')

print(end='\n\n')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Fit a polynomial regression of each degree on the training split and report
# how well the rounded predictions reproduce the training labels.
x = X_train
y = y_train
y_true = np.asarray(y)

T = 0.5  # a prediction within T of the true score counts as that score

for degree in range(2,20):
    print(degree)
    # Use this iteration's degree; LinearRegression fits its own intercept, so
    # no bias column is needed.
    x_ = PolynomialFeatures(degree=degree, include_bias=False).fit_transform(x)

    model = LinearRegression().fit(x_, y)
    y_pred = model.predict(x_)
    r_sq = model.score(x_, y)

    # Snap predictions to the true score when close enough, otherwise round.
    y_pred = np.where(np.abs(y_pred - y_true) <= T, y_true, np.round(y_pred))

    print(f"#R^2: {r_sq:.5f}")
    print(f"#Accuracy: {accuracy_score(y, y_pred):.5f}")
    print(confusion_matrix(y,y_pred))
    print(set(y_pred))
    print()

In [None]:
#Classification [low,medium,high]
from sklearn import preprocessing
# Work on a copy: adding 'labels' to white_df itself would shift the positional
# column slices (e.g. iloc[:, :-2], column -2) that earlier cells rely on
# whenever those cells are re-run.
df_ = white_df.copy()
bins = (2,4,6,10)
cat_names = ['low', 'medium', 'high']
df_['labels'] = pd.cut(df_['quality'], bins=bins, labels=cat_names)

In [None]:
# Show the raw interval each quality score falls into next to its named
# category, without overwriting the 'labels' column: the cells below look
# rows up by the names 'low' / 'medium' / 'high'.
df_[['quality','labels']].assign(interval=pd.cut(df_['quality'], bins=bins)).head(50)

In [None]:
# Give the chart a title; it is also used as the exported file's name, which
# would otherwise be just '.html'.
PieBarChart(df_,col='labels',meta='White Wine Quality Categories')

In [None]:
# Columns 4 and 7, coloured by the quality category in the last column.
ScatterPlot(df_.iloc[:, [4, 7, -1]], items=['low', 'medium', 'high'])

In [None]:
# Re-split on feature columns 7 and 10; the target is now df_'s last column.
X_train, X_test, y_train, y_test = SplitDataset(df_, columns=[7, 10])

In [None]:
# Repeat the C/gamma grid search; X_train/y_train now come from the
# SplitDataset(df_, ...) call above.
SVM()

In [None]:
# Evaluate the single RBF model with the chosen C and gamma.
best_c, best_g = 1, 10
SVM(best_c=best_c, best_g=best_g)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Test accuracy of k-nearest-neighbours for a few values of k.
K = [1,3,5,7]
K_ext = [1,2,3,4,5,6,7,9,15,30,60,88]  # wider grid, not used by the loop below
accuracy = dict()
for k in K:
    nbr = KNeighborsClassifier(n_neighbors=k)

    # Train returns (accuracy, predictions); only the accuracy is needed here.
    acc = Train(nbr)[0]

    accuracy[k] = acc
    print(f'k={k} accuracy={acc:.5f}')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# train model (fixed seed so the reported accuracy is reproducible)
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
# Train returns (accuracy, predictions); only the accuracy is needed here.
acc = Train(rfc)[0]

print(f'accuracy={acc:.5f}')

In [None]:
# Histograms of the white-wine frame's columns.
white_df.hist()

In [None]:
#Must merge data
# Pairwise plots of the merged red + white frame, coloured by wine type.
sns.set(font_scale=1.0)
type_palette = {'red': '#FF9999', 'white': '#FFE888'}
marker_style = {'edgecolor': 'black', 'linewidth': 0.5}
g = sns.pairplot(data=wines, hue='type', palette=type_palette, plot_kws=marker_style)
fig = g.fig
# Leave room above the grid for the overall title.
fig.subplots_adjust(top=0.96, wspace=0.2)
t = fig.suptitle('Wine Attributes Pairwise Plots by Types', fontsize=24)
plt.show()

In [None]:
from matplotlib.colors import ListedColormap
from matplotlib.lines import Line2D
def PlotBoundary(model, X, y, title=None):
    """Show a fitted two-feature classifier's decision regions with plotly.

    The model is evaluated on a regular grid spanning the points in ``X`` (plus
    a margin). The predicted class of every grid cell is drawn as a heatmap
    with the points on top, in two side-by-side panels: the left one without a
    colour bar, the right one with it.

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method taking (n, 2) arrays.
    X : array-like of shape (n_samples, 2)
        Points to overlay; also defines the extent of the grid.
    y : array-like of length n_samples
        Class of each point. Non-numeric classes (for example 'low' / 'medium'
        / 'high') are converted to integer codes for colouring.
    title : str, optional
        Panel title; defaults to the model's class name.
    """
    X = np.asarray(X)
    point_classes = np.asarray(y)
    if title is None:
        title = type(model).__name__

    h = 0.03
    offset_x, offset_y = 0.3, 0.5
    x_axis = np.arange(X[:, 0].min() - offset_x, X[:, 0].max() + offset_x, h)
    y_axis = np.arange(X[:, 1].min() - offset_y, X[:, 1].max() + offset_y, h)
    xx, yy = np.meshgrid(x_axis, y_axis)

    Z = np.asarray(model.predict(np.c_[xx.ravel(), yy.ravel()]))

    # Heatmap values and marker colours must be numeric: map class names to
    # integer codes, using the model's own class order when it exposes one.
    numeric = (np.issubdtype(Z.dtype, np.number)
               and np.issubdtype(point_classes.dtype, np.number))
    if not numeric:
        classes = list(getattr(model, 'classes_', pd.unique(point_classes)))
        Z = pd.Categorical(Z, categories=classes).codes
        point_classes = pd.Categorical(point_classes, categories=classes).codes
    Z = Z.reshape(xx.shape)

    # subplot_titles must be a sequence with one entry per panel.
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=[title, title + ' (with colour bar)'])

    for panel in (1, 2):
        fig.add_trace(
            go.Heatmap(x=x_axis, y=y_axis, z=Z, colorscale='Viridis',
                       showscale=(panel == 2)),
            row=1, col=panel)
        fig.add_trace(
            go.Scatter(x=X[:, 0], y=X[:, 1], mode='markers', showlegend=False,
                       marker=dict(size=10, color=point_classes,
                                   colorscale='Viridis',
                                   line=dict(color='black', width=1))),
            row=1, col=panel)

    # Hide grid, zero line and ticks on every panel.
    axis_style = dict(showgrid=False, zeroline=False, showticklabels=False,
                      ticks='', autorange=True)
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)

    fig.show()

In [None]:
# SVM() keeps its classifier local, so fit the chosen RBF model here. The
# class names are converted to integer codes so that both the decision
# heatmap and the marker colours receive numeric values.
boundary_classes = pd.Categorical(y_train).codes
boundary_model = SVC(kernel='rbf', C=best_c, gamma=best_g, random_state=0)
boundary_model.fit(X_train, boundary_classes)
PlotBoundary(boundary_model, X_train, boundary_classes)