# Import Libraries

In [0]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

np.set_printoptions(precision=4, suppress=True, edgeitems=5, linewidth=200)

# Loading Data

In [2]:
df_raw = pd.read_csv(
'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', 
header=None)
df = df_raw.copy()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
data = df.iloc[:,1:]
target = df.iloc[:,0]
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
target.value_counts()

2    71
1    59
3    48
Name: 0, dtype: int64

# Test Train Split

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data,target)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')

X_train shape: (133, 13)
X_test shape: (45, 13)


# Standardization

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
X_train_std[:5]

array([[-1.3942, -0.8283, -1.341 ,  0.4493, -0.8807, -0.4461, -0.3787, -0.5635, -0.0498, -1.3349,  0.0141,  1.0519, -0.7864],
       [ 1.5854, -0.6287, -0.1573, -1.0249,  1.3706,  1.5008,  1.0397, -0.8732,  0.8153,  0.6302, -0.0305,  1.0229,  0.7493],
       [-0.0906,  1.8965,  0.53  ,  0.6786, -0.8807, -0.8929, -1.3412,  0.7527, -1.2091,  1.1936, -1.8146, -1.0817, -0.3661],
       [ 0.108 , -1.3229, -2.41  , -1.1232, -0.8807, -0.494 ,  0.057 , -1.0281, -0.1709, -0.1594,  1.0845, -0.1818, -1.1194],
       [-0.1527,  0.8205, -0.1573,  0.0234, -0.74  , -1.244 , -1.331 ,  0.2107, -1.0188,  2.3513, -1.0118, -1.2413, -0.1721]])

# PCA

In [7]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
X_train_pca.shape

(133, 3)

# Explained Variance

In [10]:
results = pd.DataFrame(data={
    'explained_variance_ratio': pca.explained_variance_ratio_})
results['cumulative'] = results['explained_variance_ratio'].cumsum()
results['component'] = results.index + 1
results

Unnamed: 0,explained_variance_ratio,cumulative,component
0,0.365176,0.365176,1
1,0.186981,0.552157,2
2,0.11163,0.663787,3


In [11]:
fig = go.Figure(data=[go.Bar(x=results['component'],
                             y=results['explained_variance_ratio'], 
                             name='explained variance ratio'),
                      go.Scatter(x=results['component'],
                                 y=results['cumulative'],
                                 name='cumulative explained variance')],
                layout=go.Layout(title=f'PCA - {pca.n_components_} components',
                                 width=950, template='plotly_dark'))
fig.show()

In [12]:
X_train_pca_df = pd.DataFrame(data=np.c_[X_train_pca, y_train], 
                              columns=['pca1', 'pca2', 'pca3', 'target'])
X_train_pca_df.head()

Unnamed: 0,pca1,pca2,pca3,target
0,0.074344,2.773945,-0.373406,2.0
1,-2.890703,-1.589954,-0.527929,1.0
2,3.480405,-1.361916,0.085024,3.0
3,-0.557168,1.994745,-2.656682,2.0
4,2.821119,-1.409086,-1.106345,3.0


In [13]:
px.scatter_3d(X_train_pca_df, x='pca1', y='pca2', z='pca3', color='target', 
              template='plotly_dark', width=950)

In [14]:
X_train_pca[:5]

array([[ 0.0743,  2.7739, -0.3734],
       [-2.8907, -1.59  , -0.5279],
       [ 3.4804, -1.3619,  0.085 ],
       [-0.5572,  1.9947, -2.6567],
       [ 2.8211, -1.4091, -1.1063]])

In [15]:
X_test_pca[:5]

array([[-1.286 , -0.2604,  0.1008],
       [-1.5746, -0.7979,  0.6061],
       [-0.2373,  2.2291, -0.5675],
       [ 2.0813, -1.2574,  0.0018],
       [ 0.3926,  1.8914,  0.5873]])