### Principal component analysis

Reducing the dimensions of d-dimensional dataset by projecting it on to k-dimensional subspace.

### Importing libraries and dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('./resources/Datasets/Wine.csv')
data.head()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


In [56]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

### Splitting dataset 

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Feature scaling

In [90]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Principal component analysis

[PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) <br>
SVD: Single Value Decomposition

In [91]:
n_features = len(X_test[0])
n_features

13

#### Without kernel

In [92]:
from sklearn.decomposition import PCA, KernelPCA
pca = PCA(n_components=2)
X_train_scaled = pca.fit_transform(X_train_scaled)
X_test_scaled = pca.transform(X_test_scaled)

In [93]:
kernelpca = KernelPCA(n_components=2, kernel='rbf')
X_train_scaled = kernelpca.fit_transform(X_train_scaled)
X_test_scaled = kernelpca.transform(X_test_scaled)

In [94]:
print(pca.explained_variance_)
# print(pca.components_)
print(pca.explained_variance_ratio_)

[4.82894083 2.52920254]
[0.36884109 0.19318394]


In [95]:
print(kernelpca.explained_variance_)
print(kernelpca.explained_variance_ratio_)

AttributeError: 'KernelPCA' object has no attribute 'explained_variance_'

### Training model

In [96]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train_scaled, y_train)

LogisticRegression(random_state=0)

In [97]:
y_predicted = classifier.predict(X_test_scaled)

### Validation

In [98]:
from sklearn.metrics import accuracy_score, precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score

In [99]:
confusion_matrix(y_test, y_predicted)

array([[14,  0,  0],
       [ 1, 15,  0],
       [ 0,  0,  6]], dtype=int64)

In [100]:
accuracy_score(y_test, y_predicted)

0.9722222222222222