### PCA 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [2]:
df = pd.read_csv('wine.csv') 
X = df.iloc[:, 1:13].values #features
y = df.iloc[:, 0].values #class
df.head()
#df.shape

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


### step 1 
#### to standardize the features since they all are measured in different units 

In [3]:
from sklearn.preprocessing import StandardScaler #standardscalar makes the distribution of the data to mean value 0 and standard deviation of 1
X_std = StandardScaler().fit_transform(X)


###  step 2 
### to find the covariance between the standardized features data set

In [4]:
cov_mat=np.cov(X_std.T)
cov_mat

array([[ 1.00564972,  0.09493026,  0.21273976, -0.31198788,  0.27232816,
         0.29073446,  0.23815287, -0.15681042,  0.13747022,  0.549451  ,
        -0.07215255,  0.07275191],
       [ 0.09493026,  1.00564972,  0.16497228,  0.29013035, -0.05488343,
        -0.3370606 , -0.41332866,  0.29463237, -0.22199334,  0.25039204,
        -0.56446685, -0.37079354],
       [ 0.21273976,  0.16497228,  1.00564972,  0.44587209,  0.28820583,
         0.12970824,  0.11572743,  0.1872826 ,  0.00970647,  0.2603499 ,
        -0.07508874,  0.00393333],
       [-0.31198788,  0.29013035,  0.44587209,  1.00564972, -0.0838039 ,
        -0.32292752, -0.353355  ,  0.36396647, -0.19844168,  0.01883781,
        -0.27550299, -0.27833221],
       [ 0.27232816, -0.05488343,  0.28820583, -0.0838039 ,  1.00564972,
         0.21561254,  0.19688989, -0.25774204,  0.23777643,  0.20107967,
         0.05571118,  0.06637684],
       [ 0.29073446, -0.3370606 ,  0.12970824, -0.32292752,  0.21561254,
         1.00564972,  

### step 3 
#### find the eigenvalue and eigenvector of the covarince matrix,  the eigenvectors of the Covariance matrix are actually the directions of the axes where there is the most variance (most information) and that we call Principal Components. And eigenvalues are simply the coefficients attached to eigenvectors, which give the amount of variance carried in each Principal Component.

In [5]:
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
#print('Eigenvectors \n%s' %eig_vecs)
#print('\nEigenvalues \n%s' %eig_vals)

### step 4

#### to find the feature vector which are the matrix with columns as eigenvectors of the corresponding eigenvalues(pc) selected

In [6]:
eig_vals

array([4.42079223, 2.18758658, 1.42758975, 0.89289016, 0.82042272,
       0.63552536, 0.55097787, 0.34868444, 0.10400644, 0.17046215,
       0.26100162, 0.24785727])

In [7]:
p=np.sum(eig_vals)

In [8]:
p

12.06779661016949

In [9]:
q=np.zeros((1,12))
for i in range(0,12):
    q[0,i]=eig_vals[i]/p
q.shape

(1, 12)

In [10]:
q #this represents the total variation explained by the features 1,2,3 in percentage 36.6%,18.1%,11.8% respectively

array([[0.36632969, 0.18127473, 0.11829747, 0.07398949, 0.06798447,
        0.05266292, 0.04565687, 0.0288938 , 0.00861851, 0.01412537,
        0.02162794, 0.02053873]])

In [11]:
q[0,0]+q[0,1]+q[0,2] #66.59

0.6659018898191488

#### the first three eigenvalues seems to be contributing more about the variance in the feature data set

#### the corresponding eigenvectors are:

In [12]:
egvec=np.column_stack((eig_vecs[0],eig_vecs[1],eig_vecs[2]))
egvec.shape, X_std.shape

((12, 3), (178, 12))

### step 5

#### using the feature vector formed using the eigenvectors of the covariance matrix, to reorient the data from the original axes to the ones represented by the principal components, this is done by multiplying the original data set by the feature vector.






In [13]:
Y = X_std.dot(egvec) 
Y.shape, y.shape # here the dimension of the data is reduced to 3 dim from 12 dim 


((178, 3), (178,))

### PCA using sklearn 

In [14]:
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=3)
Y_sklearn = sklearn_pca.fit_transform(X_std)
Y_sklearn.shape,y.shape
Y_sklearn[:178, :].shape

(178, 3)

In [15]:
print(sklearn_pca.explained_variance_ratio_)  

[0.36632969 0.18127473 0.11829747]


### PCR using multinomial logistic regression 

In [16]:
import time
start_time = time.time()
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(Y_sklearn, y)
print(clf.predict(Y_sklearn[:178, :]))
clf.predict_proba(Y_sklearn[:178, :])
print(clf.score(Y_sklearn, y))
time=(time.time() - start_time)
time

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1
 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 1 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
0.9550561797752809


0.015616416931152344

### multinomial logistic regression on all the predictors/feature wine data set 

In [17]:
import time
start_time1=time.time()
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(X, y)
print(clf.predict(X[:178, :]))
clf.predict_proba(X[:1778, :]) 
print(clf.score(X, y))
time1=(time.time()-start_time1)
time1


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 2 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 1 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
0.9719101123595506


0.047966957092285156