In [19]:
# Import needed packages
# You may add or remove packages should you need them
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import load_iris
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score

# Seed NumPy's global random number generator so that anything drawing from it
# is repeatable from one run of the notebook to the next
np.random.seed(0)

# IPython magics: render Matplotlib figures inline in the notebook and request
# the high-resolution 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Apply Seaborn's default aesthetic parameters to subsequent plots
sns.set()

In [20]:
# Load the Iris dataset bundled with scikit-learn; the cells below read its
# .data, .feature_names, .target and .target_names attributes
iris = load_iris()

## Loading the data

#### Task at hand
* Use PCA, keeping enough principal components to explain 95% of the variance in the data
* Train a model using the principal components
* Predict
* Compare performance against the original data

* Load the data
* Clean the data
* Split the data
* Normalize the data
* Fit PCA on the train data, then transform the test data
* Create the model and predict for the original data and the PCA data
* Check the accuracy
* Compare the performance

In [21]:
# Wrap the raw feature matrix in a DataFrame, one column per feature name
df_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## Cleaning the data

In [22]:
# Attach the integer class label and the matching species name to every row
df_iris['target'] = iris.target
df_iris['class'] = iris.target_names[iris.target]

# Drop the "(cm)" unit suffix (and any surrounding whitespace) from the column names
df_iris.columns = df_iris.columns.str.replace('(cm)', '', regex=False).str.strip()

df_iris

Unnamed: 0,sepal length,sepal width,petal length,petal width,target,class
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


In [23]:
# Feature matrix: the first four columns (the measurements) as a NumPy array
X = df_iris.iloc[:, :4].to_numpy()
# Target vector: the integer-encoded class label of each row
y = df_iris['target'].to_numpy()


## Splitting the data

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The split is stratified on the labels
# and uses a fixed random_state so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

## Normalize the data

In [25]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits so no test information leaks into training
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Applying the PCA on the Train data

In [26]:
# Import PCA
from sklearn.decomposition import PCA

# Exploratory fit with 3 components on the standardised training data, used to
# inspect how much variance the leading components explain
pca= PCA(n_components=3)

pca.fit(X_train_scaled)

PCA(n_components=3)

In [27]:
# Cumulative share of the total variance explained by the first k components
pca.explained_variance_ratio_.cumsum()

array([0.72454601, 0.96097116, 0.9951999 ])

In [28]:
# Keep the smallest number of principal components whose cumulative explained
# variance exceeds 95%. Passing the target as a fraction (with the full SVD
# solver) lets PCA choose that count itself, instead of hard-coding a number
# read off the cumulative-variance output; for this split it selects 2
# components (about 96% of the variance).
pca = PCA(n_components=0.95, svd_solver='full')
# Fit on the training split only, then project both splits onto the same components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

### Predicting based on the original data using the Logistic Regression model

In [29]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with default settings, to be trained on the scaled
# original features (no PCA)
model = LogisticRegression()


In [30]:
# Fit the baseline classifier on the standardised training features
model.fit(X_train_scaled, y_train)


LogisticRegression()

In [31]:
# Predict class labels for the standardised test features with the baseline model
prediction = model.predict(X_test_scaled)
prediction

array([2, 2, 0, 0, 1, 0, 1, 2, 0, 1, 0, 2, 0, 2, 1, 1, 1, 1, 1, 0, 1, 2,
       0, 1, 2, 2, 2, 2, 1, 2, 1, 0, 0, 1, 1, 2, 1, 0, 0, 1, 0, 2, 0, 0,
       2])

In [32]:
# Accuracy of the baseline model on the held-out test split; this is the
# reference value for the PCA comparison
accuracy_score_original = model.score(X_test_scaled, y_test)
accuracy_score_original

0.9777777777777777

### Predicting based on the PCA-transformed data

In [33]:
# Separate classifier with the same default settings, trained on the
# PCA-projected training data
model_pca=LogisticRegression()
model_pca.fit(X_train_pca, y_train)

LogisticRegression()

In [34]:
# Predict class labels for the PCA-projected test data
pred = model_pca.predict(X_test_pca)
pred

array([2, 2, 0, 0, 2, 0, 1, 1, 0, 1, 0, 2, 0, 2, 1, 2, 2, 1, 1, 0, 1, 2,
       0, 1, 2, 2, 2, 2, 1, 2, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 2, 0, 0,
       2])

In [35]:
# Test-set accuracy of the PCA-based model, to compare against accuracy_score_original
acc_score_pca=model_pca.score(X_test_pca, y_test)
acc_score_pca

0.9111111111111111