In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Iris bunch (feature matrix, class labels and feature names).
iris = load_iris()

# Build one DataFrame: the feature columns followed by a trailing 'target' column.
df = pd.DataFrame(
    data=np.column_stack((iris['data'], iris['target'])),
    columns=[*iris['feature_names'], 'target'],
)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [3]:
# Feature table: every column except the label (still a DataFrame).
X = df.drop(columns='target')
# Labels kept as a one-column DataFrame, cast to integer class ids.
y = df.loc[:, ['target']].astype(int)

In [4]:
# Standardize the features.
# The scaler is fitted here, once; every later use of `scaler` must only call
# transform() / inverse_transform() so the learned parameters are not replaced.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)  # result is a numpy ndarray, not a DataFrame
X_scaled

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

In [5]:
# Round trip: take the first ten standardized rows (an ndarray slice of X_scaled)
# and map them back to the original scale with the already fitted scaler.
piece = X_scaled[:10]
original = scaler.inverse_transform(piece)
original  # reproduces the original values, so the inverse transform works precisely

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [6]:
# Train a logistic-regression classifier on the standardized features.
# y is a one-column DataFrame, so its single column is handed over as a 1-D label array.
model = LogisticRegression(n_jobs=-1)
model.fit(X_scaled, y['target'].to_numpy())

In [7]:
def predicting(input_X):
    """
    Predict the class of one or more samples given in original (unscaled) units.

    The samples are standardized with the already fitted ``scaler`` and then
    classified with the already fitted ``model``.

    :param input_X: array-like of raw feature values; either one sample of shape
        (n_features,) or a batch of shape (n_samples, n_features). Feature order
        must match the data the scaler was fitted on.
    :return: array with one predicted label per sample
    """
    # scaler.transform() only accepts 2-D input, so a lone 1-D sample is promoted
    # to a one-row batch. DataFrames are already 2-D and are passed through as-is
    # so that their column names remain visible to the scaler.
    if not isinstance(input_X, pd.DataFrame):
        input_X = np.atleast_2d(input_X)
    scaled_input_X = scaler.transform(input_X)
    return model.predict(scaled_input_X)

In [8]:
# Two raw (unscaled) samples; predicting() standardizes them itself before classifying.
test = np.array([
    [5.1, 3.5, 1.4, 0.2],
    [4.9, 3.0, 1.4, 0.2],
])

pred = predicting(test)
pred



array([0, 0])

In [9]:
# Model score on the training data itself (how well the model fits the data it
# was trained on). This is an optimistic figure, not an estimate for unseen samples.
# Labels are flattened to 1-D, the same form that was used when fitting the model.
model_score = model.score(X_scaled, y.values.ravel())
print(f'{round(model_score*100, 1)}%')

97.3%


In [10]:
# Cross-validate to get a more reliable estimate of the model score than the
# accuracy measured on the training data.

# The raw features X are passed in (not X_scaled): X_scaled was standardized with
# statistics of the whole sample, so every validation fold would already have
# influenced the preprocessing. Wrapping scaler + classifier in a pipeline makes
# cross_val_score re-fit the scaling on the training part of each split only.
# The classifier uses the same settings as `model`; the fitted `model` itself is
# not touched by the cross-validation.
cv_pipeline = make_pipeline(StandardScaler(), LogisticRegression(n_jobs=-1))
cv_score = cross_val_score(estimator=cv_pipeline,
                           X=X, y=y.values.ravel(), cv=5, n_jobs=-1)

cv_score_mean = round( cv_score.mean()*100, 1 )
cv_score_std  = round( cv_score.std()*100, 1 )

print(f'({cv_score_mean} +- {cv_score_std})%')

(96.0 +- 3.9)%


In [11]:
# Apply PCA to the standardized features, keeping 3 components
# (i.e. only one dimension is removed).

pca = PCA(n_components=3)
X_pca_reduced = pca.fit_transform(X_scaled)

# Map the reduced data back towards the original data in two steps.
A = pca.inverse_transform(X_pca_reduced)  # undo the PCA projection: back to the standardized feature space
B = scaler.inverse_transform(A)           # undo the standardization: back to the original units

Let's compare the original X DataFrame with the B ndarray.
The two datasets are close but not equal: PCA kept only 3 components, so the information carried by the dropped component is lost and the reconstruction is only approximate.

In [12]:
# Original (unscaled) feature table, shown for comparison with the reconstruction B.
X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [13]:
# Reconstruction obtained by inverting the PCA and then the scaling; compare with X.
B

array([[5.09478837, 3.50129672, 1.43407893, 0.19038654],
       [4.87778763, 3.00552672, 1.54524708, 0.1590267 ],
       [4.69388077, 3.20152254, 1.34001372, 0.18871238],
       [4.61422265, 3.09646122, 1.40699784, 0.22623533],
       [5.00774639, 3.5980726 , 1.34934621, 0.21428912],
       [5.39857501, 3.90035455, 1.70931802, 0.39737145],
       [4.60793028, 3.39802685, 1.34814378, 0.31462832],
       [5.00530735, 3.39867946, 1.46529515, 0.20979002],
       [4.40579192, 2.8985589 , 1.36212654, 0.21068387],
       [4.90860388, 3.09785925, 1.44373909, 0.11587085],
       [5.39639205, 3.7008977 , 1.52359245, 0.19334472],
       [4.82878435, 3.39283809, 1.41177863, 0.25309608],
       [4.79947707, 3.00013011, 1.40341948, 0.09903539],
       [4.30414364, 2.99896901, 1.07290469, 0.10764342],
       [5.75800806, 4.01044813, 1.47458602, 0.12254098],
       [5.68910291, 4.40271133, 1.57125626, 0.37989905],
       [5.35928343, 3.9101308 , 1.56624637, 0.32489355],
       [5.07992599, 3.50499467,

In [14]:
# Eigenvalues belonging to the three retained eigenvectors, i.e. the variance of the
# data along each principal component (the directions that keep the most information).
# They are not the lengths of the eigenvectors: those are normalized to unit norm.
pca.explained_variance_

array([2.93808505, 0.9201649 , 0.14774182])

In [15]:
# Fraction of the total variance carried by each retained component
# (each eigenvalue divided by the total variance). The sum is < 1.0 because
# one component was dropped.
pca.explained_variance_ratio_

array([0.72962445, 0.22850762, 0.03668922])

In [16]:
# The three normalized eigenvectors (principal axes), one per row; each is 4-dimensional.
# Every row has unit Euclidean norm, e.g. sqrt(sum(components^2)) = 1.0
pca.components_

array([[ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654],
       [ 0.37741762,  0.92329566,  0.02449161,  0.06694199],
       [-0.71956635,  0.24438178,  0.14212637,  0.63427274]])