###### https://matrixcalc.org/en/vectors.html#eigenvectors(%7B%7B1295%2e2,280%2e5,562%2e4%7D,%7B280%2e5,112,83%2e5%7D,%7B562%2e4,83%2e5,290%2e8%7D%7D)

###### https://calculator.academy/normalize-vector-calculator/#f1p1|f2p0

In [1]:
import time
from datetime import timedelta
import numpy as np
import pandas as pd
import sklearn.decomposition
import scipy.linalg as la
import matplotlib.pyplot as plt

from sklearn import svm, model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline

Read dataset

In [2]:
dataset = pd.read_csv('df_original_100000.csv')
# dataset = pd.DataFrame([[71, 29, 33, 1], [75, 19, 43, 1], [7, 9, 3, 1], [13, 21, 7, 0], [3, 2, 17, 1]])
# dataset.columns = ['A', 'B', 'C', 'Label']
# print(dataset)

Erase label column

In [3]:
y = np.array(dataset['Label'])
del dataset['Label']

X = np.array(dataset)

print('Dataset', X)

Dataset [[4.42500000e+03 4.91530000e+04 6.00000000e+00 ... 1.84613993e+03
  1.47455281e+15 1.47455281e+15]
 [4.42500000e+03 4.91530000e+04 6.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [4.42600000e+03 4.91530000e+04 6.00000000e+00 ... 1.81779693e+03
  1.47455281e+15 1.47455281e+15]
 ...
 [5.12970000e+04 8.00000000e+01 6.00000000e+00 ... 5.64182393e+06
  1.52809574e+15 1.52809573e+15]
 [3.04180000e+04 8.00000000e+01 6.00000000e+00 ... 7.64047863e+14
  1.52809574e+15 1.01422070e+07]
 [2.84320000e+04 8.00000000e+01 6.00000000e+00 ... 5.77565896e+14
  1.52809574e+15 1.01559360e+07]]


Center the data and calculate the covariance matrix

In [4]:
X_centered = X - np.mean(X, axis=0)
cov_matrix = np.cov(X_centered, rowvar=False)
print('Covariance Matrix:\n', cov_matrix)
print('Cetered Data:\n', X_centered)

Covariance Matrix:
 [[ 4.65340813e+08 -1.59731453e+08  1.28526515e+04 ...  2.68721598e+18
   1.68887209e+18 -2.56082525e+18]
 [-1.59731453e+08  3.68976480e+08  2.02466044e+03 ... -2.38838178e+18
  -2.52724056e+18  1.43734855e+18]
 [ 1.28526515e+04  2.02466044e+03  1.64963408e+01 ... -2.00876646e+14
  -6.73520889e+14 -3.72880894e+14]
 ...
 [ 2.68721598e+18 -2.38838178e+18 -2.00876646e+14 ...  1.81115403e+29
   6.77569042e+28 -2.30215977e+29]
 [ 1.68887209e+18 -2.52724056e+18 -6.73520889e+14 ...  6.77569042e+28
   2.76121946e+29  1.59246881e+29]
 [-2.56082525e+18  1.43734855e+18 -3.72880894e+14 ... -2.30215977e+29
   1.59246881e+29  5.57136195e+29]]
Cetered Data:
 [[-2.33511212e+04  3.88822834e+04 -1.21715000e+00 ... -2.92495961e+14
   1.81956880e+14  6.87486603e+14]
 [-2.33511212e+04  3.88822834e+04 -1.21715000e+00 ... -2.92495961e+14
  -1.29259593e+15 -7.87066205e+14]
 [-2.33501212e+04  3.88822834e+04 -1.21715000e+00 ... -2.92495961e+14
   1.81956880e+14  6.87486603e+14]
 ...
 [ 2.3520

Calculate the eigenvalues and eigenvectors of the covariance matrix

In [5]:
evals, evecs = la.eig(cov_matrix)

# sort them
idx = np.argsort(evals)[::-1]

# Each columns of this matrix is an eingvector
evecs = evecs[:,idx]
evals = evals[idx]

variance_retained=np.cumsum(evals)/np.sum(evals)

print('Variance Retained:\n', variance_retained)
print('Eigenvalues:\n', evals)
print('Eignvectors:\n', evecs)

Variance Retained:
 [0.70373633+0.j 0.98920786+0.j 0.99997622+0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1.        +0.j 1.        +0.j 1.        +0.j 1.        +0.j
 1. 

Dimensionality Reduction

In [6]:
X_transformed=np.dot(evecs.T, X_centered.T).T
print('Transformed Data:\n', X_transformed)

Transformed Data:
 [[-8.30561378e+14 -1.69515659e+14 -9.32315056e+12 ...  8.89064358e-04
   1.72921543e-03  3.60260032e+04]
 [ 1.54024565e+15 -1.11921815e+15  6.71275015e+12 ... -2.13685573e-03
  -4.47837391e-03  2.68030918e+04]
 [-8.30561378e+14 -1.69515659e+14 -9.32315056e+12 ...  8.78613576e-04
   1.70818163e-03  3.60259585e+04]
 ...
 [-9.16648454e+14 -1.35030719e+14 -9.90543473e+12 ... -2.27588126e-04
  -5.84261640e-04 -1.37377048e+04]
 [ 6.07521265e+14  7.07155891e+14  1.70078623e+14 ...  4.82531365e-04
   9.65899775e-04 -1.10971209e+03]
 [ 4.81058038e+14  6.49106148e+14  3.74127943e+14 ... -9.81458994e-04
  -1.55669824e-03 -4.15678230e+03]]


Generate random eignvectors with the same mean and standard deviation values

In [7]:
new_evecs = []

for j in range(len(evecs[0])):
    v = np.array(evecs[ : , j:j+1 ])
    u = np.random.normal(loc=v.mean(axis=0), scale=v.std(axis=0), size=len(v))
    new_evecs.append(u)
    print(u)
new_evecs = np.array(new_evecs).T

print('Generated Eignvectors:\n', new_evecs)

[-0.11759717  0.13321676 -0.17674104 -0.00052465 -0.16514697 -0.04666722
 -0.02295965 -0.00895749  0.05848805  0.13422583  0.1080643   0.04574192
 -0.05425494  0.02983291  0.20222002  0.04704672 -0.04107594  0.04641098
 -0.05324237  0.08371862  0.14780683 -0.12172683  0.02472748 -0.04004319
 -0.09590342  0.03368226 -0.01041929  0.02530534 -0.05288133  0.05703874
 -0.02560626 -0.13686853  0.09809515  0.01881981 -0.02722909 -0.21700903
  0.05579128  0.04932521  0.06371697 -0.11965386 -0.08858708 -0.16790918
 -0.02409708 -0.06705538  0.04444765 -0.14720063 -0.16680844  0.09261717
 -0.14690763 -0.09406964  0.11299853  0.0293708  -0.02991563  0.17605123
  0.03830599  0.05246145 -0.00476183 -0.03976021 -0.06677549 -0.14411079
  0.04060588 -0.00920668  0.01426043  0.07113131 -0.04208846 -0.2919632
  0.23767051  0.03184275 -0.14856154  0.11325228 -0.10044443 -0.02413853
  0.0455849  -0.09725481  0.03898078 -0.13298148  0.06910679 -0.03626859
 -0.09018258]
[ 0.00469382 -0.01509126  0.03747697  

Go Back to the Original Dimension

In [8]:
X_original_dimension = np.dot(X_transformed, new_evecs.T) 
X_original_dimension += np.mean(X, axis=0)
print(X_original_dimension)

[[ 9.69290221e+13 -1.07695155e+14  1.39611737e+14 ...  2.32479834e+14
   1.32751137e+15  8.93355432e+14]
 [-1.86412377e+14  2.21784869e+14 -3.13594117e+14 ...  3.81199958e+14
   1.27920417e+15  8.47699239e+14]
 [ 9.69290221e+13 -1.07695155e+14  1.39611737e+14 ...  2.32479834e+14
   1.32751137e+15  8.93355432e+14]
 ...
 [ 1.07217526e+14 -1.19659006e+14  1.56068229e+14 ...  2.27079618e+14
   1.32926547e+15  8.95013263e+14]
 [-6.87888684e+13  6.27190871e+13 -6.65917843e+13 ...  3.44857687e+14
   1.27120229e+15  5.86631942e+14]
 [-5.68490003e+13  4.01762238e+13 -2.40262955e+13 ...  3.32200737e+14
   1.31126954e+15  5.85025611e+14]]


Split train and test datasets

In [9]:
# x_train, x_test, y_train, y_test = train_test_split(X_original_dimension, y, test_size = 0.3, random_state = 7)

Cross Validation

In [13]:
def K_fold_cross_validation(model):
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
    
    kfold = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)
    
    results = model_selection.cross_validate(estimator=model,
                                          X=X_original_dimension,
                                          y=y,
                                          cv=kfold,
                                          scoring=scoring)
    
    for k in results.keys():
        if k != 'fit_time' and k != 'score_time':
            print(k, '---> mean:', results[k].mean(), ' |  std:', results[k].std())

KNN

In [14]:
knn = make_pipeline(preprocessing.StandardScaler(), KNeighborsClassifier(n_neighbors=5))
K_fold_cross_validation(knn)

test_accuracy ---> mean: 0.99992  |  std: 5.9999999999993395e-05
test_precision ---> mean: 0.9998399642124642  |  std: 0.00011919809069980397
test_recall ---> mean: 1.0  |  std: 0.0
test_f1_score ---> mean: 0.9999199721499437  |  std: 5.9608813907903486e-05


Decision Tree

In [15]:
dtree = make_pipeline(preprocessing.StandardScaler(), DecisionTreeClassifier())
K_fold_cross_validation(dtree)

test_accuracy ---> mean: 0.99999  |  std: 2.9999999999996697e-05
test_precision ---> mean: 0.9999800836486756  |  std: 5.9749053973301435e-05
test_recall ---> mean: 1.0  |  std: 0.0
test_f1_score ---> mean: 0.9999900408325864  |  std: 2.9877502240793773e-05


Random Forest

In [16]:
rfc = make_pipeline(preprocessing.StandardScaler(), RandomForestClassifier(n_estimators=100))
K_fold_cross_validation(rfc)

test_accuracy ---> mean: 0.99999  |  std: 2.9999999999996697e-05
test_precision ---> mean: 0.9999800836486756  |  std: 5.9749053973301435e-05
test_recall ---> mean: 1.0  |  std: 0.0
test_f1_score ---> mean: 0.9999900408325864  |  std: 2.9877502240793773e-05


Naive Bayes

In [17]:
nb = make_pipeline(preprocessing.StandardScaler(), GaussianNB(var_smoothing=1e-02))
K_fold_cross_validation(nb)

test_accuracy ---> mean: 0.77668  |  std: 0.0045810042567105275
test_precision ---> mean: 0.7069981552587474  |  std: 0.005838800048746294
test_recall ---> mean: 0.9450483414013485  |  std: 0.004520301279681293
test_f1_score ---> mean: 0.8088527507571559  |  std: 0.004013665770401915


In [18]:
print("Sucesso", " :)" * 99)

Sucesso  :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :) :)
