In [1]:
# Show which Python interpreter the shell resolves, as a record of the runtime used.
!python --version

Python 3.7.12


In [2]:
# Install the notebook's dependencies into the running kernel's environment.
# `%pip` (not `!pip`) targets the kernel's own interpreter, and the PyPI
# distribution for scikit-learn is `scikit-learn` -- `sklearn` is a deprecated placeholder.
%pip install numpy matplotlib scikit-learn pandas



In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.linalg import eig

In [4]:
# Load the Iris measurements from a CSV in the current working directory.
df = pd.read_csv("./iris_dataset.csv")

In [5]:
# Preview the first rows to check the column names and values.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# First of the two features analysed below: the sepal-length column, as a pandas Series.
x = df['SepalLengthCm']
x

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: SepalLengthCm, Length: 150, dtype: float64

In [7]:
# Second of the two features analysed below: the sepal-width column, as a pandas Series.
y = df['SepalWidthCm']
y

0      3.5
1      3.0
2      3.2
3      3.1
4      3.6
      ... 
145    3.0
146    2.5
147    3.0
148    3.4
149    3.0
Name: SepalWidthCm, Length: 150, dtype: float64

In [8]:
# Arithmetic mean of each feature: built-in sum of the values divided by their count.
x_mean, y_mean = [sum(column) / len(column) for column in (x, y)]

print(x_mean, y_mean)

5.843333333333335 3.0540000000000007


## Finding covariance
```
cov(x, y) = Σ((x[i] - x_mean) * (y[i] - y_mean)) / (N - 1)
```

In [9]:
# Sample variances and covariance of x and y: accumulate the products of the
# deviations from each mean, then divide each total by (n - 1).
n = len(x)

cov_xy = cov_xx = cov_yy = 0
for xi, yi in zip(x, y):
    dx = xi - x_mean
    dy = yi - y_mean
    cov_xy += dx * dy
    cov_xx += dx * dx
    cov_yy += dy * dy

cov_xy /= n - 1
cov_xx /= n - 1
cov_yy /= n - 1

print(cov_xy, cov_xx, cov_yy)

-0.03926845637583892 0.6856935123042505 0.18800402684563763


In [10]:
# 2x2 covariance matrix as nested lists: variances on the diagonal,
# the shared covariance term off the diagonal.
C = [
    [cov_xx, cov_xy],
    [cov_xy, cov_yy],
]
C

[[0.6856935123042505, -0.03926845637583892],
 [-0.03926845637583892, 0.18800402684563763]]

In [11]:
# Eigen-decomposition of the covariance matrix C: `values` holds the eigenvalues and
# `vectors` the eigenvectors as columns (vectors[:, i] pairs with values[i]).
values, vectors = eig(C)

In [12]:
# Eigenvalues of C, in the order eig returned them.
values

array([0.6887728 , 0.18492474])

In [13]:
# Eigenvectors of C, one per column.
vectors

array([[ 0.99693955,  0.07817635],
       [-0.07817635,  0.99693955]])

In [14]:
# Select the principal component: the eigenvector paired with the largest eigenvalue.
# np.argmax locates it in a single pass, replacing the max() + list(...).index()
# double scan (both pick the first occurrence on ties).
value_index = int(np.argmax(values))
eig_value = values[value_index]
# Eigenvectors are stored column-wise, so take column `value_index`.
eig_vector = vectors[:, value_index]

print(eig_value, eig_vector)

0.6887728010807849 [ 0.99693955 -0.07817635]


In [15]:
# Project every sample onto the selected eigenvector: score = x * v[0] + y * v[1].
# Vectorised Series arithmetic replaces the per-row `x[i]` / `y[i]` lookups, which are
# label-based and therefore only correct while the index is exactly 0..n-1.
# NOTE(review): x and y are not mean-centred before projecting, so these scores differ
# from mean-centred PCA scores by a constant offset -- confirm this is intended.
projected = x * eig_vector[0] + y * eig_vector[1]
# .to_numpy() drops the Series index so the frame gets a fresh RangeIndex and a single column 0.
res = pd.DataFrame(projected.to_numpy())
res.head(10)

Unnamed: 0,0
0,4.810774
1,4.650475
2,4.435452
3,4.343575
4,4.703263
5,5.078586
6,4.320122
7,4.718898
8,4.159823
9,4.642657
