# 1. Data Understanding and Representation

## Import the dataset and understand the features present

In [18]:
import pandas as pd
# Load the Audi listings. header=0 takes the column names from the first CSV row and
# index_col=None keeps pandas' default integer index.
dataset = pd.read_csv("Audi.csv", header=0, index_col=None)
# DataFrame.info() writes its summary to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the summary.
dataset.info()
# Last expression of the cell: rendered as a table preview by the notebook.
dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10668 entries, 0 to 10667
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         10668 non-null  object 
 1   year          10668 non-null  int64  
 2   price         10668 non-null  int64  
 3   transmission  10668 non-null  object 
 4   mileage       10668 non-null  int64  
 5   fuelType      10668 non-null  object 
 6   tax           10668 non-null  int64  
 7   mpg           10668 non-null  float64
 8   engineSize    10668 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 750.2+ KB
None


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0
...,...,...,...,...,...,...,...,...,...
10663,A3,2020,16999,Manual,4018,Petrol,145,49.6,1.0
10664,A3,2020,16999,Manual,1978,Petrol,150,49.6,1.0
10665,A3,2020,17199,Manual,609,Petrol,150,49.6,1.0
10666,Q3,2017,19499,Automatic,8646,Petrol,150,47.9,1.4


**Features Present**:
The dataset contains the following features, listed with their descriptions and types:
1. model: model of the car (nominal)
2. year: car's registration year (numerical)
3. price: listed price of car (numerical)
4. transmission: type of gearbox in car (nominal)
5. mileage: total distance the car has been driven (numerical)
6. fuelType: type of fuel used (nominal)
7. tax: road tax (numerical)
8. mpg: miles per gallon (numerical)
9. engineSize: engine size in litres (numerical)

## Dropping Nominal Features

In [19]:
# Remove the three nominal (text) columns so that only numerical features remain.
# The result is rebound to `dataset` instead of using inplace=True, and
# errors="ignore" skips labels that are already gone, so re-running this cell is a
# no-op rather than a KeyError.
dataset = dataset.drop(columns=["model", "transmission", "fuelType"], errors="ignore")
dataset

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
0,2017,12500,15735,150,55.4,1.4
1,2016,16500,36203,20,64.2,2.0
2,2016,11000,29946,30,55.4,1.4
3,2017,16800,25952,145,67.3,2.0
4,2019,17300,1998,145,49.6,1.0
...,...,...,...,...,...,...
10663,2020,16999,4018,145,49.6,1.0
10664,2020,16999,1978,150,49.6,1.0
10665,2020,17199,609,150,49.6,1.0
10666,2017,19499,8646,150,47.9,1.4


## Represent the features in matrix format

In [20]:
# Convert the numerical features to a 2-D NumPy array (rows = cars, columns = features).
# dtype=float makes the float64 result explicit instead of relying on the int64/float64
# column mix being upcast, which the later mean-centering step needs.
# copy=True guarantees a fresh, writable array that never aliases the DataFrame's data.
matrix = dataset.to_numpy(dtype=float, copy=True)
matrix

array([[2.0170e+03, 1.2500e+04, 1.5735e+04, 1.5000e+02, 5.5400e+01,
        1.4000e+00],
       [2.0160e+03, 1.6500e+04, 3.6203e+04, 2.0000e+01, 6.4200e+01,
        2.0000e+00],
       [2.0160e+03, 1.1000e+04, 2.9946e+04, 3.0000e+01, 5.5400e+01,
        1.4000e+00],
       ...,
       [2.0200e+03, 1.7199e+04, 6.0900e+02, 1.5000e+02, 4.9600e+01,
        1.0000e+00],
       [2.0170e+03, 1.9499e+04, 8.6460e+03, 1.5000e+02, 4.7900e+01,
        1.4000e+00],
       [2.0160e+03, 1.5999e+04, 1.1855e+04, 1.5000e+02, 4.7900e+01,
        1.4000e+00]])

# 2. Implementing PCA using Covariance Matrices

## Center the dataset by subtracting the mean of each feature from it

In [21]:
import numpy as np
# Center every feature (column) on zero.
# matrix.mean(axis=0) is the vector of per-column means; broadcasting subtracts it
# from each row in a single vectorized step, replacing the per-column Python loop.
# Rebinding `matrix` builds a new array rather than modifying the existing one in place.
matrix = matrix - matrix.mean(axis=0)
matrix

array([[-1.00674916e-01, -1.03966850e+04, -9.09224400e+03,
         2.39885639e+01,  4.62997750e+00, -5.30708661e-01],
       [-1.10067492e+00, -6.39668504e+03,  1.13757560e+04,
        -1.06011436e+02,  1.34299775e+01,  6.92913386e-02],
       [-1.10067492e+00, -1.18966850e+04,  5.11875600e+03,
        -9.60114361e+01,  4.62997750e+00, -5.30708661e-01],
       ...,
       [ 2.89932508e+00, -5.69768504e+03, -2.42182440e+04,
         2.39885639e+01, -1.17002250e+00, -9.30708661e-01],
       [-1.00674916e-01, -3.39768504e+03, -1.61812440e+04,
         2.39885639e+01, -2.87002250e+00, -5.30708661e-01],
       [-1.10067492e+00, -6.89768504e+03, -1.29722440e+04,
         2.39885639e+01, -2.87002250e+00, -5.30708661e-01]])

## Compute the covariance matrix of the centered dataset

In [22]:
# np.cov treats each ROW of its input as one variable by default, so the
# (samples x features) matrix is transposed to put the features in rows.
features_in_rows = matrix.T
# Result is a square (n_features x n_features) matrix with the variances on the diagonal.
covar = np.cov(features_in_rows)
# NOTE(review): the features are not rescaled before this step, so the variances can
# differ by many orders of magnitude between columns; the leading principal components
# will presumably be dominated by the largest-variance features. Confirm this is intended.
covar

array([[ 4.69802898e+00,  1.50467372e+04, -4.02315568e+04,
         1.35496126e+01, -9.85995164e+00, -4.12750635e-02],
       [ 1.50467372e+04,  1.37237520e+08, -1.47416129e+08,
         2.80256150e+05, -9.10735136e+04,  4.17640150e+03],
       [-4.02315568e+04, -1.47416129e+08,  5.52497116e+08,
        -2.62953810e+05,  1.20264703e+05,  1.00215065e+03],
       [ 1.35496126e+01,  2.80256150e+05, -2.62953810e+05,
         4.51184837e+03, -5.53139078e+02,  1.59198606e+01],
       [-9.85995164e+00, -9.10735136e+04,  1.20264703e+05,
        -5.53139078e+02,  1.67696842e+02, -2.85482417e+00],
       [-4.12750635e-02,  4.17640150e+03,  1.00215065e+03,
         1.59198606e+01, -2.85482417e+00,  3.63556749e-01]])

# 3. Eigenvalue-Eigenvector Equation

##  Formulate and solve the eigenvalue-eigenvector equation