<a href="https://colab.research.google.com/github/N786h/MachineLearning/blob/main/PCA_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [78]:
# Import libraries
import numpy as np
import pandas as pd

In [79]:
# Load the data from github
import requests
from io import StringIO

url = "https://raw.githubusercontent.com/Toulik-Das/Placement-Data-Analysis/master/Placement_Data.csv"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0"}
req = requests.get(url, headers=headers)
data = StringIO(req.text)

data = pd.read_csv(data)
data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [80]:
# Select the specific columns
data = data[['hsc_p', 'ssc_p', 'degree_p', 'status']]
data.head()

Unnamed: 0,hsc_p,ssc_p,degree_p,status
0,91.0,67.0,58.0,Placed
1,78.33,79.33,77.48,Placed
2,68.0,65.0,64.0,Placed
3,52.0,56.0,52.0,Not Placed
4,73.6,85.8,73.3,Placed


In [81]:
# Map the status column
data['status'] = data['status'].map({'Placed':1, 'Not Placed': 0})
data.head()

Unnamed: 0,hsc_p,ssc_p,degree_p,status
0,91.0,67.0,58.0,1
1,78.33,79.33,77.48,1
2,68.0,65.0,64.0,1
3,52.0,56.0,52.0,0
4,73.6,85.8,73.3,1


In [82]:
# Plot the data 
import plotly.express as px
fig = px.scatter_3d(data, x=data['hsc_p'], y=data['ssc_p'], z=data['degree_p'],
              color=data['status'].astype('str'))
fig.update_traces(marker=dict(size=12, line=dict(width=2, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.show()

In [83]:
# Normalization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
data.iloc[:, 0:3] = sc.fit_transform(data.iloc[:, 0:3])
data.head()

Unnamed: 0,hsc_p,ssc_p,degree_p,status
0,2.268812,-0.028087,-1.140102,1
1,1.103448,1.113369,1.513267,1
2,0.153313,-0.213238,-0.322843,1
3,-1.318339,-1.046417,-1.957362,0
4,0.668391,1.712332,0.943909,1


In [84]:
# Calculate covariance matrix
covariance_matrix = np.cov([data.iloc[:,0], data.iloc[:,1], data.iloc[:,2]])
covariance_matrix

array([[1.0046729 , 0.51386216, 0.4362348 ],
       [0.51386216, 1.0046729 , 0.54091991],
       [0.4362348 , 0.54091991, 1.0046729 ]])

In [85]:
# Calculate eigen values and eigen vectors
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)

In [86]:
eigen_values

array([1.99996516, 0.57022845, 0.44382508])

In [87]:
eigen_vectors

array([[ 0.56011754,  0.75805521,  0.33409674],
       [ 0.59975742, -0.09287355, -0.7947739 ],
       [ 0.57145375, -0.6455438 ,  0.50666934]])

In [88]:
# Select the principle compenent(PC) from eigen vectors
pc = eigen_vectors[0:2]
pc

array([[ 0.56011754,  0.75805521,  0.33409674],
       [ 0.59975742, -0.09287355, -0.7947739 ]])

In [89]:
# dot product of data and pc
transformed_data = np.dot(data.iloc[:,0:3], pc.T)
# Create new data after transformed
new_data = pd.DataFrame(transformed_data, columns=['PC1', 'PC2'])
new_data['status'] = data['status'].values
new_data.head()

Unnamed: 0,PC1,PC2,status
0,0.868606,2.269469,1
1,1.967633,-0.644306,1
2,-0.183634,0.368342,1
3,-2.185615,0.862161,0
4,1.987777,-0.508352,1


In [90]:
# Plot
fig = px.scatter(x=new_data['PC1'], y=new_data['PC2'], color=new_data['status'].astype('str'))
fig.update_traces(marker=dict(size=12, line=dict(width=2, color='DarkSlateGrey')), selector=dict(mode='markers'))
fig.show()