# Dimension Reduction using Singular Value Decomposition (SVD)

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the University_Clustering workbook. The default is the original
# author's local path; set the UNIVERSITY_DATA_PATH environment variable to
# point at your own copy instead of editing this cell.
DATA_PATH = os.environ.get(
    'UNIVERSITY_DATA_PATH',
    r'D:\Data Scientist\Dimension reduction\Principle component analysis(PCA)\Dataset\University_Clustering.xlsx',
)
df = pd.read_excel(DATA_PATH)

In [3]:
# Preview the first 10 rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,UnivID,Univ,State,SAT,Top10,Accept,SFRatio,Expenses,GradRate
0,1,Brown,RI,1310.0,89,22,13.0,22704,94.0
1,2,CalTech,CA,1415.0,100,25,6.0,63575,81.0
2,3,CMU,PA,1260.0,62,59,9.0,25026,72.0
3,4,Columbia,NY,1310.0,76,24,12.0,31510,
4,5,Cornell,NY,1280.0,83,33,13.0,21864,90.0
5,6,Dartmouth,NH,1340.0,89,23,10.0,32162,95.0
6,7,Duke,NC,1315.0,90,30,12.0,31585,95.0
7,8,Georgetown,DC,,74,24,12.0,20126,92.0
8,9,Harvard,MA,1400.0,91,14,11.0,39525,97.0
9,10,JohnsHopkins,MD,1305.0,75,44,7.0,58691,87.0


In [4]:
# Column dtypes and non-null counts; used here to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   UnivID    25 non-null     int64  
 1   Univ      25 non-null     object 
 2   State     25 non-null     object 
 3   SAT       24 non-null     float64
 4   Top10     25 non-null     int64  
 5   Accept    25 non-null     int64  
 6   SFRatio   24 non-null     float64
 7   Expenses  25 non-null     int64  
 8   GradRate  24 non-null     float64
dtypes: float64(3), int64(4), object(2)
memory usage: 1.9+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,UnivID,SAT,Top10,Accept,SFRatio,Expenses,GradRate
count,25.0,24.0,25.0,25.0,24.0,25.0,24.0
mean,13.0,1266.916667,76.48,39.2,12.708333,27388.0,86.666667
std,7.359801,110.663578,19.433905,19.727308,4.154402,14424.883165,9.24858
min,1.0,1005.0,28.0,14.0,6.0,8704.0,67.0
25%,7.0,1236.25,74.0,24.0,10.75,15140.0,80.75
50%,13.0,1287.5,81.0,36.0,12.0,27553.0,90.0
75%,19.0,1345.0,90.0,50.0,14.25,34870.0,94.0
max,25.0,1415.0,100.0,90.0,25.0,63575.0,97.0


In [6]:
# UnivID looks like a plain row identifier (1..25), so it is left out of the
# working copy used for the decomposition.
df1 = df.drop(columns='UnivID')

In [7]:
# Preview the first 10 rows instead of rendering the whole frame.
df1.head(10)

Unnamed: 0,Univ,State,SAT,Top10,Accept,SFRatio,Expenses,GradRate
0,Brown,RI,1310.0,89,22,13.0,22704,94.0
1,CalTech,CA,1415.0,100,25,6.0,63575,81.0
2,CMU,PA,1260.0,62,59,9.0,25026,72.0
3,Columbia,NY,1310.0,76,24,12.0,31510,
4,Cornell,NY,1280.0,83,33,13.0,21864,90.0
5,Dartmouth,NH,1340.0,89,23,10.0,32162,95.0
6,Duke,NC,1315.0,90,30,12.0,31585,95.0
7,Georgetown,DC,,74,24,12.0,20126,92.0
8,Harvard,MA,1400.0,91,14,11.0,39525,97.0
9,JohnsHopkins,MD,1305.0,75,44,7.0,58691,87.0


In [8]:
# Count the missing entries in each column.
df1.isnull().sum()

Univ        0
State       0
SAT         1
Top10       0
Accept      0
SFRatio     1
Expenses    0
GradRate    1
dtype: int64

In [None]:
# Build a scikit-learn pipeline: impute missing values, standardise, then apply truncated SVD

In [9]:
from sklearn.pipeline import make_pipeline

In [10]:
# Truncated SVD keeping 5 components. The default solver is randomized, so
# random_state is pinned to make the fitted components reproducible across runs.
svd = TruncatedSVD(n_components=5, random_state=42)

In [11]:
# Labels of every column whose dtype is not `object` (the numeric columns here).
num_features = df1.select_dtypes(exclude='object').columns

In [12]:
# Inspect the selected column labels.
num_features

Index(['SAT', 'Top10', 'Accept', 'SFRatio', 'Expenses', 'GradRate'], dtype='object')

In [13]:
# Chain mean imputation, standardisation and the SVD step into one estimator.
num_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    svd,
)

In [14]:
# Fit the pipeline on the numeric columns and keep the object that `fit` returns.
processed = num_pipeline.fit(df1.loc[:, num_features])

In [15]:
# Display the object returned by `num_pipeline.fit(...)`.
processed

In [16]:
import joblib

In [17]:
# Persist the fitted object to a file named 'Dim_Red_svd' (relative path).
joblib.dump(processed, filename='Dim_Red_svd')

['Dim_Red_svd']

In [18]:
# Show the current working directory; the relative filename 'Dim_Red_svd'
# passed to joblib.dump / joblib.load is resolved against it.
import os
os.getcwd()

'C:\\Users\\Administrator'

In [19]:
# Reload the object stored in 'Dim_Red_svd'. joblib files are pickle-based,
# so only load files that come from a trusted source.
model = joblib.load(filename='Dim_Red_svd')

In [20]:
# Display the object reloaded from 'Dim_Red_svd'.
model

In [21]:
# Project the numeric columns with the reloaded pipeline, then wrap the
# resulting array in a DataFrame (default integer column labels).
svd_scores = model.transform(df1[num_features])
svd_res = pd.DataFrame(svd_scores)

In [22]:
# Preview the first 10 rows of the component scores instead of the whole frame.
svd_res.head(10)

Unnamed: 0,0,1,2,3,4
0,-1.008824,-1.069154,0.078815,0.05332,-0.124905
1,-2.822327,2.25232,0.844907,0.13634,-0.091849
2,1.112562,1.626748,-0.25696,1.080296,-0.175069
3,-0.683955,0.031322,0.150691,-0.081396,-0.62766
4,-0.310925,-0.640049,0.006737,0.165489,0.018108
5,-1.695867,-0.348666,-0.255731,0.009171,-0.055605
6,-1.245805,-0.496317,-0.04313,-0.218671,0.29079
7,-0.389018,-0.782644,-0.464465,0.064073,-0.549292
8,-2.373384,-0.392515,0.112782,-0.452592,-0.243764
9,-1.402687,2.116496,-0.44603,-0.641238,0.218553


In [None]:
# Feature extraction: keep the first three SVD components alongside the university name

In [28]:
# Pair each university name with its first three SVD components.
leading_components = svd_res.iloc[:, :3]
final = pd.concat([df['Univ'], leading_components], axis='columns')
final.columns = ['Univ', 'svd0', 'svd1', 'svd2']
final

Unnamed: 0,Univ,svd0,svd1,svd2
0,Brown,-1.008824,-1.069154,0.078815
1,CalTech,-2.822327,2.25232,0.844907
2,CMU,1.112562,1.626748,-0.25696
3,Columbia,-0.683955,0.031322,0.150691
4,Cornell,-0.310925,-0.640049,0.006737
5,Dartmouth,-1.695867,-0.348666,-0.255731
6,Duke,-1.245805,-0.496317,-0.04313
7,Georgetown,-0.389018,-0.782644,-0.464465
8,Harvard,-2.373384,-0.392515,0.112782
9,JohnsHopkins,-1.402687,2.116496,-0.44603
