## Lab 12-Part2: PCA

In [32]:
import pandas as pd
import scipy as sp
import numpy as np
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
import time

## 0. Data Loading

In [33]:
# Load `adult.data` into a DataFrame and give its 15 columns explicit names.
# `names=` supplies the header (assumes the file itself has no header row —
# TODO confirm against the file).
names = ['age', 'workclass', 'fnlwgt', 'edu', 'edu-num', 'maritalstatus', 
         'occupation', 'relationship', 'race', 'sex','capital-gain',
         'capital-loss','hours-per-week','native-country','income']
# The path is relative, so adult.data must be in the notebook's working directory.
df = pd.read_csv('adult.data', names=names)
print( "Total columns: ", len(df.columns))
# Last expression of the cell, so the notebook renders the first rows as a table.
df.head()

Total columns:  15


Unnamed: 0,age,workclass,fnlwgt,edu,edu-num,maritalstatus,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [34]:
# Column dtypes and non-null counts; the `object` columns are the ones the
# label-encoding loop further down converts to integers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   edu             32561 non-null  object
 4   edu-num         32561 non-null  int64 
 5   maritalstatus   32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [35]:
from sklearn import preprocessing

# Every feature/target value must be numeric for the scaler and classifier
# used later in this notebook.
# Replace each string-valued (dtype `object`) column with integer class codes.
# NOTE(review): the codes presumably start at 0 (not 1) — confirm against the
# scikit-learn LabelEncoder documentation.
# This overwrites `df` in place, so the string values shown by the earlier
# `df.head()` are no longer in the frame once this cell has run.
# One encoder object is reused for all columns; each fit_transform call is
# given a single column.
label_encoding = preprocessing.LabelEncoder()
for column_name in df.columns:
    if df[column_name].dtype == object:
        df[column_name] = label_encoding.fit_transform(df[column_name])
    else:
        # non-object (int64) columns are left unchanged
        pass

In [36]:
# Separate the target column from the feature columns.
# `income` was an object column, so the encoding loop above has already
# replaced its strings with integer codes by the time it is read here.
y = df['income']      # integer-coded target; two labels: <=50K, >50K
# Everything except `income` (the remaining 14 columns) is used as a feature.
X = df.drop('income', axis=1)

In [37]:
# Standardize the feature columns of X with StandardScaler.
# X is rebound here from the DataFrame to the scaler's output.
# NOTE(review): the scaler is fit on all rows before the train/test split
# below, so the test rows contribute to the scaling statistics.
X = StandardScaler().fit_transform(X)

## 1. Linear SVC with all features

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=1 seeds the shuffle so
# the split can be repeated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

# Sanity check: number of training rows, then number of test rows.
print('Train/Test sizes:', y_train.shape[0], y_test.shape[0])

Train/Test sizes: 26048 6513


In [39]:
# Baseline: fit a linear SVM on the standardized full feature set and time it.
# The timed window covers constructing the classifier and fit(); prediction
# happens afterwards and is not included.
start_time = time.time()
linear_svc_clf = LinearSVC(C=1, loss='hinge', random_state =42)
linear_svc_clf.fit(X_train, y_train)

stop_time = time.time()

# Prints test accuracy (rounded to 5 decimals), then elapsed seconds (3 decimals).
# NOTE(review): predictions are passed first and y_test second — check this
# against accuracy_score's documented (y_true, y_pred) argument order.
print(round(accuracy_score(linear_svc_clf.predict(X_test), y_test), 5), 
      round(stop_time - start_time, 3))

0.82328 0.356


### Important:  You should NOT modify code above this line, but you should understand the code

## 2. Linear SVC with reduced features

In [40]:
# Set up PCA with 5 components, then fit it on X and project X onto those
# components, so X_pca has five features instead of the original 14.
from sklearn.decomposition import PCA

# random_state pins the result in case scikit-learn's automatic solver choice
# lands on the randomized SVD, so the cells below print the same values on
# every Restart & Run All.
pca = PCA(n_components=5, random_state=42)
X_pca = pca.fit_transform(X)

In [41]:
# Print the shape of the transformed data; expect (32561, 5): one row per
# sample and one column per retained component.
print(X_pca.shape)


(32561, 5)


In [42]:
# Print the five principal components. pca.components_ holds one row per
# component and one column per original feature, so its shape is 5 x 14
# (not 14 x 5).
print(pca.components_)


[[ 0.28541694  0.21065365 -0.04676461  0.10302999  0.21834551 -0.32234576
   0.16541888 -0.52103456  0.15758699  0.45929585  0.14064915  0.11113281
   0.37618321  0.0387983 ]
 [ 0.12631099 -0.09547879  0.12626212 -0.60532428 -0.60940705 -0.03001548
  -0.11388697 -0.23351082 -0.02222681  0.29338617 -0.15222615 -0.06395512
  -0.05992553 -0.18807097]
 [ 0.39600735 -0.53025939 -0.19163503  0.13205233  0.02364487 -0.27321448
  -0.59239198 -0.04436818  0.1155245  -0.08569312  0.02142513  0.0614523
  -0.10388549  0.20981541]
 [ 0.19336557 -0.12870247  0.15209818  0.08687924  0.13749713 -0.10846346
  -0.0761857   0.01541317 -0.63313621 -0.0550659   0.24325401  0.03864584
   0.09825907 -0.63595393]
 [-0.39845542 -0.33240602  0.51622212  0.24535056  0.13728109  0.34367476
  -0.26585163 -0.2353043   0.05315744  0.35952005 -0.07330746  0.03787648
   0.04646959 -0.00229979]]


In [43]:
# Print the explained variance of each of the five components,
# listed from the first component to the fifth.
print(pca.explained_variance_)


[2.11959788 1.41186323 1.25900839 1.12373426 1.07314576]


In [44]:
# Split the PCA-reduced features using the same test fraction and seed as the
# full-feature split; y_train and y_test are reassigned by this call.
# NOTE(review): `pca` was fit on all of X before this split, so the held-out
# rows influenced the projection.
(X_pca_train, X_pca_test,
 y_train, y_test) = train_test_split(
    X_pca,
    y,
    test_size=0.20,
    random_state=1,
)

In [48]:
# Train the same LinearSVC configuration on the PCA-reduced data, then report
# the test accuracy and how long the fit took.
# Note: this rebinds `linear_svc_clf`, replacing the full-feature model that
# was fitted in section 1.

# perf_counter() is a monotonic, high-resolution clock, better suited to
# timing a sub-second fit than the wall clock returned by time.time().
start_time = time.perf_counter()
linear_svc_clf = LinearSVC(C=1, loss='hinge', random_state=42)
linear_svc_clf.fit(X_pca_train, y_train)
stop_time = time.perf_counter()

# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first. Prediction stays outside the timed window.
y_pca_pred = linear_svc_clf.predict(X_pca_test)
print('Accuracy:', round(accuracy_score(y_test, y_pca_pred), 5))
print('Time:', round(stop_time - start_time, 3))

Accuracy: 0.809
Time: 0.07
