<a href="https://colab.research.google.com/github/MpRonald/Machine-Learning/blob/main/PCA_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# imports 
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
census = pd.read_csv('https://raw.githubusercontent.com/MpRonald/datasets/main/census.csv')
census.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Preprocessing

In [3]:
X_census = census.iloc[:,0:14].values
y_census = census.iloc[:,14].values

In [4]:
# label encoder
lb = LabelEncoder()
lb_work = lb
lb_education = lb
lb_marital = lb
lb_occupation = lb
lb_relationship = lb
lb_race = lb
lb_sex = lb
lb_country = lb

In [5]:
X_census[:,1] = lb_work.fit_transform(X_census[:,1])
X_census[:,3] = lb_education.fit_transform(X_census[:,3])
X_census[:,5] = lb_marital.fit_transform(X_census[:,5])
X_census[:,6] = lb_occupation.fit_transform(X_census[:,6])
X_census[:,7] = lb_relationship.fit_transform(X_census[:,7])
X_census[:,8] = lb_race.fit_transform(X_census[:,8])
X_census[:,9] = lb_sex.fit_transform(X_census[:,9])
X_census[:,13] = lb_country.fit_transform(X_census[:,13])

In [6]:
scaler = StandardScaler()
X_census = scaler.fit_transform(X_census)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_census, y_census, test_size=0.2, random_state=123)

# PCA (Principal Component Analysis)

In [8]:
from sklearn.decomposition import PCA

In [9]:
# this technique join the attributes in 6 components
pca = PCA(n_components=10)

In [10]:
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [11]:
X_train_pca.shape, X_test_pca.shape

((26048, 10), (6513, 10))

In [12]:
# bellow we have percent about the 6 attributes
pca.explained_variance_ratio_

array([0.15203426, 0.10050429, 0.0901517 , 0.08020869, 0.07709463,
       0.07345715, 0.06766553, 0.0665213 , 0.06112587, 0.05948296])

In [13]:
# for example, bellow we sum the 6 components and we have 
# the percent about attributes explained
pca.explained_variance_ratio_.sum()

0.8282463844456937

In [14]:
random = RandomForestClassifier(n_estimators=40, random_state=123).fit(X_train_pca, y_train)

In [15]:
y_pred = random.predict(X_test_pca)

In [16]:
accuracy_score(y_test, y_pred)

0.844311377245509

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       <=50K       0.87      0.94      0.90      4927
        >50K       0.75      0.55      0.63      1586

    accuracy                           0.84      6513
   macro avg       0.81      0.74      0.77      6513
weighted avg       0.84      0.84      0.84      6513



# PCA Kernel

In [18]:
from sklearn.decomposition import KernelPCA

In [19]:
k_pca = KernelPCA(n_components=6, kernel='rbf')

In [20]:
X_train_kernel = k_pca.fit_transform(X_train)

In [21]:
X_test_kernel = k_pca.transform(X_test)

In [22]:
X_train_kernel.shape

(26048, 6)

In [23]:
X_test_kernel.shape

(6513, 6)

In [37]:
random = RandomForestClassifier(n_estimators=30, criterion='entropy',random_state=123).fit(X_train_kernel, y_train)

In [38]:
y_pred = random.predict(X_test_kernel)

In [39]:
accuracy_score(y_test, y_pred)

0.8225088284968525

In [40]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       <=50K       0.85      0.92      0.89      4927
        >50K       0.68      0.51      0.58      1586

    accuracy                           0.82      6513
   macro avg       0.77      0.72      0.74      6513
weighted avg       0.81      0.82      0.81      6513



# LDA (Linear Discriminat Analysis)

In [41]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [46]:
lda = LinearDiscriminantAnalysis(n_components=1)

In [47]:
X_train_lda = lda.fit_transform(X_train, y_train)
x_test_lda = lda.transform(X_test)

In [48]:
random = RandomForestClassifier(n_estimators=30, criterion='entropy',random_state=123).fit(X_train_lda, y_train)

In [49]:
y_pred = random.predict(x_test_lda)

In [50]:
accuracy_score(y_test, y_pred)

0.740672501151543

In [51]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       <=50K       0.83      0.83      0.83      4927
        >50K       0.47      0.45      0.46      1586

    accuracy                           0.74      6513
   macro avg       0.65      0.64      0.64      6513
weighted avg       0.74      0.74      0.74      6513

