In [44]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

In [45]:
# Preprocessing: load the seaborn 'titanic' dataset and keep a working copy
# restricted to the target (`survived`) plus the columns used further down.
df = sns.load_dataset('titanic')
keep_cols = [
    'survived', 'pclass', 'sex', 'age', 'sibsp',
    'parch', 'fare', 'embarked', 'who',
]
# .copy() so later column assignments never touch the original `df`.
df_ti = df.loc[:, keep_cols].copy()
df_ti.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who
886,0,2,male,27.0,0,0,13.0,S,man
887,1,1,female,19.0,0,0,30.0,S,woman
888,0,3,female,,1,2,23.45,S,woman
889,1,1,male,26.0,0,0,30.0,C,man
890,0,3,male,32.0,0,0,7.75,Q,man


In [46]:
# Missing-value count per column (isnull is the pandas alias of isna).
df_ti.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
who           0
dtype: int64

In [47]:
# Case 1: impute every missing age with the mean age of adult passengers
# (rows whose `who` is 'man' or 'woman'), rounded to a whole number.
adult = df_ti[df_ti['who'].isin(['man', 'woman'])]
child = df_ti[df_ti['who'] == 'child']  # not referenced again in this cell
adult_mean_age = adult['age'].mean().round(0)
# Assign the filled column back instead of `df_ti.age.fillna(..., inplace=True)`:
# an inplace method on a chained selection warns in pandas 2.x and no longer
# updates `df_ti` once Copy-on-Write is the default.
df_ti['age'] = df_ti['age'].fillna(adult_mean_age)
# Show rows 30-44 as a spot check of the imputed values.
df_ti.head(45).tail(15)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who
30,0,1,male,40.0,0,0,27.7208,C,man
31,1,1,female,33.0,1,0,146.5208,C,woman
32,1,3,female,33.0,0,0,7.75,Q,woman
33,0,2,male,66.0,0,0,10.5,S,man
34,0,1,male,28.0,1,0,82.1708,C,man
35,0,1,male,42.0,1,0,52.0,S,man
36,1,3,male,33.0,0,0,7.2292,C,man
37,0,3,male,21.0,0,0,8.05,S,man
38,0,3,female,18.0,2,0,18.0,S,woman
39,1,3,female,14.0,1,0,11.2417,C,child


In [48]:
# Frequency of each embarkation port; used to pick the fill value for the gaps.
df_ti['embarked'].value_counts()

embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [49]:
# Fill the missing embarkation ports with 'S', the most frequent value in the
# counts above. The result is assigned back rather than using
# `df_ti.embarked.fillna(..., inplace=True)`: an inplace method on a chained
# selection warns in pandas 2.x and is a no-op on `df_ti` under Copy-on-Write.
df_ti['embarked'] = df_ti['embarked'].fillna('S')
# Confirm that no column has missing values left.
df_ti.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
who         0
dtype: int64

In [50]:
# Integer-encode the two string columns. LabelEncoder numbers the classes in
# sorted order, so sex becomes female=0 / male=1 and embarked C=0 / Q=1 / S=2.
# The same encoder object is refit for each column, so afterwards `le` only
# remembers the `embarked` classes.
le = LabelEncoder()
df_ti['sex'] = le.fit_transform(df_ti['sex'])
df_ti['embarked'] = le.fit_transform(df_ti['embarked'])
# Drop the helper column by name instead of by position (`iloc[:, :-1]`):
# the positional form removes whichever column happens to be last, so running
# the cell a second time would silently delete `embarked`, then `fare`, ...
# errors='ignore' makes a re-run a harmless no-op once `who` is gone.
df_ti = df_ti.drop(columns=['who'], errors='ignore')
df_ti.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
886,0,2,1,27.0,0,0,13.0,2
887,1,1,0,19.0,0,0,30.0,2
888,0,3,0,33.0,1,2,23.45,2
889,1,1,1,26.0,0,0,30.0,0
890,0,3,1,32.0,0,0,7.75,1


In [51]:
# Feature matrix: every column after the first one (`survived`), as a NumPy array.
X = df_ti.iloc[:, 1:].to_numpy()
# Target vector: the survival flag.
y = df_ti['survived'].to_numpy()

In [52]:
# Standardize every feature column before PCA.
# NOTE(review): the scaler is fit on all rows, i.e. before any train/test
# split, so held-out rows contribute to the statistics it learns.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [53]:
def pca_accuracy(X, y, test_size=0.2, random_state=2023):
    """Return the hold-out accuracy of a random forest trained on ``X``.

    The function performs no PCA itself; it scores whatever feature matrix it
    is given, so callers pass in the already-projected data.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels; also used to stratify the split.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 2023
        Seed used for both the split and the classifier, so repeated calls
        with the same data give the same score.

    Returns
    -------
    The value of ``RandomForestClassifier.score`` on the held-out part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=test_size, random_state=random_state
    )
    rfc = RandomForestClassifier(random_state=random_state)
    rfc.fit(X_train, y_train)
    return rfc.score(X_test, y_test)

In [55]:
# Sweep the number of principal components from 2 to 6 and report, for each,
# the cumulative explained-variance ratio and the random-forest accuracy.
# NOTE(review): PCA is fit on all of X_std before pca_accuracy splits the
# rows, so the held-out rows also shape the projection.
for n in range(2, 7):
    pca = PCA(n_components=n)
    # Despite its name, `wine_pca` holds the projected Titanic features.
    wine_pca = pca.fit_transform(X_std)
    explained = sum(pca.explained_variance_ratio_)
    acc = pca_accuracy(wine_pca, y)
    report = f'PCA{n} : 설명력 = {explained:.4f}\t정확도 = {acc:.4f}'
    print(report)

PCA2 : 설명력 = 0.5095	정확도 = 0.7598
PCA3 : 설명력 = 0.6490	정확도 = 0.7430
PCA4 : 설명력 = 0.7687	정확도 = 0.7542
PCA5 : 설명력 = 0.8672	정확도 = 0.7709
PCA6 : 설명력 = 0.9472	정확도 = 0.7598
