In [1]:
# -*- coding: utf-8 -*-

### 기본 라이브러리 불러오기
import pandas as pd
import seaborn as sns

In [2]:
# titanic
# load_dataset 함수를 사용하여 데이터프레임으로 변환

df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [7]:
#  IPython 디스플레이 설정 - 출력할 열의 개수 한도 늘리기
pd.set_option('display.max_columns', 15)
print(df.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [8]:
# NaN값이 많은 deck 열을 삭제, embarked와 내용이 겹치는 embark_town 열을 삭제.

rdf = df.drop(['deck','embark_town'], axis = 1)
print(rdf.columns.values)

['survived' 'pclass' 'sex' 'age' 'sibsp' 'parch' 'fare' 'embarked' 'class'
 'who' 'adult_male' 'alive' 'alone']


In [9]:
# age 열에 나이 데이터가 없는 모든 행을 삭제 - age 열(891개 중 177개의 NaN 값)
rdf = df.dropna(subset = ['age'], how = 'any', axis = 0)
len(rdf)

714

In [None]:
# how : {'any', 'all'}, default 'any'
#     Determine if row or column is removed from DataFrame, when we have
#     at least one NA or all NA.

#     * 'any' : If any NA values are present, drop that row or column.
#     * 'all' : If all values are NA, drop that row or column.

In [10]:
rdf['embarked'].value_counts(dropna = True)

S    554
C    130
Q     28
Name: embarked, dtype: int64

In [11]:
most_freq = rdf['embarked'].value_counts(dropna = True).idxmax()

In [14]:
rdf.describe(include = 'all') # 수치형 데이터만 보임

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
count,714.0,714.0,714,714.0,714.0,714.0,714.0,714,714,714,714,184,712,714,714
unique,,,2,,,,,3,3,3,2,7,3,2,2
top,,,male,,,,,S,Third,man,True,C,Southampton,no,True
freq,,,453,,,,,556,355,413,413,51,554,424,404
mean,0.406162,2.236695,,29.699118,0.512605,0.431373,34.694514,,,,,,,,
std,0.49146,0.83825,,14.526497,0.929783,0.853289,52.91893,,,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,,,
25%,0.0,1.0,,20.125,0.0,0.0,8.05,,,,,,,,
50%,0.0,2.0,,28.0,0.0,0.0,15.7417,,,,,,,,
75%,1.0,3.0,,38.0,1.0,1.0,33.375,,,,,,,,


In [13]:
rdf.embarked.fillna(most_freq, inplace = True)
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     714 non-null    int64   
 1   pclass       714 non-null    int64   
 2   sex          714 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        714 non-null    int64   
 5   parch        714 non-null    int64   
 6   fare         714 non-null    float64 
 7   embarked     714 non-null    object  
 8   class        714 non-null    category
 9   who          714 non-null    object  
 10  adult_male   714 non-null    bool    
 11  deck         184 non-null    category
 12  embark_town  712 non-null    object  
 13  alive        714 non-null    object  
 14  alone        714 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 70.2+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [15]:
ndf = rdf[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]

In [16]:
# 원핫인코딩 - 범주형 데이터를 모형이 인식할 수 있도록 숫자형으로 변환
gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf, gender], axis = 1)

In [17]:
onehot_embarked = pd.get_dummies(ndf['embarked'], prefix = 'town')
ndf = pd.concat([ndf, onehot_embarked], axis = 1)

In [19]:
ndf.drop(['sex','embarked'], axis = 1, inplace  = True)
ndf.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,female,male,town_C,town_Q,town_S
0,0,3,22.0,1,0,0,1,0,0,1
1,1,1,38.0,1,0,1,0,1,0,0
2,1,3,26.0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,1,0,0,0,1
4,0,3,35.0,0,0,0,1,0,0,1


In [20]:
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  714 non-null    int64  
 1   pclass    714 non-null    int64  
 2   age       714 non-null    float64
 3   sibsp     714 non-null    int64  
 4   parch     714 non-null    int64  
 5   female    714 non-null    uint8  
 6   male      714 non-null    uint8  
 7   town_C    714 non-null    uint8  
 8   town_Q    714 non-null    uint8  
 9   town_S    714 non-null    uint8  
dtypes: float64(1), int64(4), uint8(5)
memory usage: 37.0 KB


In [21]:
# 데이터셋 구분 - 훈련용(train data)/ 검증용(test data)
# 속성(변수) 선택

x=ndf[['pclass', 'age', 'sibsp', 'parch', 'female', 'male', 'town_C', 'town_Q', 'town_S']]  #독립 변수 X
y=ndf['survived']                      #종속 변수 Y

In [22]:
# 설명 변수 데이터를 정규화(normalization)
from sklearn.preprocessing import StandardScaler

x = StandardScaler().fit(x).transform(x)

In [23]:
# train data 와 test data로 구분(7:3 비율)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

In [24]:
x_train, y_train, x_train.shape, y_train.shape

(array([[-1.47636364,  2.3629184 ,  0.52457013, ..., -0.47180795,
         -0.20203051,  0.53307848],
        [ 0.91123237, -0.46148866, -0.55170307, ..., -0.47180795,
         -0.20203051,  0.53307848],
        [ 0.91123237, -0.73704057, -0.55170307, ..., -0.47180795,
         -0.20203051,  0.53307848],
        ...,
        [-0.28256564, -0.32371271, -0.55170307, ..., -0.47180795,
         -0.20203051,  0.53307848],
        [-1.47636364,  1.4673747 , -0.55170307, ...,  2.11950647,
         -0.20203051, -1.87589641],
        [-1.47636364, -0.9437045 , -0.55170307, ...,  2.11950647,
         -0.20203051, -1.87589641]]),
 438    0
 753    0
 283    1
 292    0
 716    1
       ..
 461    0
 398    0
 666    0
 155    0
 329    1
 Name: survived, Length: 499, dtype: int64,
 (499, 9),
 (499,))

In [27]:
# KNN 분류 모형
from sklearn.neighbors import KNeighborsClassifier

In [28]:
# K = 5 설정
knn = KNeighborsClassifier(n_neighbors=5)

In [29]:
# train data로 학습
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [30]:
# test data로 y_hat 예측(분류)
knn.predict(x_test) # x_test 데이터로 입력

array([0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1], dtype=int64)

In [31]:
y_hat = knn.predict(x_test) # x_test 데이터 입력

In [32]:
# 모형 성능 평가
from sklearn import metrics
knn_metrics = metrics.confusion_matrix(y_test, y_hat)

In [34]:
# 평가 지표
knn_report = metrics.classification_report(y_test, y_hat)
print(knn_report)

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       125
           1       0.80      0.72      0.76        90

    accuracy                           0.81       215
   macro avg       0.81      0.80      0.80       215
weighted avg       0.81      0.81      0.81       215

