In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Load the training CSV (path is relative to the notebook's working directory) into df.
df = pd.read_csv('../data/train.csv')

In [14]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [15]:
# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [16]:
# Count missing values per column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Null값 수정
Age: nan값 평균값으로
Cabin: nan값 ‘N’으로 변경
Embarked: nan값 ‘N’으로 변경

In [None]:
# Fill missing values by assigning the filled column back to df.
# Calling fillna(inplace=True) on a column selected via df[...] is chained
# assignment: pandas 2.x warns about it and under Copy-on-Write it no longer
# updates df, so the result is assigned explicitly instead.
df['Age'] = df['Age'].fillna(df['Age'].mean())  # numeric column: use the mean age
df['Cabin'] = df['Cabin'].fillna('N')           # text column: placeholder 'N'
df['Embarked'] = df['Embarked'].fillna('N')     # text column: placeholder 'N'

In [None]:
# Total number of missing values remaining across all columns.
print("null값 :", df.isnull().sum().sum())

In [None]:
# Re-check dtypes and non-null counts after filling missing values.
df.info()

### 문자열 컬럼 수정
Name, Ticket: 안씀
Sex, Cabin, Embarked: 수정

In [None]:
df['Sex'].value_counts() # value_counts() does not count NaN, so missing values must be handled beforehand
# Can be converted to numbers directly

In [None]:
df['Cabin'].value_counts()
# Only the first character should be kept; the rest has to be removed

In [None]:
df['Embarked'].value_counts()
# Can be converted to numbers directly

In [None]:
# Keep only the first character of each Cabin value
df['Cabin'] = df['Cabin'].str[:1]

In [None]:
# Number of rows for each (Sex, Survived) combination.
df.groupby(['Sex', 'Survived'])['Survived'].count()

In [None]:
# Bar plot of Survived by Sex.
# NOTE(review): seaborn 0.13+ warns when `palette` is passed without `hue` — confirm against the installed version.
sns.barplot(data=df, x='Sex', y='Survived', palette='pastel')

In [None]:
# Bar plot of Survived by Pclass.
# NOTE(review): seaborn 0.13+ warns when `palette` is passed without `hue` — confirm against the installed version.
sns.barplot(data=df, x='Pclass', y='Survived', palette='hls')

In [None]:
# Survived by Pclass, split by Sex; errwidth=0 draws the error bars with zero line width.
ax = sns.barplot(
    data=df,
    x='Pclass',
    y='Survived',
    hue='Sex',
    palette='husl',
    errwidth=0,
)
plt.grid()
# Annotate every bar group with its height.
for container in ax.containers:
    ax.bar_label(container)

In [None]:
df['Age'].value_counts()
# Ages should be grouped into age bands

In [None]:
def get_category(age):
    """Map a numeric age to a coarse age-group label.

    Parameters
    ----------
    age : float or int or None
        Passenger age. Missing values (None / NaN) and values <= -1 are
        reported as 'Unknown'.

    Returns
    -------
    str
        One of 'Unknown', 'Baby', 'Child', 'Teenager', 'Student',
        'Young Adult', 'Adult' or 'Elderly'.
    """
    # NaN compares False against every bound, so without this guard a missing
    # age would fall through every branch and be labelled 'Elderly'.
    if pd.isna(age) or age <= -1:
        return 'Unknown'

    # Inclusive upper bounds, checked in ascending order.
    upper_bounds = (
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (60, 'Adult'),
    )
    for upper, label in upper_bounds:
        if age <= upper:
            return label
    return 'Elderly'

In [None]:
# Label every passenger with an age group; the function is passed directly, no lambda wrapper needed.
df['Age_cat'] = df['Age'].apply(get_category)

In [None]:
# Display order of the age groups on the x axis (youngest to oldest, 'Unknown' first).
group_name = [
    'Unknown', 'Baby', 'Child', 'Teenager',
    'Student', 'Young Adult', 'Adult', 'Elderly',
]
plt.figure(figsize=(10, 5))
sns.barplot(
    data=df,
    x='Age_cat',
    y='Survived',
    hue='Sex',
    order=group_name,
    palette='hls',
)
plt.legend(loc='upper left')

In [None]:
# Age_cat was only needed for the plot; remove the helper column from df.
df.drop(columns=['Age_cat'], inplace=True)

### 머신러닝

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
def encode_features(dataDF):
    """Label-encode the 'Sex', 'Cabin' and 'Embarked' columns of ``dataDF``.

    Each column is overwritten on ``dataDF`` itself with the integer codes from
    a freshly fitted LabelEncoder, and the encoder's classes are printed.

    Parameters
    ----------
    dataDF : pandas.DataFrame
        Frame containing the three columns to encode; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataDF`` object that was passed in.
    """
    for column in ('Sex', 'Cabin', 'Embarked'):
        encoder = LabelEncoder()
        codes = encoder.fit_transform(dataDF[column])
        dataDF[column] = codes
        # Print the learned classes for this column.
        print(encoder.classes_)
    return dataDF

In [None]:
# Encode the categorical columns of df; encode_features modifies df and returns it.
df = encode_features(df)

In [None]:
# Preview the first two rows after encoding.
df.head(2)

In [None]:
def fillna(df):
    """Fill missing values in the 'Age', 'Cabin', 'Embarked' and 'Fare' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns; its columns are reassigned, so the
        caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with Age filled by its mean, Cabin and Embarked
        filled with 'N', and Fare filled with numeric 0.
    """
    # Assign the results back instead of df[col].fillna(..., inplace=True):
    # the chained in-place form warns in pandas 2.x and does not update df
    # under Copy-on-Write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    # Fare is numeric: fill with the number 0, not the string '0'.
    df['Fare'] = df['Fare'].fillna(0)
    return df

# Remove features that are not needed
def drop_features(df):
    """Drop the 'PassengerId', 'Name' and 'Ticket' columns from ``df`` in place.

    Returns the same ``df`` object that was passed in.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Label encoding
def format_features(df):
    """Reduce 'Cabin' to its first character, then label-encode three columns.

    'Sex', 'Cabin' and 'Embarked' are each overwritten on ``df`` with the
    integer codes of a freshly fitted LabelEncoder, and the encoder's classes
    are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    from sklearn.preprocessing import LabelEncoder

    # Keep only the leading character of each Cabin value before encoding.
    df['Cabin'] = df['Cabin'].str[:1]
    for column in ('Sex', 'Cabin', 'Embarked'):
        encoder = LabelEncoder()
        codes = encoder.fit_transform(df[column])
        df[column] = codes
        print(encoder.classes_)
    return df

# Run all preprocessing functions in sequence
def transform_features(df):
    """Apply fillna, drop_features and format_features to ``df``, in that order.

    Returns whatever the last step returns.
    """
    for step in (fillna, drop_features, format_features):
        df = step(df)
    return df

In [None]:
# Reload the raw data and split it into the target (y) and preprocessed features (X).
# NOTE(review): this reads 'titanic.csv' while the first load cell reads '../data/train.csv' —
# confirm both point at the same dataset.
df = pd.read_csv('titanic.csv')
y = df['Survived']
X = transform_features(df.drop(columns=['Survived']))

In [None]:
# Preview the first two rows of the preprocessed feature frame.
X.head(2)

In [None]:
# Split X and y into train and test sets with test_size=0.2 and a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier # tree가 여러개
from sklearn.linear_model import LogisticRegression # 이진분류에 사용(이름은 회귀지만 분류 알고리즘)
from sklearn.metrics import accuracy_score

In [None]:
# Create scikit-learn classifier instances: decision tree, random forest and logistic regression
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(solver='liblinear') # with fit_intercept=True the y-intercept is estimated rather than fixed

In [None]:
# DecisionTreeClassifier: train / predict / evaluate
dt_pred = dt_clf.fit(X_train, y_train).predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print("의사결정트리 정확도 :", dt_accuracy)

In [None]:
# RandomForestClassifier: train / predict / evaluate
rf_pred = rf_clf.fit(X_train, y_train).predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print("랜덤포레스트 정확도 :", rf_accuracy)

In [None]:
# LogisticRegression: train / predict / evaluate
lr_pred = lr_clf.fit(X_train, y_train).predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("로지스틱회귀 정확도 :", lr_accuracy)

### 교차 검증

In [None]:
from sklearn.model_selection import KFold

In [None]:
def exec_kfold(clf, folds=5):
    """Run unshuffled K-fold cross-validation of ``clf`` and print the accuracies.

    Uses the notebook-level ``X`` (features) and ``y`` (target). For every fold
    the classifier is refitted on the training rows and scored on the held-out
    rows; the per-fold accuracy and finally the mean accuracy are printed.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted on each fold.
    folds : int, default 5
        Number of splits passed to KFold.

    Returns
    -------
    None
    """
    splitter = KFold(n_splits=folds, shuffle=False)
    fold_accuracies = []

    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X), start=1):
        # Fit on this fold's training rows, then score on its held-out rows.
        clf.fit(X.iloc[train_idx], y.iloc[train_idx])
        fold_pred = clf.predict(X.iloc[test_idx])
        fold_accuracy = accuracy_score(y.iloc[test_idx], fold_pred)
        fold_accuracies.append(fold_accuracy)
        print(f"{fold_no}차 교차 검증 정확도 : {fold_accuracy:.4f}")

    print(f"평균 정확도 : {np.mean(fold_accuracies):.4f}")

In [None]:
# Cross-validate the decision tree with 5 folds; accuracies are printed by exec_kfold.
exec_kfold(dt_clf, folds=5)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validation scores of the decision tree on X, y with cv=5.
scores = cross_val_score(dt_clf, X, y, cv=5)

In [None]:
# Report each fold's accuracy, numbered from 1.
for fold_no, fold_accuracy in enumerate(scores, start=1):
    print(f"{fold_no}차 교차 검증 정확도 : {fold_accuracy:.4f}")

# cross_val_score() uses StratifiedKFold, so the results differ from plain KFold
print(f"평균 정확도 : {np.mean(scores):.4f}")

### GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid searched for the decision tree.
param = {
    'max_depth':[2, 3, 5, 10],
    'min_samples_split':[2, 3, 5],
    'min_samples_leaf':[1, 5, 8] # root node = top of the tree, leaf node = end of the tree; minimum number of samples required to be a leaf node; default 1
}

In [None]:
grid = GridSearchCV(dt_clf, param, cv=5, scoring='accuracy') # number of cases: 4*3*3 parameter combinations * 5 folds
grid.fit(X_train, y_train)

In [None]:
# Predict on the held-out test set with the fitted grid search.
pred = grid.predict(X_test)

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_