<!-- 타이타닉 생존자 예측 문제
- 데이터의 결측치, 중복 변수값에 대해 처리하고
- 분류모델을 사용하여 Accuracy, F1 score, AUC 값을 산출하시오.
데이터 설명
survival : 0 = No, 1 = Yes
pclass : 객실 등급(1,2,3)
sex : 성별
age : 나이
sibsp : 타이타닉호에 탑승한 형제/배우자의 수
parch : 타이타닉호에 탑승한 부모/자녀의 수
fare : 요금
embarked : 탑승지 이름(C, Q, S) Cherbourg / Queenstown / Southampton
(중복)class : 객실 등급(First, Second, Third)
who : man, woman, child
adult_male : 성인남자인지 여부(True=성인남자, False 그외)
deck : 선실번호 첫 알파벳(A,B,C,D,E,F,G)
(중복) embark_town : 탑승지 이름(Cherbourg, Queenstown, Southampton)
(중복) alive : 생존여부(no:사망, yes:생존)
alone : 혼자 탑승했는지 여부(True=혼자, False=가족과 함께) -->

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns

# Load the Titanic dataset via seaborn and separate the label column
# ('survived') from the feature columns.
df = sns.load_dataset('titanic')
x = df.drop(columns='survived')
y = df['survived']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# survival ratio the same in both splits; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2023,
)

# train_test_split already returns DataFrames for x and Series for y, so the
# features need no re-wrapping. The labels are turned into one-column frames
# named 'target' because the rest of the script reads y_train['target'].
# Row indices are kept exactly as produced by the split; nothing in the
# visible script relies on a reset index.
y_train = y_train.to_frame(name='target')
y_test = y_test.to_frame(name='target')
# 1. 라이브러리 및 데이터 확인
# print(x_train.head(3))
# print(x_test.head(3))
# print(y_train.head(3))

# print(x_train.info())
# print(x_test.info())
# print(y_train.info())
# object, category

# print(x_train.isnull().sum())
# print(x_test.isnull().sum())

# print(x_train.describe(include='category').T)
# print(x_test.describe(include='category').T)
#print(y_train.value_counts())

# 데이터 탐색(EDA)
# Remove columns that duplicate other information: 'class' repeats pclass,
# 'embark_town' repeats embarked, and 'alive' repeats the label itself (it
# would leak the answer). 'deck' is dropped as well -- presumably because it
# is mostly missing; confirm against the null counts.
redundant_cols = ['class', 'deck', 'embark_town', 'alive']
x_train = x_train.drop(columns=redundant_cols)
x_test = x_test.drop(columns=redundant_cols)

# Learn the imputation values from the training split only, then apply the
# very same values to both splits so the test data never influences them and
# both splits are treated identically.
age_median = x_train['age'].median()
embarked_mode = x_train['embarked'].mode()[0]

x_train['age'] = x_train['age'].fillna(age_median)
x_test['age'] = x_test['age'].fillna(age_median)

x_train['embarked'] = x_train['embarked'].fillna(embarked_mode)
x_test['embarked'] = x_test['embarked'].fillna(embarked_mode)

# Data preprocessing: one-hot encode the categorical columns as uint8
# indicator columns.
x_train = pd.get_dummies(x_train, dtype=np.uint8)
x_test = pd.get_dummies(x_test, dtype=np.uint8)

# Each split is encoded independently, so a category that is absent from one
# split would produce a different set (or order) of columns. Force the test
# frame into the training layout: missing indicator columns are added as 0
# and columns unknown to the training data are discarded.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)


# print(x_train.info())
# print(x_test.info())



# 모델링 및 성능평가

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values; the search scores every combination
# (3 x 3 x 3 = 27 candidates).
grid_param = dict(
    n_estimators=[30, 70, 100],
    max_depth=[1, 5, 8],
    min_samples_leaf=[1, 2, 5],
)

# Base estimator with a fixed seed so repeated runs give the same forests.
rf = RandomForestClassifier(random_state=2023)

# Exhaustive search over the grid, each candidate scored with 10-fold
# cross-validation on the training split.
gridsearch = GridSearchCV(estimator=rf, param_grid=grid_param, cv=10)
gridsearch.fit(x_train, y_train['target'])

# print("최적 : ",gridsearch.best_params_)
# print("점수 : ",gridsearch.best_score_)

# Use the model chosen by the grid search rather than re-typing its
# hyper-parameters by hand, which silently goes stale whenever the grid, the
# data or the preprocessing changes. The search above does not override
# `refit`, so GridSearchCV has already retrained the best parameter
# combination on the whole training split and exposes it as best_estimator_;
# no second fit is needed.
model = gridsearch.best_estimator_

# Predict the survival label (0 or 1) for every held-out passenger.
y_result = model.predict(x_test)

# from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
# acc = accuracy_score(y_test,y_result)
# f1 = f1_score(y_test,y_result)
# auc = roc_auc_score(y_test, y_result)
# print(acc)
# print(f1)
# print(auc)
# acc 0.7988826815642458
# f1 0.7272727272727273
# auc 0.7796442687747035
# y_prob = model.predict_proba(x_test)

# result = pd.DataFrame({'result':y_result,
#                        'prob0':y_prob[:,0],
#                        'prob1':y_prob[:,1]})
# print(result[:5])

# Write the predicted labels to disk as a single 'result' column, without the
# row index.
submission = pd.DataFrame({'result': y_result})
submission.to_csv('titanic_practice.csv', index=False)