In [221]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [222]:
%cd C:/Users/USER/Desktop/Project/Tiranic

C:\Users\USER\Desktop\Project\Tiranic


# 데이터 불러오기

In [223]:
train = pd.read_csv("./Data/train.csv", index_col = 'PassengerId')
test = pd.read_csv("./Data/test.csv", index_col = 'PassengerId')

# feature engineering

In [224]:
def feature_engineering(data) :
    # Name에서 Initial 추출
    data['Initial'] = 0
    data['Initial'] = data['Name'].str.extract('([A-Za-z]+)\.')
    
    # Initial 변수 축소
    data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
                            ['Miss','Miss','Miss','Mr','Mrs','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'], inplace = True)
    data['Initial'].replace(['Rev','Col','Dr','Dona','Ms'],
                            ['Other','Other','Mr','Mr',"Miss"],inplace=True)
    
    # Initial 변수의 평균으로 Age 결측치 처리
    data.loc[(data['Age'].isna())&(data['Initial']=='Mr'),'Age'] = 33
    data.loc[(data['Age'].isna())&(data['Initial']=='Mrs'),'Age'] = 36
    data.loc[(data['Age'].isna())&(data['Initial']=='Master'),'Age'] = 5
    data.loc[(data['Age'].isna())&(data['Initial']=='Miss'),'Age'] = 22
    data.loc[(data['Age'].isna())&(data['Initial']=='Other'),'Age'] = 46
    
    
    # Cabin 결측치와 관측된 값으로 나누기
    data.loc[~data['Cabin'].isna(), 'Cabin'] = 'T'
    data['Cabin'].fillna('N', inplace=True)
    
    # Embarked 결측치 실제 데이터 대체
    data.loc[data['Embarked'].isna(), 'Embarked'] = 'S'
    
    # Fare / 동일한 Ticket 개수로 나누기 
    ticket_grouped = data.groupby('Ticket')['Fare'].count().reset_index()
    ticket_grouped.rename(columns={'Fare': 'PassengerCount'}, inplace=True)
    
    data = data.merge(ticket_grouped, on='Ticket', how='left')
    data['FarePerPassenger'] = data['Fare'] / data['PassengerCount']

    # SibSp, Parch, PassengerCount를 합쳐 동승자 변수 생성하여 동승자 여부가 생존에 미치는 영향 판단
    data['Fellow_passenger'] = data['SibSp'] + data['Parch'] + data['PassengerCount']
    data['Alone'] = 1
    data.loc[data['Fellow_passenger'] > 1,'Alone'] = 0

    # 필요없는 변수 제거
    data.drop(['Name', 'Ticket', 'Fare','SibSp','Parch', 'PassengerCount','Fellow_passenger'], axis = 1, inplace = True)
    
    return data

In [225]:
train = feature_engineering(train)

In [226]:
test = feature_engineering(test)

In [227]:
# test 데이터의 FarePerPassenger 결측치 처리
test.loc[test['FarePerPassenger'].isna(),'FarePerPassenger'] = train[(train['Initial'] == 'Mr') & (train['Pclass'] == 3)]['FarePerPassenger'].mean()

# Data Preprocessing

In [228]:
x_train = train.drop('Survived', axis = 1)
y_train = train['Survived']

In [229]:
num_col = ['Age', 'FarePerPassenger']
str_col = ['Sex', 'Cabin', 'Embarked', 'Initial']
cat_col = ['Pclass', 'FamilySize']

In [230]:
from sklearn.preprocessing import LabelEncoder

In [231]:
def preprocessing(data) :
    for col in str_col :
        le = LabelEncoder()
        le.fit(x_train[col])
        x_train[col] = le.transform(x_train[col])
        test[col] = le.transform(test[col])
        
    for num in num_col :
        max = x_train[num].max()
        min = x_train[num].min()
        x_train[num] = (x_train[num] - min) / (max - min)
        test[num] = (test[num] - min) / (max - min)
    
    return data

In [232]:
x_train = preprocessing(x_train)
test = preprocessing(test)

In [233]:
print(x_train.isna().sum())
print(x_train.shape)
print("----------------------")
print(test.isna().sum())
print(test.shape)

Pclass              0
Sex                 0
Age                 0
Cabin               0
Embarked            0
Initial             0
FarePerPassenger    0
Alone               0
dtype: int64
(891, 8)
----------------------
Pclass              0
Sex                 0
Age                 0
Cabin               0
Embarked            0
Initial             0
FarePerPassenger    0
Alone               0
dtype: int64
(418, 8)


# 모델링 

In [234]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

In [235]:
kfold = StratifiedKFold(n_splits=10)

random_state = 2023
classifiers = []
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(LogisticRegression(random_state = random_state))

cv_results = []
for classifier in classifiers :
    cv_results.append(cross_val_score(classifier, x_train, y = y_train, scoring = "accuracy", cv = kfold, n_jobs=-1))

cv_means = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())

cv_res = pd.DataFrame({"Algorithm":["SVC","DecisionTree","RandomForest","GradientBoosting","LogisticRegression"],
                       "CrossValMeans":cv_means})

cv_res

Unnamed: 0,Algorithm,CrossValMeans
0,SVC,0.81593
1,DecisionTree,0.78563
2,RandomForest,0.799114
3,GradientBoosting,0.835019
4,LogisticRegression,0.791248


In [236]:
GB = GradientBoostingClassifier(random_state=random_state)

cross_val_score(GB, x_train, y = y_train, scoring = "accuracy", cv = kfold, n_jobs=-1).mean()

0.8350187265917602

In [238]:
# 최적의 하이퍼파라미터 찾기

parameters = {
    "learning_rate": [0.1, 0.15, 0.2, 0.5],
    'min_samples_leaf' : range(1,32),
    "max_depth":[1,2,4,8],
    "subsample":[0.5, 0.65,0.8],
    "n_estimators":[10, 50, 100]
    }

grid_search = GridSearchCV(GradientBoostingClassifier(random_state = random_state), parameters, cv = 10, n_jobs=-1)
grid_search.fit(x_train, y_train)
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Hyperparameters:  {'learning_rate': 0.15, 'max_depth': 4, 'min_samples_leaf': 20, 'n_estimators': 100, 'subsample': 0.5}
Best Score:  0.8417727840199751


In [239]:
GB = GradientBoostingClassifier(random_state=42, **grid_search.best_params_)
GB.fit(x_train, y_train)

In [240]:
GB.score(x_train, y_train)

0.8888888888888888

In [241]:
Survived_pred = GB.predict(test)

In [242]:
Survived_pred_proba = GB.predict_proba(test)

# 모델 평가

In [243]:
Sub = pd.read_csv("./Data/gender_submission.csv", index_col = 'PassengerId')

In [244]:
Sub['Survived'] = Survived_pred

In [245]:
Sub['Survived'].to_csv('./submisson.csv')