In [2]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Collect every file under the Kaggle input directory, then print one path per line.
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

In [3]:
# imports 
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

#확률적 경사하강법을 이용한 선형 분류모델
from sklearn.linear_model import SGDClassifier 
#svm 서포트벡터머신 (svc:분류 / svr: 회귀)
from sklearn.svm import SVC

First we load the data and take a quick look at it to understand what we are working with.


In [4]:
# Locations of the Titanic competition CSV files on Kaggle.
train_path = "/kaggle/input/titanic/train.csv"
test_path = "/kaggle/input/titanic/test.csv"

# Read both splits with the same reader call.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [5]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
train.head()

In [16]:
# Column dtypes and non-null counts
train.info()

# Percentage of missing values per column, largest first
print('----------------------------------------',
      '"NA 비율',
      '----------------------------------------', sep='\n')
p = train.isna().sum().div(len(train)).mul(100).sort_values(ascending=False)
print(p)

# Number of distinct values per column, smallest first
# (helps spot duplicates and low-cardinality categorical columns)
print('----------------------------------------',
      'duplications 및 other useful info에 대한 고유값',
      '----------------------------------------', sep='\n')
u = train.nunique().sort_values()
print(u)



#### Missing values:
* Case 1: **'Cabin'** has 77% missing values. 값의 3/4 이상이 누락되어 있어 결측치를 채우면 실제 데이터가 아닌 추정값이 대부분을 차지하게 되므로 이 컬럼은 삭제한다.

* Case 2: **'Age'** has 20% missing values. (Pclass, Sex) 그룹별 중앙값으로 결측치를 채운다.

* Case 3: **'Embarked'** has 0.2% missing values. 결측치가 있는 행은 삭제한다.


#### Categorical values
인코딩하거나 삭제해야하는 범주형 변수

* Case 1: **'Sex'** 값이 2개뿐이면 레이블 인코더를 할 수 있다.

* Case 2: **'Name'** 유용한 정보를 제공하지 않으므로 drop한다.

* Case 3: **'Ticket'** 유용한 정보를 제공하지 않으므로 drop한다.

* Case 4: **'Cabin'** is dropped because 77% of its values are missing (결측치가 너무 많음).

* Case 5: **'Embarked'** has 3 possible values. 

In [17]:
# Check the distinct Embarked values and their counts before replacing them manually
train.Embarked.value_counts()

In [21]:
def cleanData(data, drop_columns=('Name', 'Ticket', 'Cabin'), copy=False):
    """Impute, filter and encode a Titanic passenger frame for modelling.

    Steps, in order:
      1. Fill missing ``Age`` and ``Fare`` with the median of the passenger's
         (``Pclass``, ``Sex``) group.
      2. Drop rows whose ``Embarked`` value is missing.
      3. Encode ``Sex`` (male=0, female=1) and ``Embarked`` (S=0, C=1, Q=2)
         as integers.
      4. Drop the columns listed in ``drop_columns`` (those that are present).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least ``Pclass``, ``Sex``, ``Age``, ``Fare`` and
        ``Embarked`` columns.
    drop_columns : iterable of str, optional
        Columns removed after encoding. Defaults to the free-text / mostly
        empty columns ``Name``, ``Ticket`` and ``Cabin``; one-hot encoding them
        would yield different feature columns for the train and test frames.
        Pass ``()`` to keep every column.
    copy : bool, optional
        If False (default) ``data`` is modified in place, so the caller's frame
        is cleaned as well. If True, a copy is cleaned and the input is left
        untouched.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame: ``data`` itself when ``copy`` is False, otherwise
        the cleaned copy.
    """
    if copy:
        data = data.copy()

    # Missing values: Age and Fare take the median of their (Pclass, Sex) group.
    group_keys = ['Pclass', 'Sex']
    for column in ('Age', 'Fare'):
        data[column] = data.groupby(group_keys)[column].transform(
            lambda values: values.fillna(values.median())
        )

    # Missing values: rows without a port of embarkation are removed.
    data.dropna(axis=0, subset=['Embarked'], inplace=True)

    # Label encoding. The result is assigned back to the column instead of
    # calling ``replace(..., inplace=True)`` on a column selection, which does
    # not update the parent frame under pandas Copy-on-Write. Values that are
    # not in the mapping (e.g. already-encoded integers) pass through
    # unchanged, so running the function twice on the same frame is harmless.
    encodings = {
        'Sex': {'male': 0, 'female': 1},
        'Embarked': {'S': 0, 'C': 1, 'Q': 2},
    }
    for column, codes in encodings.items():
        data[column] = data[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )

    # Remove the columns that are not used as features; absent ones are skipped.
    data.drop(columns=list(drop_columns), errors='ignore', inplace=True)

    return data

In [22]:
# Clean both splits with the same routine (train first, then test).
# cleanData modifies the frame it is given, so `train` and `test` are cleaned too.
clean_train, clean_test = (cleanData(frame) for frame in (train, test))

#### Check cleaning

데이터 전처리 확인

In [10]:
# Confirm that no missing values remain and that the dtypes are as expected.
for cleaned_frame in (clean_train, clean_test):
    cleaned_frame.info()

#### Modeling



In [11]:
# Set X and y from the cleaned frame. The raw `train` name used to work here
# only because cleanData mutates its argument in place; referencing
# `clean_train` makes the dependency on the cleaning step explicit.
y = clean_train['Survived']
X = pd.get_dummies(clean_train.drop('Survived', axis=1))

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
def fitAndPredict(model):
    """Fit ``model`` on the training split and return its validation accuracy.

    Uses the notebook-level ``X_train``/``y_train`` for fitting and
    ``X_val``/``y_val`` for scoring. The model is fitted in place.
    """
    model.fit(X_train, y_train)
    return accuracy_score(y_val, model.predict(X_val))

In [23]:
# Compare several models.
# Every estimator with a stochastic fit gets a fixed seed so the accuracy
# ranking is reproducible between runs. SVC is left unseeded: per the
# scikit-learn docs its random_state only matters when probability=True.
model1 = LogisticRegression(solver='liblinear', random_state=42)
model2 = GradientBoostingClassifier(random_state=42)
model3 = RandomForestClassifier(random_state=42)
model4 = SGDClassifier(random_state=42)
model5 = SVC()

models = [model1, model2, model3, model4, model5]
for i, model in enumerate(models, start=1):
    print("Model ", i,":", model)
    print("ACC: ", fitAndPredict(model))

In [14]:
# Tune the hyper-parameters of GradientBoosting, the best-performing model above.
# The seed makes the score comparable between runs.
model = GradientBoostingClassifier(
    min_samples_split=20,
    min_samples_leaf=60,
    max_depth=3,
    max_features=7,
    random_state=42,
)
fitAndPredict(model)

In [24]:
# Save the submission file.
# One-hot encode the test frame and align it to the training feature columns:
# columns the model never saw are dropped and columns missing from the test
# frame are added as all-zero, so predict() receives the layout it was fitted on.
test_features = pd.get_dummies(clean_test).reindex(columns=X.columns, fill_value=0)

# NOTE(review): this uses model2 (the GradientBoosting model fitted in the
# comparison loop), not the tuned `model` from the cell above; confirm that is intended.
predict = model2.predict(test_features)

output = pd.DataFrame({'PassengerId': clean_test.PassengerId, 'Survived': predict})
output.to_csv('my_submission.csv', index=False)
print("Submission saved")