In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
# imports 
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

#확률적 경사하강법을 이용한 선형 분류모델
from sklearn.linear_model import SGDClassifier 
#svm 서포트벡터머신 (svc:분류 / svr: 회귀)
from sklearn.svm import SVC

First we are going to charge the data and take a peek to guess what is the situation


In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv('test.csv')

In [5]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
#Some info about the data
train.info()

print('----------------------------------------')
print('"NA 비율')
print('----------------------------------------')
p = (train.isna().sum()/len(train)*100).sort_values(ascending=False)
print(p)
print('----------------------------------------')
print('duplications 및 other useful info에 대한 고유값')
print('----------------------------------------')
u = train.nunique().sort_values()
print(u)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
----------------------------------------
"NA 비율
----------------------------------------
Cabin          77.104377
Age            19.865320
Embarked        0.224467
PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name 



#### Missing values:
* Case 1: **'Cabin'** 77% of missing values. 데이터가 3/4이 누락되어 있는 데이터를 모방할 수도 있기 때문에 이 데이터는 삭제한다.

* Case 2: **'Age'** with 20% of missing values. 

* Case 3: **'Embarked'** with 0.2% of missing values. 


#### Categorical values
인코딩하거나 삭제해야하는 범주형 변수

* Case 1: **'Sex'** 값이 2개뿐이면 레이블 인코더를 할 수 있다.

* Case 2: **'Name'** 유용한 정보를 제공하지 않으므로 drop한다.

* Case 3: **'Ticket'** 유용한 정보를 제공하지 않으므로 drop한다.

* Case 4: **'Cabin'** drop by missing 70% of values, 결측치가 너무 많이 나옴.

* Case 5: **'Embarked'** has 3 possible values. 

In [7]:
# Embarked 메뉴얼 repacement 수치 확인
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [8]:
def cleanData(data):

    #Missing valuse case2
    data['Age'] = data.groupby(['Pclass','Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    
    
    data['Fare'] = data.groupby(['Pclass','Sex'])['Fare'].transform(lambda x: x.fillna(x.median()))

    # Data missing Case3
    data.dropna(axis=0, subset=['Embarked'], inplace=True)
    
    #라벨 인코딩
    le = preprocessing.LabelEncoder()
    
    #성별
    data['Sex'].replace({'male':0, 'female':1}, inplace=True)
    
    #Embarked
    data['Embarked'].replace({'S':0, 'C':1, 'Q':2}, inplace=True)
    
    return data

In [9]:
clean_train = cleanData(train)
clean_test = cleanData(test)

#### Check cleaning

데이터 전처리 확인

In [10]:
clean_train.info()
clean_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    int64  
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 90.3+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

#### Modeling



In [11]:
# Set X and y
y = train['Survived']
X = pd.get_dummies(train.drop('Survived', axis=1))

# Split model train test data
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, random_state=42)


In [12]:
def fitAndPredict(model):
    #적합성 및 정확도 프로세스 자동화
    
    model.fit(X_train, y_train)
    prediction = model.predict(X_val)
    return accuracy_score(y_val, prediction)

In [13]:
#여러모델 비교
model1 = LogisticRegression(solver='liblinear', random_state=42)
model2 = GradientBoostingClassifier()
model3 = RandomForestClassifier()
model4 = SGDClassifier()
model5 = SVC()

models = [model1, model2, model3, model4, model5]
i = 0
for model in models:
    i +=1
    print("Model ", i,":", model)
    print("ACC: ", fitAndPredict(model))

Model  1 : LogisticRegression(random_state=42, solver='liblinear')
ACC:  0.8089887640449438
Model  2 : GradientBoostingClassifier()
ACC:  0.8089887640449438
Model  3 : RandomForestClassifier()
ACC:  0.8089887640449438
Model  4 : SGDClassifier()
ACC:  0.398876404494382
Model  5 : SVC()
ACC:  0.6348314606741573


In [14]:
#가장 성능이 좋았던 GradientBoost 모델 매개변수 조절
model = GradientBoostingClassifier(min_samples_split=20, min_samples_leaf=60, max_depth=3, max_features=7)
fitAndPredict(model)

0.6123595505617978