In [174]:
"""
작성자 : 이민우
작성일 : 2018 11 12
프로그램 : 타이타닉 생존 예측프로그램
"""
import os
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
TITANIC_PATH = os.path.join("datasets","titanic")


import warnings
warnings.filterwarnings('ignore')

def load_titanic_data(filename, titanic_path = TITANIC_PATH): # file load
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

        
class DataFrameSelector(BaseEstimator, TransformerMixin): # np 배열로 반환하기 위해 
        def __init__(self, attribute_names):
            self.attribute_names = attribute_names
        def fit(self, X, y = None):
            return self
        def transform(self, X):
            return X[self.attribute_names]
        
class MostFrequentImputer(BaseEstimator, TransformerMixin):# 범주형 특성의 대한 변환기
    def fit(self, X, y = None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],index=X.columns) # 가장 빈번하게 발생한 걸로 대치
        return self
    def transform(self, X, y= None):
        return X.fillna(self.most_frequent_)

class Encoder(BaseEstimator, TransformerMixin): # 임의로 만든 Encoder 
    def fit(self,X, y = None):    
        return self
    def transform(self, X, y= None): # 이름의 특성에 좋은 내용이 담긴 걸 보고 분리해서 따로 특성을 추가해줌 
        temp = list()
        for st in X['Name']:
            temp.append(st.split(' ')[1])
        te = list()
        
        for a in temp:
            if a == 'Mr.':
                te.append(0) # Mr 일경우 0
            elif a =='Miss.':
                te.append(1) # Miss 일경우 1
            elif a == 'Mrs.': 
                te.append(2) # Mrs 일 경우 2
            else:
                te.append(3) # 그 외의 경우는 3
        self.te = te
     
        X = X.drop('Name', axis = 1) # 변환 해 준 후 쓸모 없는 Name 특성은 제거 
        return np.c_[X,self.te]
    
    
if __name__ == "__main__":
    train_data = load_titanic_data("train.csv")
    test_data  = load_titanic_data("test.csv")
    y_test = load_titanic_data("gender_submission.csv")
    
    print(train_data.head())
    print(train_data.info())
    print(train_data.describe())
    print(train_data["Survived"].value_counts())
    
    corr = train_data.corr() # 상관계수 
    print(corr['Survived'].sort_values(ascending = False))
    
    print(train_data["Name"]) # 이름 내용 
    
    # 변환 파이프라인
    num_pipeline = Pipeline([
        ('select_numeric',DataFrameSelector(['Age','SibSp','Parch','Fare'])),
        ('imputer',SimpleImputer(strategy = 'median')),
        ('std_scaler',StandardScaler())
    ])
    cat_pipeline = Pipeline([
        ('select_cat',DataFrameSelector(['Pclass','Sex',"Embarked","Name"])),
        ('imputer',MostFrequentImputer()),
        ('Encoder',Encoder()),
        ('cat_encoder',OneHotEncoder(sparse = False)),
    ])
    
    preprocess_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline',num_pipeline),
        ('cat_pipeline',cat_pipeline),        
    ])
    

    
    y_train = train_data['Survived'].copy() # 정답 레이블 분리
    train_data = train_data.drop('Survived',axis = 1)
    X_train = preprocess_pipeline.fit_transform(train_data) # 데이터 전처리
  

    from sklearn.linear_model import  SGDClassifier # 확률적 경사 하강법
    
    sgd_clf = SGDClassifier(max_iter=5,random_state=42) 
    sgd_clf.fit(X_train, y_train)
    print("sgd_clf mean : ",cross_val_score(sgd_clf,X_train,y_train,cv=10).mean()) 
    
    
    svm_clf = SVC(gamma = 'auto') # 서포터백터머신
    svm_clf.fit(X_train, y_train)
    
    
    svm_scores = cross_val_score(svm_clf, X_train, y_train, cv= 10)
    print("SVM score mean : ",svm_scores.mean()) # cross_val_score 의 평균값 
    
    
    forest_clf = RandomForestClassifier(n_estimators = 10, random_state = 42) # 랜덤분류기
    forest_clf.fit(X_train, y_train)
    
    forest_scores = cross_val_score(forest_clf, X_train, y_train, cv = 10)
    print("forest mean: ",forest_scores.mean()) # 랜덤분류기의 평균값
    
    
    
    # 그리드 탐색
    # 최적의 하이퍼파라미터를 찾기 위하여 실행
    # 구글을 참고하였습니다.
    
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-5],
                     'C': [0.001,0.1, 10, 50,1000]},
                    {'kernel': ['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-5],
                     'C': [0.001,0.1, 10, 50,1000]},
                    {'kernel': ['linear'], 'C': [0.001,0.1, 10, 50,1000]}
                   ]
    scores = ['precision', 'recall']


    for score in scores: # precision 과 recall 의 경우 모두 확인
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                           scoring='%s_macro' % score) #하나씩 그리드탐색 생성
        clf.fit(X_train, y_train) # 학습

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score'] # 그 때의 평균 점수 
        stds = clf.cv_results_['std_test_score'] # 표준편차 
        for mean, std, params in zip(means, stds, clf.cv_results_['params']): # 평균 , 표준편차 , 파라미터 출력 
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()


    
    print(clf.best_params_) # 최적의 파라미터 
    
    final_model = SVC(C = 50, gamma = 0.01, kernel = 'rbf') # 최적의 파라미터로 만든 SVC
    final_model.fit(X_train, y_train) # 마지막 모델 학습 
    
    
    X_test = preprocess_pipeline.transform(test_data)  # 테스트 셋 전처리 
    y_pred = final_model.predict(X_test) # 테스트 셋 예측 값 
    
    
    y_test = y_test["Survived"].values # 정답 레이블 
    
    final_model_scores = cross_val_score(final_model, X_test, y_test, cv= 10) # 최종 모델 교차검증 
    print("final_model score mean : ",final_model_scores.mean()) # 교차검증 평균 점수 
    
    print("final_model_scores : : ",(y_test == y_pred).sum()/len(y_test))
    
    #y_pred = forest_clf.predict(X_test)
   # print("forest_scores ",((y_test == y_pred).sum()/len(y_test)))
   

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

Best parameters set found on development set:

{'C': 50, 'gamma': 0.01, 'kernel': 'rbf'}

Grid scores on development set:

0.500 (+/-0.000) for {'C': 0.001, 'gamma': 0.01, 'kernel': 'rbf'}
0.500 (+/-0.000) for {'C': 0.001, 'gamma': 0.001, 'kernel': 'rbf'}
0.500 (+/-0.000) for {'C': 0.001, 'gamma': 1e-05, 'kernel': 'rbf'}
0.601 (+/-0.070) for {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
0.500 (+/-0.000) for {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}
0.500 (+/-0.000) for {'C': 0.1, 'gamma': 1e-05, 'kernel': 'rbf'}
0.809 (+/-0.052) for {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.768 (+/-0.050) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.500 (+/-0.000) for {'C': 10, 'gamma': 1e-05, 'kernel': 'rbf'}
0.812 (+/-0.066) for {'C': 50, 'gamma': 0.01, 'kernel': 'rbf'}
0.771 (+/-0.050) for {'C': 50, 'gamma': 0.001, 'kernel': 'rbf'}
0.506 (+/-0.006) for {'C': 50, 'gamma': 1e-05, 'kernel': 'rbf'}
0.791 (+/-0.071) for {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
0.808 (+/-0.050) for {'C': 1000, 'ga

In [19]:
train_copy = train_data.copy()
train_copy["people"] = train_copy['SibSp']+ train_copy['Parch']
corr = train_copy.corr()
print(corr['Survived'].sort_values(ascending = False))

Survived       1.000000
Fare           0.257307
Parch          0.081629
people         0.016639
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64


In [78]:
#print(train_copy['Name'])
train_copy = train_data.copy()
temp = list()
for st in train_copy['Name']:
    temp.append(st.split(' ')[1])
te = list()
train_copy['qq'] = temp
for a in train_copy['qq']:
    if a == 'Mr.':
        te.append(0)
    elif a =='Miss.':
        te.append(1)
    elif a == 'Mrs.':
        te.append(2)
    else:
        te.append(3)

train_copy['qqq'] = te
train_copy['Cabin'] = train_copy['Cabin'].fillna('N')
train_copy['Cabin'] = train_copy['Cabin'].apply(lambda x : x[0])

from sklearn.preprocessing import LabelBinarizer

en = LabelBinarizer()

cc = en.fit_transform(train_copy['Cabin'])
#print(cc)   
train_copy['Cabin'] = cc
#print(train_copy['qq'])
corr = train_copy.corr()
me = train_copy['Age'].median()
train_copy['Age'] = train_copy['Age'].fillna(me)
print(corr['Survived'].sort_values(ascending = False))



28.0
Survived       1.000000
qqq            0.396799
Fare           0.257307
Parch          0.081629
Cabin          0.022287
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64


In [126]:
"""
작성자 : 이민우
작성일 : 2018 11 12
프로그램 : 타이타닉 생존 예측프로그램
"""
import os
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
TITANIC_PATH = os.path.join("datasets","titanic")


import warnings
warnings.filterwarnings('ignore')

def load_titanic_data(filename, titanic_path = TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

        
class DataFrameSelector(BaseEstimator, TransformerMixin):
        def __init__(self, attribute_names):
            self.attribute_names = attribute_names
        def fit(self, X, y = None):
            return self
        def transform(self, X):
            return X[self.attribute_names]
        
class MostFrequentImputer(BaseEstimator, TransformerMixin):# 범주형 특성의 대한 변환기
    def fit(self, X, y = None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],index=X.columns)
        return self
    def transform(self, X, y= None):
        return X.fillna(self.most_frequent_)

class Encoder(BaseEstimator, TransformerMixin):
    def fit(self,X, y = None):
        temp = list()
        for st in X['Name']:
            temp.append(st.split(' ')[1])
        te = list()
        
        for a in temp:
            if a == 'Mr.':
                te.append(0)
            elif a =='Miss.':
                te.append(1)
            elif a == 'Mrs.':
                te.append(2)
            else:
                te.append(3)
        self.te = te
        return self
    def transform(self, X, y= None):
     
        X = X.drop('Name', axis = 1)
        return np.c_[X,self.te]
    
    
if __name__ == "__main__":
    train_data = load_titanic_data("train.csv")
    test_data  = load_titanic_data("test.csv")
    y_test = load_titanic_data("gender_submission.csv")
    
   #print(train_data.head())
    #print(train_data.info())
    #print(train_data.describe())
    #print(train_data["Survived"].value_counts())
    
    #train_data['Cabin'] = train_data['Cabin'].apply(lambda x : x[0])
   # print(train_data['Cabin'])
    #print(train_data['Cabin'].value_counts())
    
    
    
   #print(train_copy['Name'])
  

 
    num_pipeline = Pipeline([
        ('select_numeric',DataFrameSelector(['Age','SibSp','Parch','Fare'])),
        ('imputer',SimpleImputer(strategy = 'median')),
        ('std_scaler',StandardScaler())
    ])
    cat_pipeline = Pipeline([
        ('select_cat',DataFrameSelector(['Pclass','Sex',"Embarked","Name"])),
        ('imputer',MostFrequentImputer()),
        ('Encoder',Encoder()),
        ('cat_encoder',OneHotEncoder(sparse = False)),
    ])
    
    preprocess_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline',num_pipeline),
        ('cat_pipeline',cat_pipeline),        
    ])
    
    
    
    

    X_train = preprocess_pipeline.fit_transform(train_data)
   
    y_train = train_data['Survived']
  
    from sklearn.linear_model import  SGDClassifier
    sgd_clf = SGDClassifier(max_iter=5,random_state=42) 
    sgd_clf.fit(X_train, y_train)
    print("확률적 경사 하강법 : ",cross_val_score(sgd_clf,X_train,y_train,cv=10).mean())
    
    svm_clf = SVC(gamma = 'auto')
    svm_clf.fit(X_train, y_train)
    
    
    svm_scores = cross_val_score(svm_clf, X_train, y_train, cv= 10)
    print("SVM score mean : ",svm_scores.mean())
    

    """
    Invalid parameter max_features for estimator SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

    """
    
    
    
    param_grid = {"gamma": np.logspace(-6, -1, 10)}
    grid_search = GridSearchCV(estimator=svm_clf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=2)
    
    grid_search.fit(X_train, y_train)
    
    final_model = grid_search.best_estimator_
    
    
    forest_clf = RandomForestClassifier(n_estimators = 10, random_state = 42)
    
    
    param_grid = {
        'max_depth' : [6, 8, 10, 15],
        'n_estimators': [1200],
        'max_features': ['sqrt'],
        'min_samples_split': [2, 7, 15, 30],
        'min_samples_leaf': [1, 15, 30, 60],
        'bootstrap': [True],
    }
    grid_search = GridSearchCV(forest_clf, scoring='accuracy', param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    
    final_model = grid_search.best_estimator_
    
    
    forest_clf.fit(X_train, y_train)
    forest_scores = cross_val_score(forest_clf, X_train, y_train, cv = 10)
    
    print("forest mean: ",forest_scores.mean())
    
    y_pred = forest_clf.predict(X_test)
    y_test = y_test["Survived"].values
    
    print("forest ",(y_test == y_pred).sum()/len(y_test))
    
    X_test = preprocess_pipeline.transform(test_data)
    y_pred = final_model.predict(X_test)
    
    
    print("Final Model: ",(y_test == y_pred).sum()/len(y_test))
    
    

0.7463480308705026
0.8260637271592328
0.9545454545454546
0.8115690614005221
0.8157894736842105
