In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder


In [3]:
# Load the training CSV. A forward slash is used as the separator because it
# is accepted on Windows, macOS and Linux alike, whereas a backslash only
# works as a directory separator on Windows.
titanic_df = pd.read_csv("File/titanic_train.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
def fillna_features(df):
    """Fill the missing values of the "Age", "Cabin" and "Embarked" columns.

    Missing "Age" values are replaced by the mean of the column; missing
    "Cabin" and "Embarked" values are replaced by the placeholder "N".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the "Age", "Cabin" and "Embarked" columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Assign the filled column back to the frame instead of calling
    # fillna(inplace=True) on df[col]: the latter is chained assignment,
    # which pandas warns about and which stops updating `df` once
    # Copy-on-Write is enabled.
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Cabin"] = df["Cabin"].fillna("N")
    df["Embarked"] = df["Embarked"].fillna("N")

    return df

def drop_features(df):
    """Remove the "PassengerId", "Name" and "Ticket" columns from ``df``.

    The frame is modified in place and the same object is returned.
    A ``KeyError`` is raised by pandas if any of the columns is absent.
    """
    unused_columns = ["PassengerId", "Name", "Ticket"]
    df.drop(columns=unused_columns, inplace=True)
    return df

def encoding_featrue(df):
    """Label-encode the "Cabin", "Embarked" and "Sex" columns of ``df``.

    "Cabin" is first truncated to its first character. Each of the three
    columns is then replaced by the integer codes produced by a fresh
    ``LabelEncoder`` fitted on that column. The frame is modified in place
    and returned.
    """
    # Keep only the leading character of every cabin value before encoding.
    df["Cabin"] = df["Cabin"].str[:1]

    for column in ("Cabin", "Embarked", "Sex"):
        # A separate encoder per column, fitted and applied in one call.
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

def preprocessing_feature(df):
    """Apply the preprocessing steps to ``df`` in order and return the result.

    The steps are: fill missing values, drop unused columns, then
    label-encode the categorical columns.
    """
    for step in (fillna_features, drop_features, encoding_featrue):
        df = step(df)
    return df

In [7]:
# Reload the raw data so this cell always starts from an unprocessed frame.
# A forward slash is used as the separator because it is accepted on Windows,
# macOS and Linux alike, whereas a backslash only works on Windows.
titanic_df = pd.read_csv("File/titanic_train.csv")

# Separate the target column from the features, then preprocess the features.
y_titanic_df = titanic_df["Survived"]
X_titanic_df = titanic_df.drop(columns="Survived")
X_titanic_df = preprocessing_feature(X_titanic_df)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
    random_state=11,
)

# Report the shape of each of the four pieces, one per line.
print(
    f"X_train shape : {X_train.shape}",
    f"X_test shape : {X_test.shape}",
    f"y_train shape : {y_train.shape}",
    f"y_test shape : {y_test.shape}",
    sep="\n",
)

X_train shape : (712, 8)
X_test shape : (179, 8)
y_train shape : (712,)
y_test shape : (179,)


In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Three baseline classifiers, each fitted on the training split and scored
# on the held-out test split.
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(solver="liblinear")

# The accuracies are printed with "{0:.4f}" (four decimal places). The
# previous spec "{0: 4f}" meant "space sign, minimum width 4, default
# precision", which printed a leading space and six decimals.

# DecisionTreeClassifier training / prediction
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print("DecisionTreeClassifier accuracy : {0:.4f}".format(accuracy_score(y_test, dt_pred)))

# RandomForestClassifier training / prediction
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print("RandomForestClassifier accuracy : {0:.4f}".format(accuracy_score(y_test, rf_pred)))

# LogisticRegression training / prediction
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print("LogisticRegression accuracy : {0:.4f}".format(accuracy_score(y_test, lr_pred)))


DecisionTreeClassfier accuracy :  0.787709
RandomForestClassifier accuracy :  0.854749
LogisticRegression accuracy :  0.865922


In [20]:
from sklearn.model_selection import KFold

def exec_kfold(clf, folds=5):
    """Run K-fold cross validation of ``clf`` and print the accuracies.

    The data come from the notebook-level ``X_titanic_df`` and
    ``y_titanic_df``. For every fold ``clf`` is refitted on the training
    rows and scored on the held-out rows; each fold accuracy and finally the
    mean accuracy are printed. Nothing is returned.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit`` and ``predict``; it is refitted in place.
    folds : int, default 5
        Number of splits passed to ``KFold``.
    """
    splitter = KFold(n_splits=folds)
    # Convert to arrays once; the fold indices select rows from these.
    feature_values = X_titanic_df.values
    target_values = y_titanic_df.values
    scores = []

    for iter_count, (fit_rows, eval_rows) in enumerate(splitter.split(X_titanic_df)):
        clf.fit(feature_values[fit_rows], target_values[fit_rows])
        fold_predictions = clf.predict(feature_values[eval_rows])
        accuracy = accuracy_score(target_values[eval_rows], fold_predictions)
        scores.append(accuracy)
        print(f"#{iter_count}# accuracy of cross validation : {accuracy}")

    mean_score = np.mean(scores)
    print(f"mean score {mean_score}")


exec_kfold(dt_clf, folds=5)
    
        

#0# accuracy of cross validation : 0.7541899441340782
#1# accuracy of cross validation : 0.7808988764044944
#2# accuracy of cross validation : 0.7865168539325843
#3# accuracy of cross validation : 0.7696629213483146
#4# accuracy of cross validation : 0.8202247191011236
mean score 0.782298662984119


In [24]:
from sklearn.model_selection import cross_val_score

# Five-fold cross validation of the decision tree on the full data set.
# NOTE(review): with an integer cv and a classifier, scikit-learn documents
# that stratified folds are used, which would explain why these scores differ
# from a plain KFold run - confirm against the installed version.
scores = cross_val_score(
    estimator=dt_clf,
    X=X_titanic_df,
    y=y_titanic_df,
    cv=5,
)

# One line per fold, followed by the mean over all folds.
for iter_count, accuracy in enumerate(scores):
    print(f"#{iter_count}# accuracy of cross validation : {accuracy}")

print(f"mean score {np.mean(scores)}")
    

#0# accuracy of cross validation : 0.7430167597765364
#1# accuracy of cross validation : 0.7752808988764045
#2# accuracy of cross validation : 0.7921348314606742
#3# accuracy of cross validation : 0.7865168539325843
#4# accuracy of cross validation : 0.8426966292134831
mean score 0.7879291946519366


In [30]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the decision tree: 4 * 3 * 3 = 36 combinations.
parameters = {
    "max_depth": [2, 3, 5, 10],
    "min_samples_split": [2, 3, 5],
    "min_samples_leaf": [1, 5, 8],
}

# Search the grid with 5-fold cross validation on the training split only,
# ranking the candidates by accuracy.
grid_dclf = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
)
grid_dclf.fit(X_train, y_train)
best_dclf = grid_dclf.best_estimator_

print(f"GridSearchCV Optimal hyperparameters : {grid_dclf.best_params_}")
print(f"GridSearchCV Optimal accuracy : {grid_dclf.best_score_}")

# Score the best estimator found by the search on the held-out test split.
dpredictions = best_dclf.predict(X_test)
accuracy = accuracy_score(y_test, dpredictions)
print(f"test dataset DecisionTreeClassifier accuracy : {accuracy}")



GridSearchCV Optimal hyperparameters : {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}
GridSearchCV Optimal accuracy : 0.7991825076332119
test dataset DecisionTreeClassifier accuracy : 0.8715083798882681
