In [477]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from IPython.display import display, HTML
display(HTML("<style>.container { width:100%!important;}</style>"))

sns.set(style='white',font_scale = 1.5)

import warnings
warnings.filterwarnings('ignore')

In [503]:
'''載入資料並'''
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df = df_train.append(df_test)

# Feature Engineering

In [504]:
'''調整名稱欄位'''
df['family_num'] = df['SibSp']+df['Parch']
df['addresser']= df['Name'].str.extract('([A-Za-z]+)\.',expand=True)

'''處理遺失值及重複值'''
print('----------各欄位遺失值------------')
print(df.isnull().sum())

# def age_pick(): #處理年紀遺失值
#     for i in range(df.shape[0]):
#         if df['Survived'][i]==1:
#             return np.random.randint(1,35)
#         elif df['Survived'][i]==0:
#             return np.random.randint(20,75)
# df['Age'] = df['Age'].fillna(age_pick())
# df.dropna(axis=0,inplace=True)

'''計算年紀遺失值'''
df['Title'] = df['addresser']
mapping = {'Dr':'Mr','Rev':'Mr','Mlle':'Miss','Major':'Mr','Col':'Mr','Sir':'Mr','Don':'Mr','Mme':'Miss','Jonkheer':'Mr','Lady':'Mrs','Capt':'Mr','Countess':'Mrs','Ms':'Miss','Dona':'Mrs'}
df.replace({'Title':mapping},inplace=True)
#create an array of titles based on the unique values from the list
#Try two options, mean and median
age_to_impute = df.groupby('Title')['Age'].median()
#age_to_impute = data_df.groupby('Title')['Age'].mean()
for title in df.Title.unique():
    #age_to_impute = data_df.groupby('Title')['Age'].median() #calculates the mean age for each title
#     print(titles.index(title),age_to_impute[titles.index(title)])
    df.loc[(df['Age'].isnull()) & (df['Title'] == title), 'Age'] = age_to_impute[list(df.Title.unique()).index(title)] #assigns the mean age to the missing 

# fill Fare based on Class& Title median
df['Fare'].fillna(df[df.Pclass == 3]['Fare'].median(),inplace=True)

print('----------確認各欄位遺失值------------')
print(df.isnull().sum())

'''------------區分Age欄位---------------'''
df['Age_range'] = pd.qcut(df['Age'],5)
label = LabelEncoder()
df['Age_range_Code'] = label.fit_transform(df['Age_range'])

df['Fare_range'] = pd.qcut(df['Fare'],6)
label = LabelEncoder()
df['Fare_range_Code'] = label.fit_transform(df['Fare_range'])

df.drop(['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked','Title', 'Age','Age_range','Fare','Fare_range'], axis = 1, inplace = True)

print('----------最後確認各欄位遺失值------------')
print(df.isnull().sum())
# df['addresser'][513] = 'Mrs'  #更正擷取錯誤

----------各欄位遺失值------------
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
family_num        0
addresser         0
dtype: int64
----------確認各欄位遺失值------------
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          2
family_num        0
addresser         0
Title             0
dtype: int64
----------最後確認各欄位遺失值------------
Survived           418
Pclass               0
Sex                  0
family_num           0
addresser            0
Age_range_Code       0
Fare_range_Code      0
dtype: int64


In [498]:
df['Fare'].max()

512.3292

In [505]:
df

Unnamed: 0,Survived,Pclass,Sex,family_num,addresser,Age_range_Code,Fare_range_Code
0,0.0,3,male,1,Mr,1,0
1,1.0,1,female,1,Mrs,3,5
2,1.0,3,female,0,Miss,2,1
3,1.0,1,female,1,Mrs,3,4
4,0.0,3,male,0,Mr,3,1
...,...,...,...,...,...,...,...
413,,3,male,0,Mr,0,1
414,,1,female,0,Dona,3,5
415,,3,male,0,Mr,3,0
416,,3,male,0,Mr,0,1


In [None]:
df

In [None]:
'''視覺化檢視'''
fig , ax  = plt.subplots(1,2, figsize=(20,8),dpi=100)
sns.countplot(data=df,
            x='family_num',
            hue='Survived',
             ax = ax[0])

df_title = df[~df.addresser.isin(['Mrs', 'Mr', 'Miss', 'Master'])]
ax1 = sns.countplot(data=df_title,
            x='addresser',
            hue='Survived',
              ax = ax[1])
# ax[1].xaxis.set_tick_params(rotation=45)
ax[1].tick_params(axis='x',labelrotation=90)

for i in range(2):
        ax[i].legend(loc='upper right')

fig.tight_layout(pad=0.5)

In [None]:
'''相關係數視覺化檢視'''
display(df.corr()['Survived'])

plt.figure(figsize = (10,8),dpi=100)
sns.heatmap(df.corr(),annot=True,vmin=-1,center=True)

In [None]:
'''特徵間相互關係視覺化呈現'''
# 刪除相對來說不重要的feature

sns.pairplot(df,
            hue='Survived',
            corner=True)

In [None]:
'''是否生存與年紀和票價的關係視覺化呈現'''
fig , ax  = plt.subplots(1,2, figsize=(20,8),dpi=100)
sns.histplot(data=df,
              x='Fare',
             hue='Survived',
             bins=50,
             kde = True,
           ax = ax[0])

sns.histplot(data=df,
              x='Age',
             hue='Survived',
             bins=50,
             kde = True,
           ax = ax[1])

# 訓練及預測

In [487]:
'''導入必要模組'''
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score,classification_report,mean_squared_error,confusion_matrix,plot_confusion_matrix
from sklearn.tree import plot_tree

In [506]:
'''區分訓練及預測集'''
X = df.drop(['Survived'],axis=1)
y = df['Survived']
X = pd.get_dummies(X,drop_first=True)

X_train_data = X.iloc[:891,:]
X_test_data = X.iloc[891:,:]

y_train_data = y[:891]

X_train, X_test, y_train, y_test = train_test_split(X_train_data,y_train_data, test_size=0.15, random_state=101)

scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

scaled_X_test_final = scaler.transform(X_test_data)

In [507]:
X_train_data

Unnamed: 0,Pclass,family_num,Age_range_Code,Fare_range_Code,Sex_male,addresser_Col,addresser_Countess,addresser_Don,addresser_Dona,addresser_Dr,...,addresser_Major,addresser_Master,addresser_Miss,addresser_Mlle,addresser_Mme,addresser_Mr,addresser_Mrs,addresser_Ms,addresser_Rev,addresser_Sir
0,3,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,3,5,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,0,2,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1,1,3,4,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,3,0,3,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,0,2,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
887,1,0,1,4,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
888,3,3,2,3,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
889,1,0,2,4,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


# SVM(SVC)

In [472]:
# model = SVC(class_weight='balanced')
# #Adjusted Parameters
# param_grid = {'C':[0.1,1,10,100],
#              'kernel':['linear','rbf','poly'],
#               'gamma':['scale','auto', 1,0.1,0.01,0.001],
#               'degree':[2,3]}
# Grid = GridSearchCV(model,
#                     param_grid,
#                     cv=5,
#                    scoring = "accuracy",
#                    verbose=True,
#                    )
# gd.fit(scaled_X_train, y_train)

# print(gd.best_score_)
# print(gd.best_estimator_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
0.8777426377002413
KNeighborsClassifier(leaf_size=1, n_neighbors=18)
0.7777777777777778


# KNN

In [508]:
n_neighbors = [6,7,8,9,10,11,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50]
algorithm  = ['auto']
weights = ['uniform','distance']
leaf_size = list(range(1,50,5)) # 1-50 in the increments of 5
#Define hyperparamenters
hyperparams = {'algorithm': algorithm, 'weights': weights, 'leaf_size': leaf_size, 
              'n_neighbors': n_neighbors}
gd=GridSearchCV(estimator=KNeighborsClassifier(),param_grid=hyperparams,verbose=False,cv=10,scoring="roc_auc")
gd.fit(scaled_X_train, y_train)

print(gd.best_score_)
print(gd.best_estimator_)

0.8711361120492326
KNeighborsClassifier(leaf_size=1, n_neighbors=12)


# LogisticRegression

In [None]:
model = LogisticRegression(max_iter=5000)

param_grid = {'penalty':['l1', 'l2','elasticnet'],
             'C':np.linspace(0,3,15),
             'solver':['newton-cg','lbfgs','liblinear','sag','saga']}
gd=GridSearchCV(estimator=model,param_grid=param_grid,verbose=False,cv=10,scoring="accuracy")
gd.fit(scaled_X_train,y_train)

print(gd.best_score_)
print(gd.best_estimator_)

# Random Forest

In [510]:
random = RandomForestClassifier()
param_grid = {'bootstrap': [True, False],
             'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
             'max_features': ['auto', 'sqrt'],
             'min_samples_leaf': [1, 2, 4],
             'min_samples_split': [2, 5, 10],
             'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
gd=GridSearchCV(estimator=gradient,param_grid=param_grid,verbose=False,cv=10,scoring="accuracy")
gd.fit(scaled_X_train,y_train)

print(gd.best_score_)
print(gd.best_estimator_)

KeyboardInterrupt: 

# gradient

In [None]:
param_grid = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
    }
gd=GridSearchCV(estimator=gradient,param_grid=param_grid,verbose=False,cv=10,scoring="accuracy")
gd.fit(scaled_X_train,y_train)

print(gd.best_score_)
print(gd.best_estimator_)

# predict result

In [492]:
gd.best_params_

{'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 20, 'weights': 'uniform'}

In [509]:
y_pred = gd.predict(scaled_X_test_final)
temp = pd.DataFrame(pd.read_csv("test.csv")['PassengerId'])
temp['Survived'] = y_pred.astype(int) # Kaggle不接受float
temp.to_csv("submission.csv", index = False)

In [None]:
# submission = pd.DataFrame(test_pred,columns=['Survived'])
# submission
# PassengerId = pd.DataFrame(list(range(892,1310)),columns=['PassengerId'])
# submission = pd.concat([PassengerId,submission],axis=1)

In [None]:
'''儲存資料'''
# from joblib import load,dump
# dump(grid.best_estimator_,'Log_grid.joblib')
# from joblib import load,dump
# model_load = load('../titanic machine learning project/Logistic & values report/Log_grid.joblib')
# test_pred = model_load.predict(df_test)

In [None]:
# df_test[['addresser_Col.',
#  'addresser_Countess.',
#  'addresser_Don.',
#  'addresser_Jonkheer.',
#  'addresser_Lady.',
#  'addresser_Major.',
#  'addresser_Mlle.',
#  'addresser_Mme.',
#  'addresser_Sir.']] = 0