In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p

# Disable pandas' display truncation: every column and every row of a frame is
# rendered. NOTE(review): with max_rows=None a bare frame expression prints all
# rows, so inspection cells should keep using .head() or similar.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.model_selection import KFold, cross_val_score, ShuffleSplit,cross_validate
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier,ExtraTreesClassifier,BaggingClassifier,VotingClassifier
from sklearn.naive_bayes import GaussianNB 

import warnings
def ignore_warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, returns None."""
    return None


# Swap the module-level hook so every later ``warnings.warn(...)`` call is silently
# discarded. NOTE(review): this also hides pandas' chained-assignment warnings and
# library deprecation notices; a scoped ``warnings.filterwarnings`` may be safer.
warnings.warn = ignore_warn

In [None]:
# Read both CSV splits from the current working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Keep the test ids aside: the column is removed from the working frame below
# and is needed again when the submission frame is assembled.
df_test_PassengerId = df_test['PassengerId']

# Stack train on top of test (fresh 0..n-1 index) so both go through the same
# preprocessing, and discard the columns that are not used afterwards.
df = (
    pd.concat([df_train, df_test], ignore_index=True)
    .drop(columns=['PassengerId', 'Cabin', 'Ticket'])
)
df.head()

In [None]:
# Summary of the combined frame: column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column of the combined train+test frame.
df.isna().sum()

Fill NaN

In [None]:
# Missing values are filled by assigning the result back to the column.
# `df[col].fillna(..., inplace=True)` operates on an intermediate object: it is
# deprecated in pandas 2.x and does not update `df` at all under Copy-on-Write.

# Embarked: most frequent value.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Fare: mean fare of the passenger's own Pclass. This yields the same value as the
# previous hard-coded "mean of Pclass 3" whenever the missing fares belong to
# class 3, and stays correct if a fare is missing in another class.
df['Fare'] = df['Fare'].fillna(df.groupby('Pclass')['Fare'].transform('mean'))

# Age: overall median.
# NOTE(review): all three statistics are computed on train+test combined.
df['Age'] = df['Age'].fillna(df['Age'].median())

Feature Engineering

In [None]:
# FamilySize = Parch + SibSp + 1 (the +1 presumably counts the passenger).
df['FamilySize'] = df['Parch'] + df['SibSp'] + 1

# Bucket the family size into four labels.
# The assignment must go through .loc: `df[mask]['FamilySize_Cat'] = ...` writes
# to a temporary copy, which left every row labelled 'Alone'.
df['FamilySize_Cat'] = 'Alone'
df.loc[df['FamilySize'].between(2, 4), 'FamilySize_Cat'] = 'Small'    # 2..4 inclusive
df.loc[df['FamilySize'].between(5, 6), 'FamilySize_Cat'] = 'Medium'   # 5..6 inclusive
df.loc[df['FamilySize'] >= 7, 'FamilySize_Cat'] = 'Large'

# Fare in 4 equal-frequency bins, Age in 5 equal-width bins.
df['FareBin'] = pd.qcut(df['Fare'], 4)
df['AgeBin'] = pd.cut(df['Age'], 5)

# Title is the text between ", " and the first "." of Name.
df['Title'] = df['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
title_count = df['Title'].value_counts()
# Collapse titles seen fewer than 10 times into 'Misc'. A title that could not be
# extracted (NaN) has no count and is mapped to 'Misc' as well instead of raising.
df['Title'] = df['Title'].where(df['Title'].map(title_count) >= 10, 'Misc')

In [None]:
# Name is no longer needed once Title has been derived from it.
df = df.drop(columns='Name')

In [None]:
# Preview the frame after feature engineering (Name column removed).
df.head()

Encoding

In [None]:
# Categorical columns to replace with indicator (dummy) columns.
dummy_col = ['Embarked','Sex','FamilySize_Cat','FareBin','AgeBin','Title']

# Encode all of them in one call instead of a drop-and-concat loop: the untouched
# columns keep their order and the indicator columns are appended in `dummy_col`
# order, each named "<column>_<level>". drop_first=True omits the first level of
# every column. dtype is pinned to uint8 because pandas >= 2 defaults to bool
# indicators, which would make `DataFrame.values` an object array once mixed
# with the float columns.
df = pd.get_dummies(df, columns=dummy_col, prefix=dummy_col, drop_first=True, dtype=np.uint8)

In [None]:
# Preview the frame after the categorical columns were replaced by dummy columns.
df.head()

In [None]:
# The first n_train rows of the combined frame are the training rows.
n_train = len(df_train)
train_part = df.iloc[:n_train, :]

# Keep the columns whose absolute correlation with Survived exceeds 0.1.
corrmat = train_part.corr()
top_corr_features = corrmat.index[corrmat["Survived"].abs() > 0.1]

# Annotated correlation heatmap restricted to those columns.
plt.figure(figsize=(10, 10))
g = sns.heatmap(train_part[top_corr_features].corr(), annot=True, cmap="RdYlGn")

Continuous Features

In [None]:
# A column is treated as continuous when it is not object-typed and has more than
# 10 distinct values (a missing value counts as one distinct value).
continuous_features = [
    feature
    for feature in df.columns
    if df[feature].dtype != 'object' and df[feature].nunique(dropna=False) > 10
]
continuous_features, len(continuous_features)

In [None]:
# Skewness of each continuous feature, largest first, shown as a one-column frame.
sk = (
    df[continuous_features]
    .apply(skew)
    .sort_values(ascending=False)
    .to_frame()
)
sk

In [None]:
# Candidate lambdas for boxcox1p.
ch = [0,0.03,0.05,0.08,0.1,0.13,0.15]

# One row per continuous feature. Column 'Org' holds the skewness of the
# untransformed data, followed by one column per candidate lambda (labels are
# floats, so 0 appears as 0.0) holding the skewness after boxcox1p.
df__ = pd.concat(
    [
        pd.DataFrame({'Org': skew(df[continuous_features])}, index=continuous_features),
        pd.DataFrame(
            {float(choice): skew(boxcox1p(df[continuous_features], choice)) for choice in ch},
            index=continuous_features,
        ),
    ],
    axis=1,
)

# For each feature, start from 'Org' and move to a column whenever its skewness
# is non-negative and strictly smaller than the current best. Transforms that
# push the skewness below zero are therefore never selected.
skew_result = {}
for feature_name, row in df__.iterrows():
    best = 'Org'
    for label, value in row.items():
        if 0 <= value < row.loc[best]:
            best = label
    skew_result[feature_name] = best

print(skew_result)
# Keep only the features for which some lambda beat the untransformed data.
skew_result = {name: lam for name, lam in skew_result.items() if lam != 'Org'}

In [None]:
# Overwrite each selected feature with its boxcox1p transform at the chosen lambda.
# The columns are replaced in place of the originals, so running this cell a
# second time would transform the already-transformed values again.
for column, lam in skew_result.items():
    df[column] = boxcox1p(df[column], lam)

In [None]:
# Split the combined frame back into its two parts: the first n_train rows are
# the training rows, the remainder the test rows.
df_train = df.iloc[:n_train, :]

# Drop Survived from the test part with a non-mutating call. An in-place drop on
# an iloc slice of `df` mutates an object derived from another frame and
# triggers pandas' SettingWithCopy warning.
df_test = df.iloc[n_train:, :].drop(columns='Survived')

# Feature matrix and target for the training rows.
x = df_train.drop(columns='Survived')
y = df_train['Survived']

In [None]:
# Validation
# 10 random splits with a fixed seed; each uses 60% of the rows for training and
# 30% for testing (0.6 + 0.3 = 0.9, so presumably 10% is left out of every split).
ss = ShuffleSplit(n_splits=10, test_size=.3, train_size=.6, random_state=0)


def acc(model):
    """Cross-validate `model` on the module-level `x` / `y` with the `ss` splitter.

    Returns a tuple ``(mean train score, mean test score)`` averaged over the splits.
    """
    scores = cross_validate(model, x.values, y.values, cv=ss, return_train_score=True)
    mean_train = scores['train_score'].mean()
    mean_test = scores['test_score'].mean()
    return mean_train, mean_test

In [None]:
# Base learners. Every estimator that accepts a seed is given 0.
cf1 = XGBClassifier(
    learning_rate=0.01,
    max_depth=4,
    n_estimators=300,
    seed=0,
)
cf2 = AdaBoostClassifier(
    learning_rate=0.1,
    n_estimators=300,
    random_state=0,
)
cf3 = GradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=2,
    n_estimators=300,
    random_state=0,
)
cf4 = RandomForestClassifier(
    criterion='entropy',
    max_depth=6,
    n_estimators=100,
    oob_score=True,
    random_state=0,
)
cf5 = ExtraTreesClassifier(
    criterion='entropy',
    max_depth=6,
    n_estimators=100,
    random_state=0,
)
cf6 = BaggingClassifier(
    max_samples=0.25,
    n_estimators=300,
    random_state=0,
)
cf7 = GaussianNB()

# (name, estimator) pairs in the form VotingClassifier takes for `estimators`.
clf = [
    ('xgb', cf1),
    ('ada', cf2),
    ('gbm', cf3),
    ('rf', cf4),
    ('et', cf5),
    ('bbc', cf6),
    ('gnb', cf7),
]

In [None]:
# Two voting ensembles over the same base learners: one with voting='hard', one
# with voting='soft'.
ens_hard = VotingClassifier(
    estimators=clf,
    voting='hard',
)
ens_soft = VotingClassifier(
    estimators=clf,
    voting='soft',
)
# A second-level hard vote between the two ensembles above.
ens = VotingClassifier(
    estimators=[('ensh', ens_hard), ('enss', ens_soft)],
    voting='hard',
)

# Fit all three on the full training data.
# NOTE(review): VotingClassifier is documented to fit clones of its estimators,
# so the two standalone fits are presumably not reused by `ens` - confirm before
# removing them to save training time.
ens_hard.fit(x.values, y.values)
ens_soft.fit(x.values, y.values)
ens.fit(x.values, y.values)

In [None]:
# Pair every test PassengerId with the label predicted by the stacked ensemble,
# cast to int32. Columns are ordered PassengerId, Survived.
df_submit = pd.DataFrame({
    'PassengerId': df_test_PassengerId,
    'Survived': ens.predict(df_test.values).astype(np.int32),
})
# df_submit.to_csv('submit.csv',index=False)