# Kaggle Titanic Solution

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import re
%matplotlib inline

### Import Train/Test Datasets; both are placed in the same folder as this notebook

In [None]:
# Load the train and test CSVs; the paths are relative to the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

### Check first 5 rows of the Train/Test dataset imported

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

### Check variables and missing values in Train/Test

In [None]:
# Summary statistics for the training frame.
train.describe() #891 rows in Train

In [None]:
# Column dtypes and non-null counts; columns with fewer non-null entries have missing values.
train.info() #891 rows in Train

In [None]:
# Summary statistics for the test frame.
test.describe() #418 rows in Test, test starts from 892 onwards

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

### Concatenate Train and Test & Check Missing Values

In [None]:
# Stack train and test so that cleaning and feature engineering are applied to
# both in one pass. ignore_index=True gives every row a unique label; without
# it the test rows reuse the labels 0..417 already present in train, which
# makes any label-based selection ambiguous.
titanic = pd.concat([train, test], sort=False, ignore_index=True)
titanic.describe()

### Check Missing Values using info()

In [None]:
# Non-null counts per column of the combined frame reveal where values are missing.
titanic.info()

### Treat Age & Fare

In [None]:
# Impute missing Age and Fare values with the median of the respective column
# (computed over the combined train+test frame).
for numeric_col in ('Age', 'Fare'):
    col_median = titanic[numeric_col].median()
    titanic[numeric_col] = titanic[numeric_col].fillna(col_median)
titanic.info()

In [None]:
# Age: replace the raw value with an ordinal band 0-4.
# Bands are right-closed: <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4.
# np.select takes the first matching condition, so each test only needs its upper edge.
# NOTE: this overwrites 'Age' in place, so the cell must not be run twice on the same frame.
age_raw = titanic['Age']
titanic['Age'] = np.select(
    [age_raw <= 16, age_raw <= 32, age_raw <= 48, age_raw <= 64, age_raw > 64],
    [0, 1, 2, 3, 4],
    default=np.nan,  # only reached for NaN ages, which stay NaN
)

In [None]:
# Mapping Fare to an ordinal band 0-3.
# Bands are right-closed: <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3.
# np.select takes the first matching condition, so each test only needs its upper edge.
# NOTE: this overwrites 'Fare' in place, so the cell must not be run twice on the same frame.
fare_raw = titanic['Fare']
titanic['Fare'] = np.select(
    [fare_raw <= 7.91, fare_raw <= 14.454, fare_raw <= 31, fare_raw > 31],
    [0, 1, 2, 3],
    default=np.nan,  # only reached for NaN fares, which stay NaN
)

### Treat Cabin

In [None]:
# Count the missing Cabin entries.
# Summing the boolean null-mask counts its True values.
int(titanic['Cabin'].isnull().sum())

In [None]:
# Most Cabin values are missing, so fill them with the placeholder 'Missing'
# instead of imputing a real cabin.
titanic['Cabin'] = titanic['Cabin'].fillna('Missing')
titanic.info()

### Treat Embarked

In [None]:
# Check which port most passengers embarked from.
titanic['Embarked'].value_counts()

In [None]:
# idxmax returns the index label of the largest value in a Series;
# applied to value_counts() it yields the most frequent Embarked value.
titanic['Embarked'].value_counts().idxmax()

In [None]:
# Fill missing Embarked values with the most frequent port. The value is
# derived from the data rather than hard-coded, so the cell stays correct if
# the input files change.
most_common_port = titanic['Embarked'].value_counts().idxmax()
titanic['Embarked'] = titanic['Embarked'].fillna(most_common_port)

In [None]:
# Check the non-null counts again after the imputation steps.
titanic.info()

In [None]:
# Summary statistics of the combined frame after cleaning.
titanic.describe()

# Creating Features

In [None]:
# Keep only the first character of each Cabin value; the 'Missing'
# placeholder therefore becomes 'M'.
titanic['Cabin'] = titanic['Cabin'].str[0]
titanic['Cabin'].value_counts()

In [None]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1

# IsAlone is 1 when FamilySize is exactly 1, otherwise 0.
titanic['IsAlone'] = (titanic['FamilySize'] == 1).astype('int64')

In [None]:
def get_title(name):
    """Extract the honorific (e.g. ``Mr``, ``Mrs``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Args:
        name: Full passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns:
        The matched title without the trailing period, or ``""`` when the
        name contains no such pattern.
    """
    # Raw string: a backslash-dot in a normal string literal is an invalid
    # escape sequence, which Python reports as a warning.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [None]:
# Derive a Title column by applying get_title to every Name.
titanic['Title'] = titanic['Name'].apply(get_title)

In [None]:
# Inspect the frame with the new Title column.
titanic.head()

In [None]:
# Collapse the rarer titles into two coarse groups.
rare_title_groups = {
    'Officer': ['Capt', 'Col', 'Dr', 'Major', 'Rev'],
    'Royalty': ['Lady', 'Countess', 'Don', 'Sir', 'Jonkheer', 'Dona'],
}
for group_label, raw_titles in rare_title_groups.items():
    titanic['Title'] = titanic['Title'].replace(raw_titles, group_label)

In [None]:
# Fold alternative spellings into the common titles they stand for.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
titanic['Title'] = titanic['Title'].replace(title_aliases)

In [None]:
# Frequency of each title after grouping.
titanic['Title'].value_counts()

In [None]:
def get_ticket(ticket):
    """Reduce a raw ticket string to a short prefix code.

    Slashes and periods are removed and the text is upper-cased. A ticket
    that is then purely numeric maps to ``'xxx'``; otherwise the first match
    of the prefix pattern is returned, or ``""`` when nothing matches.
    """
    cleaned = ticket.replace("/", "").replace(".", "").upper()

    if cleaned.isdigit():
        return 'xxx'

    prefix_match = re.search(r'[A-Z]+[A-Z0-9]+|[A-Z]|[A-Za-z]', cleaned)
    return prefix_match.group(0) if prefix_match else ""

In [None]:
# Replace each raw ticket string with the code returned by get_ticket.
titanic['Ticket'] = titanic['Ticket'].apply(get_ticket)

In [None]:
# Frequency of each ticket code.
titanic['Ticket'].value_counts()

In [None]:
# Inspect the frame after all feature engineering steps.
titanic.head()

## Model 1: Variables for the Model

In [None]:
# Drop the identifier, the raw name, and the two columns already summarised by FamilySize.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch']
titanic_all = titanic.drop(columns=unused_columns)

In [None]:
# Preview the modelling frame.
titanic_all.head()

### Divide Dataset into train and test after cleaning missing values

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each category.
titanic_dummies=pd.get_dummies(titanic_all, drop_first=True)
titanic_dummies.info()

In [None]:
# Get Train: the rows that have a Survived value.
t_train = titanic_dummies[titanic_dummies['Survived'].notnull()]
t_train.describe()

In [None]:
# Get Test: the rows whose Survived value is missing (the submission set).
s_test = titanic_dummies[titanic_dummies['Survived'].isnull()]
s_test.describe()

In [None]:
# Preview the encoded training rows.
t_train.head()

In [None]:
# Feature matrix: every column except the first.
# NOTE(review): assumes 'Survived' is the first column of t_train — confirm.
X = t_train.iloc[:,1:]
X.head()

In [None]:
# Target vector. After concatenating train and test, 'Survived' contains NaN
# for the test rows and is therefore stored as float. t_train holds only the
# non-null rows, so cast back to int: the classifiers then predict integer
# class labels and the submission files contain 0/1 instead of 0.0/1.0.
y = t_train['Survived'].astype(int)
y.head()

In [None]:
import sklearn.model_selection as model_selection
# Hold out 20% of the labelled rows for validation; the split is seeded.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2, random_state = 200)

### Decision Tree (DT)

In [48]:
import sklearn.tree as tree
# Decision tree with depth 3 and a fixed seed; this cell only constructs it.
clf=tree.DecisionTreeClassifier(max_depth=3,random_state=200)

In [None]:
# Tune max_depth with a cross-validated grid search.
# GridSearchCV fits clones of the estimator it is given, so the `clf` object
# passed in is never fitted itself. Rebind `clf` to the refitted best model so
# that the score below and any later clf.predict call use a trained tree.
mod=model_selection.GridSearchCV(clf,param_grid={'max_depth':[5,10,15,20,25,30,35,40,45,50,100,200,300,400]})
mod.fit(X_train,y_train)
clf=mod.best_estimator_
clf.score(X_test,y_test)

In [None]:
# Predict for the submission rows using every column of s_test except the first.
t_predictions = clf.predict(s_test.iloc[:,1:])
#t_predictions

In [None]:
# Get the passenger IDs from the original test frame.
PassengerId = test['PassengerId']
#PassengerId

In [47]:
# Generate Submission File: pair each passenger ID with the prediction held in
# t_predictions and write the result to CSV without the index column.
NumSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': t_predictions })
NumSubmission.to_csv("NumSubmission_with_all_DT.csv", index=False)

### Logistic Regression

In [None]:
#Import libraries
from sklearn.linear_model import LogisticRegression

In [None]:
# Build the model with default settings and fit it on the training split.
logreg = LogisticRegression()
logreg.fit( X_train, y_train)

In [None]:
# Score of the logistic regression on the held-out split.
logreg.score(X_test,y_test)

In [None]:
# Predictions on the held-out split; these feed the classification report.
t_predictions = logreg.predict(X_test)
t_predictions

## Evaluation on test

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Classification report comparing the held-out labels with t_predictions.
print(classification_report(y_test,t_predictions))

## Prediction for Submission test 

In [None]:
# Predict for the submission rows; this rebinds t_predictions, which
# previously held the held-out predictions.
t_predictions = logreg.predict(s_test.iloc[:,1:])
t_predictions

In [None]:
# Get the passenger IDs from the original test frame.
PassengerId = test['PassengerId']
PassengerId

In [None]:
# Generate Submission File from the logistic regression predictions.
NumSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': t_predictions })
NumSubmission.to_csv("NumSubmission_with_all_Logistic.csv", index=False)

# BaggingClassifier

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Bagging ensemble of 20 decision trees with out-of-bag scoring enabled.
# This rebinds `clf`, which earlier referred to the decision tree model.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version.
clf=BaggingClassifier(oob_score=True,n_jobs=-1,n_estimators=20,random_state=400,
                      base_estimator=DecisionTreeClassifier())

In [None]:
# Fit the bagging ensemble on the training split.
clf.fit(X_train,y_train)

In [None]:
# Out-of-bag score of the fitted ensemble.
clf.oob_score_

In [None]:
# Score on the held-out split.
clf.score(X_test,y_test)

In [None]:
# Sweep max_features (1-4) and n_estimators (10, 30, ..., 290) and report the
# out-of-bag score of each combination. Both settings are printed so that a
# score can be traced back to the configuration that produced it.
for m in range(1,5,1):
    for w in range(10,300,20):
        clf=BaggingClassifier(oob_score=True,n_jobs=-1,n_estimators=w,random_state=400,max_features=m,
                              base_estimator=DecisionTreeClassifier())
        clf.fit(X_train,y_train)
        oob=clf.oob_score_
        print ('For max_features = '+str(m))
        print ('For n_estimators = '+str(w))
        print ('OOB score is '+str(oob))
        print ('************************')

In [None]:
# Rebuild the bagging ensemble with 130 trees (presumably the value chosen
# from the sweep output — confirm).
clf=BaggingClassifier(oob_score=True,n_jobs=-1,n_estimators=130,random_state=400,
                      base_estimator=DecisionTreeClassifier())

In [None]:
# Fit the 130-tree ensemble on the training split.
clf.fit(X_train,y_train)

In [None]:
# Out-of-bag score of the 130-tree ensemble.
clf.oob_score_

In [None]:
# Score of the 130-tree ensemble on the held-out split.
clf.score(X_test,y_test)

In [None]:
# Feature importances of the first tree in the ensemble.
print (clf.estimators_[0].feature_importances_)

In [None]:
# Collect the feature importances of every tree in the ensemble, then average
# them element-wise to get one importance value per feature.
per_tree_importances = [member.feature_importances_ for member in clf.estimators_]
imp = np.mean(per_tree_importances, axis=0)

In [None]:
# Label the averaged importances with the column names of X.
feature_importance=pd.Series(imp,index=X.columns.tolist())

In [None]:
# Most important features first.
feature_importance.sort_values(ascending=False)

In [None]:
# Bagging predictions for the submission rows (all columns of s_test except the first).
predictions_clf = clf.predict(s_test.iloc[:,1:])

In [None]:
# Generate Submission File from the bagging predictions.
NumSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': predictions_clf })
NumSubmission.to_csv("NumSubmission_with_all_Bagging.csv", index=False)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 80 trees, out-of-bag scoring enabled, and a fixed seed.
rf=RandomForestClassifier(n_estimators=80,oob_score=True,n_jobs=-1,random_state=400)

In [None]:
# Fit the forest on the training split.
rf.fit(X_train,y_train)

In [None]:
# Out-of-bag score of the fitted forest.
rf.oob_score_

In [None]:
# Report the out-of-bag score for forests of 10, 30, ..., 290 trees.
for w in range(10, 300, 20):
    # fit() returns the estimator itself, so construction and fitting can be chained.
    rf = RandomForestClassifier(
        n_estimators=w, oob_score=True, n_jobs=-1, random_state=400
    ).fit(X_train, y_train)
    oob = rf.oob_score_
    print(
        'For n_estimators = ' + str(w),
        'OOB score is ' + str(oob),
        '************************',
        sep='\n',
    )

In [None]:
# Finalize on 90 trees (presumably chosen from the sweep output — confirm).
rf=RandomForestClassifier(n_estimators=90,oob_score=True,n_jobs=-1,random_state=400)

In [None]:
# Fit the 90-tree forest on the training split.
rf.fit(X_train,y_train)

In [None]:
# Out-of-bag score of the 90-tree forest.
rf.oob_score_

In [None]:
# Raw feature importances of the fitted forest.
rf.feature_importances_

In [None]:
# Label the forest's importances with the column names of X.
imp_feat=pd.Series(rf.feature_importances_,index=X.columns.tolist())

In [None]:
# Most important features first.
imp_feat.sort_values(ascending=False)

In [None]:
# Bar chart of the features whose importance exceeds 0.01, largest first.
imp_feat[imp_feat>0.01].sort_values(ascending=False).plot(kind='bar')

In [None]:
# Random-forest predictions for the submission rows (all columns of s_test except the first).
predictions_rf = rf.predict(s_test.iloc[:,1:])

In [None]:
# Generate Submission File from the random-forest predictions.
# predictions_rf is the forest's output; predictions_clf belongs to the
# bagging model and would silently duplicate the bagging submission here.
NumSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': predictions_rf })
NumSubmission.to_csv("NumSubmission_with_all_RF.csv", index=False)

# RF Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values for each hyperparameter sampled by the randomized search.
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 300, num = 10)]
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers (so the two candidates
# were the same setting) and newer scikit-learn releases reject it; search
# over two distinct options that every release accepts.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = no depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
# Create the random grid: maps each RandomForestClassifier parameter name to
# the list of candidate values defined for it.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune. It is seeded so that every candidate
# forest is built deterministically; seeding only the search (below) fixes
# which combinations are sampled but not how each forest is grown.
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,y_train)

In [None]:
# Best parameter combination found by the randomized search.
rf_random.best_params_

In [None]:
# Finalize using best: 203 trees, out-of-bag scoring, all cores, fixed seed.
rf = RandomForestClassifier(
    n_estimators=203,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)