# Kaggle Titanic Solution

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import re
%matplotlib inline

### Import Train/Test Datasets; both are placed in the same folder as this notebook

In [None]:
# Load both Kaggle CSV files, which sit in the same folder as this notebook.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

### Check first 5 rows of the Train/Test dataset imported

In [None]:
# Preview the first five rows of the training set.
train.head(n=5)

In [None]:
# Preview the first five rows of the test set.
test.head(n=5)

### Check variables and missing values in Train/Test

In [None]:
# Summary statistics for Train (891 rows in Train).
train.describe() #891 rows in Train

In [None]:
# Summary statistics for Test.
test.describe() #418 rows in Test; test starts from 892 onwards (right after Train's 891 rows)

### Concatenate Train and Test & Check Missing Values

In [None]:
# Stack Train on top of Test so that cleaning and feature engineering are
# applied to both at once. NOTE: ignore_index is not set, so each frame keeps
# its own row labels; later cells split the frame back by position (iloc).
titanic = pd.concat(objs=(train, test), sort=False)
titanic.describe()

### Check Missing Values using info()

In [None]:
# The per-column non-null counts printed by info() show which columns have missing values.
titanic.info()

### Treat Age & Fare

In [None]:
# Impute missing Age and Fare values with the mean of each column
# (the mean is taken over the combined Train+Test frame).
for numeric_col in ('Age', 'Fare'):
    titanic[numeric_col] = titanic[numeric_col].fillna(titanic[numeric_col].mean())
titanic.info()

### Treat Cabin

In [None]:
# Count how many Cabin entries are missing:
# build a boolean mask first, then let numpy count its True values.
cabin_is_missing = titanic['Cabin'].isna()
np.count_nonzero(cabin_is_missing)

In [None]:
# Cabin is missing for most passengers, so mark the gaps with an explicit
# 'Missing' label.
titanic['Cabin'] = titanic['Cabin'].fillna(value='Missing')
titanic.info()

### Treat Embarked

In [None]:
#Check which port most of the passengers embarked from
titanic['Embarked'].value_counts()

In [None]:
#Use idxmax to extract the index label of the Series entry whose value is the maximum,
#i.e. the most frequent Embarked value
titanic['Embarked'].value_counts().idxmax()

In [None]:
# Fill missing Embarked values with the most frequent port. The value is
# derived from the data instead of being hard-coded as 'S', so the cell stays
# correct if the input data changes.
most_common_port = titanic['Embarked'].value_counts().idxmax()
titanic['Embarked'] = titanic['Embarked'].fillna(most_common_port)

In [None]:
#Check again: the columns filled above (Age, Fare, Cabin, Embarked) should no longer report missing values
titanic.info()

In [None]:
# Summary statistics of the combined frame after imputation.
titanic.describe()

# Creating Features

In [None]:
# Store Pclass as strings; info() below shows the resulting column dtype.
# NOTE(review): presumably so the class is treated as a label rather than a quantity — confirm.
titanic['Pclass'] = titanic['Pclass'].astype(str)
titanic.info()

In [None]:
def get_title(name: str) -> str:
    """Extract the honorific (e.g. ``Mr``, ``Miss``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Parameters
    ----------
    name : str
        Full passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    str
        The matched title without the trailing period, or ``""`` when no
        title is found or ``name`` is not a string (e.g. a NaN cell).
    """
    # A missing name reaches this function as a float NaN, on which re.search
    # would raise TypeError; treat any non-string as "no title".
    if not isinstance(name, str):
        return ""
    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [None]:
# Derive a Title column by running get_title over every passenger name.
titanic['Title'] = titanic['Name'].map(get_title)

In [None]:
# Group the listed honorifics under a single 'Rare' label.
rare_titles = [
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
    'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
]
titanic['Title'] = titanic['Title'].replace(rare_titles, 'Rare')

In [None]:
# Inspect the first five rows, now including the new Title column.
titanic.head(n=5)

In [None]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
titanic['FamilySize'] = titanic['SibSp'].add(titanic['Parch']).add(1)

## Model 1 : Numericals Only

In [None]:
# Columns that are identifiers or text labels and so cannot be fed to a
# numeric-only model. 'Title' (created in the feature section) holds strings
# such as 'Mr'; leaving it in makes LogisticRegression.fit fail with
# "could not convert string to float".
non_numeric_cols = ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']
titanic_num = titanic.drop(non_numeric_cols, axis=1)
# Pclass was cast to str earlier in the notebook; convert it back to integers
# so every remaining feature is genuinely numeric rather than relying on an
# implicit string-to-float conversion inside scikit-learn.
titanic_num['Pclass'] = titanic_num['Pclass'].astype(int)

In [None]:
# Inspect the first five rows of the reduced frame.
titanic_num.head(n=5)

### Divide Dataset into train and test after cleaning missing values

In [None]:
#Get Train
# Train was placed first in the concat, so its rows are the first len(train)
# positions; use the actual length instead of the hard-coded 891.
t_train = titanic_num.iloc[:len(train)]
t_train.describe()

In [None]:
#Get Test
# Everything after the first len(train) positions came from the Test file;
# use the actual length instead of the hard-coded 891.
s_test = titanic_num.iloc[len(train):]
s_test.describe()

In [None]:
# Feature matrix: every column except the first one.
# NOTE(review): assumes 'Survived' is the first column of t_train — confirm against titanic_num.head().
X = t_train.iloc[:,1:]
X.head()

In [None]:
# Target vector: the Survived label of the training rows.
# NOTE(review): Survived is presumably float (0.0/1.0) here, because the Test rows
# carried no label when the two frames were concatenated — confirm with y.dtype.
y = t_train['Survived']
y.head()

In [None]:
import sklearn.model_selection as model_selection

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

### Logistic Regression

In [None]:
#Import libraries
from sklearn.linear_model import LogisticRegression

In [None]:
#Build model
logreg = LogisticRegression()
logreg.fit( X_train, y_train)
#NOTE: y_train comes from a single-column selection (a 1D Series), so no ravel() call is needed here.
#Ravel would only be used to convert to a single 1D array | https://www.geeksforgeeks.org/numpy-ravel-python/

In [None]:
# Score the fitted model on the hold-out split produced by train_test_split.
logreg.score(X_test,y_test)

In [None]:
# Predicted labels for the hold-out split (compared against y_test below).
t_predictions = logreg.predict(X_test)
t_predictions

## Evaluation on the hold-out (validation) split

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Report for the hold-out predictions; print() is used so the multi-line report text is rendered line by line.
print(classification_report(y_test,t_predictions))

## Prediction for Submission test 

In [None]:
# Predict on the submission rows, skipping the first column of s_test.
# NOTE(review): assumes the first column is 'Survived' (unlabelled for these rows) — confirm.
# NOTE: this reuses the name t_predictions, overwriting the hold-out predictions made earlier.
t_predictions = logreg.predict(s_test.iloc[:,1:])
t_predictions

In [None]:
#Get Passenger ID from Test
PassengerId = test['PassengerId']
PassengerId

In [None]:
# Generate Submission File
# The training labels are floats after the Train/Test concat, so the model
# predicts 0.0/1.0; cast to int so the file contains the 0/1 integers the
# submission format expects.
NumSubmission = pd.DataFrame({
    'PassengerId': PassengerId,
    'Survived': np.asarray(t_predictions).astype(int),
})
NumSubmission.to_csv("NumSubmission.csv", index=False)