In [88]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Read data from the provided CSV
The data is loaded with the `read_csv` function from the pandas library.

In [89]:
# Load the Titanic train and test CSV files into DataFrames.
train_data=pd.read_csv('/kaggle/input/titanic/train.csv')
test_data=pd.read_csv('/kaggle/input/titanic/test.csv')

# An overview of our available data
## Train data and test data

In [90]:
# Show the first rows of the training frame.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [91]:
# (rows, columns) of the training frame.
train_data.shape

(891, 12)

In [92]:
# Count the missing values in each column of the training frame.
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Data Cleanup
## Before inserting the data into our machine learning model, we need to fill in all the blanks and normalize all the columns.

## Important functions:
- fillna
- mean
- mode
- median
- apply and lambda
- map
- concat
- drop
- get_dummies

In [93]:
def clean_data(df):
    """Impute, scale and one-hot encode a raw Titanic passenger frame.

    The function works on a copy, so the caller's frame is left untouched and
    calling it again on the same input gives the same result.

    Steps:
      * ``Age`` and ``Fare`` gaps are filled with the column mean; ``Embarked``
        gaps are filled with the most frequent value.
      * ``Ticket`` is reduced to its prefix via ``get_head_ticket``.
      * ``Age``, ``SibSp`` and ``Parch`` are divided by fixed constants and
        ``Fare`` is replaced by ``log(Fare + 1)``.
      * ``Pclass`` is mapped to the labels First/Second/Third, then
        ``Embarked``, ``Sex``, ``Ticket`` and ``Pclass`` are one-hot encoded.
      * The raw categorical columns and ``PassengerId``, ``Name``, ``Cabin``
        are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns PassengerId, Name, Pclass, Sex, Age,
        SibSp, Parch, Ticket, Fare, Cabin and Embarked. Any other column
        (for example Survived) is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining numeric columns followed by the
        dummy columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Copy first: every assignment below would otherwise write into the
    # caller's frame and make a second call transform the data twice.
    res = df.copy()

    # --- fill missing values -------------------------------------------------
    res['Age'] = res['Age'].fillna(res['Age'].mean())
    # mode() returns a Series (ties are possible). Passing that Series to
    # fillna aligns on the index and leaves the gaps unfilled, so take the
    # first mode as a scalar instead.
    embarked_modes = res['Embarked'].mode()
    if not embarked_modes.empty:
        res['Embarked'] = res['Embarked'].fillna(embarked_modes.iloc[0])
    res['Fare'] = res['Fare'].fillna(res['Fare'].mean())
    res['Ticket'] = res['Ticket'].apply(get_head_ticket)

    # --- scale numeric columns -----------------------------------------------
    # Fixed divisors; presumably rough upper bounds for each column so the
    # values land near [0, 1] -- TODO confirm the intended ranges.
    res['Age'] = res['Age'] / 100
    res['SibSp'] = res['SibSp'] / 11
    res['Parch'] = res['Parch'] / 7
    # Adding 1 before the log keeps a fare of 0 finite.
    res['Fare'] = np.log(res['Fare'] + 1)

    # --- encode categorical columns ------------------------------------------
    # String labels make get_dummies treat the class as a category, not a number.
    res['Pclass'] = res['Pclass'].map({1: 'First', 2: 'Second', 3: 'Third'})
    categorical = ['Embarked', 'Sex', 'Ticket', 'Pclass']
    dummies = pd.get_dummies(res[categorical])
    res = pd.concat([res, dummies], axis=1)

    return res.drop(columns=categorical + ['PassengerId', 'Name', 'Cabin'])

def get_head_ticket(str):
    """Return the prefix of a two-token ticket string, or '' otherwise.

    The ticket is split on whitespace. Only when that yields exactly two
    tokens is the first one returned; any other token count gives ''.
    """
    tokens = str.split()
    return tokens[0] if len(tokens) == 2 else ''

In [94]:
# Run the same cleaning pipeline on each frame. The two calls are independent,
# so the mean/mode fill values are computed separately for train and for test.
cleaned_train=clean_data(train_data)
cleaned_test=clean_data(test_data)

# Viewing our 'cleaned' train and test data
## More advanced data-cleaning techniques will be covered in our upcoming seminars, so stay tuned :))

In [95]:
# Count the missing values left in each column after cleaning
# (alternative inspections are kept commented out).
#cleaned_train.head()
#cleaned_train.shape
cleaned_train.isnull().sum()

Survived             0
Age                  0
SibSp                0
Parch                0
Fare                 0
Embarked_C           0
Embarked_Q           0
Embarked_S           0
Sex_female           0
Sex_male             0
Ticket_              0
Ticket_A./5.         0
Ticket_A.5.          0
Ticket_A/4           0
Ticket_A/4.          0
Ticket_A/5           0
Ticket_A/5.          0
Ticket_A/S           0
Ticket_A4.           0
Ticket_C             0
Ticket_C.A.          0
Ticket_C.A./SOTON    0
Ticket_CA            0
Ticket_CA.           0
Ticket_F.C.          0
Ticket_F.C.C.        0
Ticket_Fa            0
Ticket_P/PP          0
Ticket_PC            0
Ticket_PP            0
Ticket_S.C./A.4.     0
Ticket_S.C./PARIS    0
Ticket_S.O./P.P.     0
Ticket_S.O.C.        0
Ticket_S.O.P.        0
Ticket_S.P.          0
Ticket_S.W./PP       0
Ticket_SC            0
Ticket_SC/AH         0
Ticket_SC/PARIS      0
Ticket_SC/Paris      0
Ticket_SCO/W         0
Ticket_SO/C          0
Ticket_SOTO

## Passengers in the test data may have ticket types that do not appear in the training data, and vice versa. Therefore, we need to ensure that both dataframes have the same set of columns, in the same order.

In [96]:
# Columns that exist only in the test frame become all-zero features in train.
test_only = [name for name in cleaned_test.columns
             if name not in cleaned_train.columns]
for col in test_only:
    cleaned_train[col] = [0] * len(cleaned_train)

# Columns that exist only in the train frame (the label excluded) become
# all-zero features in test.
train_only = [name for name in cleaned_train.columns
              if name != 'Survived' and name not in cleaned_test.columns]
for col in train_only:
    cleaned_test[col] = [0] * len(cleaned_test)

# Put the test columns in exactly the order of the train feature columns.
feature_order = cleaned_train.drop(columns='Survived').columns
cleaned_test = cleaned_test.reindex(feature_order, axis=1)

# Create our data points and sample outputs for each data points
## X: Input Data Points
## y: Sample Outputs

In [97]:
# Features: every column except the label.
X=cleaned_train.drop(columns='Survived')
# Target: the Survived column.
y=cleaned_train['Survived']
# Display both shapes as a quick check that the row counts match.
X.shape,y.shape

((891, 60), (891,))

# Create our training datasets and testing datasets
## Why do we need to split the data into training datasets and validation datasets?
sklearn is a valuable library for Shallow Machine Learning with many ML models and utility functions.
Example: train_test_split

In [98]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state fixes the shuffle so the split is reproducible.
X_train,X_val,y_train,y_val=train_test_split(X,y,random_state=10,test_size=0.2)

# Using available Machine Learning Model from sklearn
## Logistic Regression

A result is generated based on the probability value produced by a logistic function. Logistic Regression finds this function by fitting a function with random variables to our existing examples.

Too many iterations can cause the model to overfit, but too few can cause underfitting.
![Logistic Regression in 3D](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ7Z_32JSBrqWFHe2QJSFspAPh_L-MxKLZtFJmjp9XdYdGr5fYmsz-i5nK6bySPIpq9CiQ&usqp=CAU)

![Fitting](https://miro.medium.com/max/1400/1*XRCJt-5yNXDfzrVbEbh4DA.gif)

In [102]:
from sklearn.linear_model import LogisticRegression

# max_iter=10 caps the solver at 10 iterations. In this run the solver hits that
# cap and emits the "iterations reached limit" warning shown in the cell output.
logreg_model = LogisticRegression(max_iter=10,random_state=54)
logreg_model.fit(X_train,y_train)
# Mean accuracy on the held-out validation split.
print(logreg_model.score(X_val,y_val))

0.8212290502793296


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## KNN - K-Nearest-Neighbors
Choosing the result for a new data point based on its k nearest neighbours, hence the name :D
![Choosing its label based on its neighbours](https://www.newtechdojo.com/wp-content/uploads/2020/06/KNN-1.gif)

In [105]:
from sklearn.neighbors import KNeighborsClassifier

# Use the 3 nearest neighbours; weights='distance' weights each neighbour's vote
# by the inverse of its distance.
knn_model = KNeighborsClassifier(n_neighbors=3,weights='distance')
# Alternative kept for comparison: every neighbour votes equally.
#knn_model = KNeighborsClassifier(n_neighbors=3,weights='uniform')
knn_model.fit(X_train,y_train)
# Mean accuracy on the validation split.
knn_model.score(X_val,y_val)

0.8156424581005587

## Decision Tree
Splitting the data space into many areas based on conditions of the data. 
Example: 
- Is age over 50? 
- Is the passenger in First Class?

![Decision Tree in Action](https://miro.medium.com/max/1200/1*Bg0_UGT6xo89Ij-GX61yPg.png)

In [115]:
from sklearn.tree import DecisionTreeClassifier

# Tree limited to depth 5, scored with Gini impurity. splitter='random' picks the
# best of randomly drawn splits, and random_state=3 makes that choice repeatable.
dt=DecisionTreeClassifier(criterion='gini',splitter='random',max_depth=5,random_state=3)
dt.fit(X_train,y_train)
# Mean accuracy on the validation split.
dt.score(X_val,y_val)

0.8156424581005587

## Random Forest
Many Decision Trees are combined with many types of randomness. In this case it is choosing a random subset of features in each split.
![RF](https://static.javatpoint.com/tutorial/machine-learning/images/random-forest-algorithm2.png)

In [116]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 150 trees, each limited to depth 6; random_state fixes the randomness
# so the result is repeatable.
rfModel = RandomForestClassifier(n_estimators=150, max_depth=6, random_state=17)
rfModel.fit(X_train,y_train)
# Mean accuracy on the validation split.
rfModel.score(X_val,y_val)

0.8379888268156425

In [117]:
# Predict survival for the cleaned test passengers with the random forest.
predictions=rfModel.predict(cleaned_test)
# Pair each prediction with the passenger id from the raw test frame:
# cleaned_test has no PassengerId column because clean_data drops it.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
# Write the submission file without the DataFrame index.
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
