In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


First, we load the training data that we will use to fit our model.

In [2]:
# Read the labelled training passengers from the competition's read-only input
# folder and preview the first five rows.
train_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Second, we load the test data, which contains the list of passengers whose survival
our model will try to predict.

From the count row below we can see that the Age column has missing values (only 332 of the 418 passengers have an Age), and that one Fare value is missing as well.

In [3]:
# Read the unlabelled test passengers and summarise the numeric columns; a count
# lower than the number of rows marks a column that has missing values.
test_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


The first preprocessing step is handling the missing values in the Age column.
Instead of imputing a statistic such as the mean or median, we simply replace each NaN with the sentinel value "-1".
An impossible age such as "-1" (or "0") lets our model notice that there is something different about these rows.
We do this for both the train and the test data sets.


In [4]:
# Replace every missing Age with the sentinel -1 in both frames (modifies them in place),
# then preview the training frame.
for frame in (train_data, test_data):
    frame["Age"] = frame["Age"].fillna(-1)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Another feature that will be helpful is the Sex feature.
The problem is that our model only understands numeric values.

To address this issue we should convert the text into a corresponding numeric value; this is what LabelEncoder is for.
The "Sex" column holds categorical values (male and female), and LabelEncoder assigns a unique number to each unique value.

From the output we can see that each male is represented as "1" and each female as "0".

In [5]:
from sklearn.preprocessing import LabelEncoder


# Work on copies so the frames loaded earlier keep their original text "Sex" column.
labeled_train_data = train_data.copy()
labeled_test_data = test_data.copy()

# Learn the category -> integer mapping once, from the training data only, and
# reuse that same mapping for the test data. Re-fitting on the test set could
# assign different codes if its set of categories differed from the training set's.
label_encoder = LabelEncoder()
# The encoded arrays are assigned directly (row by row, by position) rather than
# wrapped in a DataFrame, so the result does not depend on index alignment.
labeled_train_data["Sex"] = label_encoder.fit_transform(train_data["Sex"])
# transform() raises ValueError on a label that was not seen during fit.
labeled_test_data["Sex"] = label_encoder.transform(test_data["Sex"])
labeled_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


Now it's time to find the model that gives the most accurate result.
We will conduct our tests on the train data only.

Firstly, we need to drop all columns that hold non-numeric values.

Secondly, we specify our desired output, which in our case is the survival status of the passengers.

Thirdly, we specify the features that we wish to train our model with, in other words, the basis on which our model will
make its predictions by identifying common patterns in these features.

Finally, we need to split the data so that the data the model is trained on is not the same as the data it is evaluated on,
thus separating the data into training data and validation data.


In [29]:
from sklearn.model_selection import train_test_split

# Keep only the non-text columns, then separate the target from the chosen features.
train_data_raw = labeled_train_data.select_dtypes(exclude="object")
y = train_data_raw["Survived"]
columns = ["Pclass", "Age", "Parch", "Sex"]
x = train_data_raw.loc[:, columns]

# Split into training and validation parts; the fixed seed makes the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(x, y, random_state=0)


We will use the precision_score metric to evaluate our model: of the passengers the model predicts as survivors, it measures the fraction who actually survived.

So I created a function which returns the precision (a value between 0 and 1) of the selected model, and the model which gives the highest precision will be our chosen model.

In [36]:
from sklearn.metrics import precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


def get_precision(maxnum, xtrain, xvalid, ytrain, yvalid, model=None):
    """Fit a classifier on the training split and return its validation precision.

    Parameters
    ----------
    maxnum : int
        Used as both ``max_leaf_nodes`` and ``max_depth`` of the default
        ``DecisionTreeClassifier``. Ignored when ``model`` is supplied.
    xtrain, ytrain
        Features and target the model is fitted on.
    xvalid, yvalid
        Features and target the fitted model is scored on.
    model : estimator, optional
        An unfitted estimator providing ``fit`` and ``predict``. Pass one to
        compare other algorithms without editing this function, e.g.
        ``RandomForestClassifier(n_estimators=maxnum, max_leaf_nodes=maxnum, random_state=1)``
        or ``LogisticRegression()``. The estimator is fitted in place.

    Returns
    -------
    The value of ``precision_score(yvalid, predictions)``.
    """
    if model is None:
        # Default candidate: a decision tree whose size is capped by maxnum.
        model = DecisionTreeClassifier(max_leaf_nodes=maxnum, max_depth=maxnum, random_state=1)
    model.fit(xtrain, ytrain)
    predictions = model.predict(xvalid)
    return precision_score(yvalid, predictions)

In [37]:
# Try tree-size caps from 50 to 950 in steps of 50 and report the precision for each.
for maxnum in range(50, 1000, 50):
    precision = get_precision(maxnum, xtrain, xvalid, ytrain, yvalid)
    print(f' Maxnum= {maxnum}   Precision Percentage: {precision}')

 Maxnum= 50   Precision Percentage: 0.7922077922077922
 Maxnum= 100   Precision Percentage: 0.8
 Maxnum= 150   Precision Percentage: 0.7567567567567568
 Maxnum= 200   Precision Percentage: 0.7567567567567568
 Maxnum= 250   Precision Percentage: 0.7567567567567568
 Maxnum= 300   Precision Percentage: 0.7567567567567568
 Maxnum= 350   Precision Percentage: 0.7567567567567568
 Maxnum= 400   Precision Percentage: 0.7567567567567568
 Maxnum= 450   Precision Percentage: 0.7567567567567568
 Maxnum= 500   Precision Percentage: 0.7567567567567568
 Maxnum= 550   Precision Percentage: 0.7567567567567568
 Maxnum= 600   Precision Percentage: 0.7567567567567568
 Maxnum= 650   Precision Percentage: 0.7567567567567568
 Maxnum= 700   Precision Percentage: 0.7567567567567568
 Maxnum= 750   Precision Percentage: 0.7567567567567568
 Maxnum= 800   Precision Percentage: 0.7567567567567568
 Maxnum= 850   Precision Percentage: 0.7567567567567568
 Maxnum= 900   Precision Percentage: 0.7567567567567568
 Maxnum=

Here are the results:

Highest percentage achieved by RandomForestClassifier was: 78.5%

Highest percentage achieved by DecisionTreeClassifier was: 80%

Highest percentage achieved by LogisticRegression was: 72%

So we can see that the best model would be the DecisionTreeClassifier because it has the highest precision percentage


Now for our final step: we train the chosen model on all of the training data, predict the survival of the passengers in the test data, save the predictions, and submit them to the competition.

In [38]:

# Retrain on every labelled row (no validation hold-out this time).
y = labeled_train_data.Survived

features = ["Pclass", "Age", "Parch", "Sex"]
X = labeled_train_data[features]
X_test = labeled_test_data[features]

# A cap of 100 is the setting that scored the highest precision in the sweep.
model = DecisionTreeClassifier(max_leaf_nodes=100, max_depth=100, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# One row per test passenger: its id and the predicted survival label.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('My_Model.csv', index=False)
# Preview last: a notebook cell only displays the value of its final expression,
# so a bare head() call in the middle of the cell would show nothing.
output.head(20)