# Titanic passenger survival data set with decision tree
The source code is a simple implementation for titanic passenger survival data set with the python programming language and decision tree. Also, some primary preprocessing steps is considered as well. 

### Note
This code was written as my Machine Learning course's assignment in the fall of 2021, so it may have some bugs to fix or be coded more optimally.

### GitHub
https://github.com/MohsenEbadpour

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from math import sqrt
from sklearn.model_selection import train_test_split
import seaborn as sn
import math
from sklearn.impute import SimpleImputer
from sklearn import tree
from sklearn.preprocessing import LabelEncoder

train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

train_data.sample(frac=1).reset_index(drop=True) 
test_data.sample(frac=1).reset_index(drop=True) 

columns_train = ["PassengerId","Survived","Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
columns_test = ["PassengerId","Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]

basic_feature_names = [ "PassengerId","Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]

print("List of Missed Values in Train data",train_data.isna().sum(),sep='\n')   


List of Missed Values in Train data
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [2]:
print("List of Missed Values in Test data",test_data.isna().sum(),sep='\n')   


List of Missed Values in Test data
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [3]:
train_data["Age"] = train_data["Age"].astype(float)
train_data["Age"] = train_data["Age"].fillna(train_data["Age"].mean())

imp_train = SimpleImputer(strategy="most_frequent")
train_data = pd.DataFrame(data=imp_train.fit_transform(train_data),columns=columns_train)

print("List of Missed Values in Train data",train_data.isna().sum(),sep='\n')  

List of Missed Values in Train data
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [4]:
test_data["Age"] = test_data["Age"].astype(float)
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].mean())

imp_test = SimpleImputer(strategy="most_frequent")
test_data = pd.DataFrame(data=imp_test.fit_transform(test_data),columns=columns_test)

print("List of Missed Values in Test data",test_data.isna().sum(),sep='\n') 

List of Missed Values in Test data
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [5]:
y_train = list(train_data["Survived"])
train_data.drop("Survived", inplace=True, axis=1) # حذف برچسب 
x_train = train_data
x_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,B96 B98,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.45,B96 B98,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C


In [6]:
encoder = LabelEncoder()
x_train['Sex'] = encoder.fit_transform(x_train['Sex'])
x_train['Embarked'] = encoder.fit_transform(x_train['Embarked'])

test_data['Sex'] = encoder.fit_transform(test_data['Sex'])
test_data['Embarked'] = encoder.fit_transform(test_data['Embarked'])

x_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,B96 B98,2
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,2
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,B96 B98,2
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0,B96 B98,2
887,888,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0,B42,2
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,29.699118,1,2,W./C. 6607,23.45,B96 B98,2
889,890,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0,C148,0


In [7]:
classifier = tree.DecisionTreeClassifier()
classifier = classifier.fit(x_train[basic_feature_names], y_train)

In [8]:
test_data["Survived"] = classifier.predict(test_data[basic_feature_names])
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,B57 B59 B63 B66,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,B57 B59 B63 B66,2,0
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,B57 B59 B63 B66,1,0
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,B57 B59 B63 B66,2,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,B57 B59 B63 B66,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",1,30.27259,0,0,A.5. 3236,8.05,B57 B59 B63 B66,2,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9,C105,0,1
415,1307,3,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.25,B57 B59 B63 B66,2,0
416,1308,3,"Ware, Mr. Frederick",1,30.27259,0,0,359309,8.05,B57 B59 B63 B66,2,0


In [9]:
test_data.to_csv("Predicted-Titanic.csv",index=False)