In [59]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import metrics
import seaborn as sns

In [3]:
data = pd.read_csv("titanic.csv")

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
#once again, we already preprocessed this file in a previous checkpoint, so we're gonna copy the code from there.

In [6]:
#let's now start the preprocessing phase.
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
#the "cabin" column has a lot of missing data.
#we're going to drop this column since it's not important
data.drop('Cabin', inplace=True, axis=1)

In [8]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [9]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [10]:
#the "age" column also has a lot of missing data but we can't drop it since it's important and cannot be treated.
#so we're going to fill in the missing data with the mean/average.
data["Age"].fillna(data["Age"].mean(), inplace=True)

In [11]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [12]:
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [13]:
#our data is now much cleaner and we only have two rows with missing data so we're simply going to delete them.
data.dropna(axis=0, how='any', thresh=None, inplace=True)

In [14]:
data.isnull().sum().sum()
#we can now be sure that our data is clean and is ready to be treated.
#the preprocessing phase is over.

0

In [75]:
#-----------New Stuff Starts Here------------#
#let's convert the male and female to 1s and 0s:
data.replace({'Sex':{'male': 1,'female':0}},inplace=True)

In [76]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,S


In [77]:
#again, we already established the most important features in previous checkpoints; which are age and sex.
from sklearn import tree   
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [78]:
x = data[["Age","Sex"]]
y = data["Survived"]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state = 10)
dtree = tree.DecisionTreeClassifier(criterion = 'gini', splitter = 'random', max_leaf_nodes=10, min_samples_leaf=5, max_depth=5)  
dtree.fit(x_train, y_train)   
y_pred=dtree.predict(x_test)
print("score:{}".format(accuracy_score(y_test, y_pred)))

score:0.8314606741573034


In [79]:
#we can see that we get pretty good results.
#let's now plot our tree to see what's going on.
import graphviz

In [80]:
dot_data = tree.export_graphviz(dtree,out_file=None,feature_names=None,class_names=data["Survived"],filled=True)                         
graph = graphviz.Source(dot_data, format="png") 
graph

TypeError: can only concatenate str (not "numpy.int64") to str

In [81]:
#let's now try some different parameters.
#we'll change the splitter to "best", max lf nodes to None, min samples lf to 1 and max depth to None:!
x = data[["Age","Sex"]]
y = data["Survived"]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state = 10)
dtree = tree.DecisionTreeClassifier(criterion = 'gini', splitter = 'best', max_leaf_nodes=None, min_samples_leaf=1, max_depth=None)  
dtree.fit(x_train, y_train)   
y_pred=dtree.predict(x_test)
print("score:{}".format(accuracy_score(y_test, y_pred)))

score:0.7696629213483146


In [83]:
#even though I expected better results, I got worse ones whcih are probably due to overfitting.

In [84]:
#let's now use a random forest.
from sklearn.ensemble import RandomForestClassifier 

In [85]:
x = data[["Age","Sex"]]
y = data["Survived"]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state = 10)
def Forest(trees):
    clf=RandomForestClassifier(n_estimators=trees) 
    clf.fit(x_train, y_train)
    y_pred=clf.predict(x_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [86]:
Forest(5)

Accuracy: 0.8033707865168539


In [87]:
Forest(10)

Accuracy: 0.7921348314606742


In [101]:
Forest(10000)
#that took a while.

Accuracy: 0.7921348314606742


In [1]:
#we can clearly see that increasing the number of estimators does not mean more accuracy.
#and the best accuracy we got was with 5 estimators.
#Also we can clearly see that the variety the Random Forest does not simply mean more accuracy. As we got more accuracy with
#a simple decision tree. But, the decision tree model probably suffered from overfitting.

In [None]:
#this project is not yet complete and will get updated (i'm still working on the error with the visualization of the tree.)