In [None]:
# Import Key Libraries
import pandas as pd #dataframe manipulation
import numpy as np #numerical analysis
import matplotlib.pyplot as plt 
import seaborn as sns


# Optional cosmetic styling: the monokai theme keeps the x/y axis labels and
# ticks readable on a dark notebook theme (otherwise they render black on black).
# jupyterthemes only affects appearance, so a missing install must not stop
# the rest of the notebook from running.
try:
    from jupyterthemes import jtplot
except ImportError:
    jtplot = None  # styling skipped; plots fall back to the default matplotlib look
else:
    jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)


In [None]:
# Load the Titanic passenger table from the CSV file into a pandas DataFrame
titanic_df = pd.read_csv(filepath_or_buffer='titanic.csv')

In [None]:
# Preview the first five rows of the table
titanic_df.head(n=5)


In [None]:
# Split the passengers into survivors (Survived == 1) and non-survivors (Survived == 0)
survived_df = titanic_df.loc[titanic_df['Survived'].eq(1)]
no_survived_df = titanic_df.loc[titanic_df['Survived'].eq(0)]


In [None]:
# Display the survivors subset as this cell's output
survived_df

In [None]:
# Report how many passengers survived and how many did not, with percentages
total_count = len(titanic_df)
survived_count = len(survived_df)
deceased_count = len(no_survived_df)

print("Total =", total_count)

print("Number of passengers who survived =", survived_count)
print("Percentage Survived =", float(survived_count) / total_count * 100.0, "%")

print("Number of passengers who did not Survive =", deceased_count)
print("Percentage who did not survive =", float(deceased_count) / total_count * 100.0, "%")
         

In [None]:
# Passenger counts per ticket class (top) and the same counts split by survival (bottom).
# First-class passengers have a higher chance of survival.
plt.figure(figsize=(15, 10))

plt.subplot(2, 1, 1)
sns.countplot(data=titanic_df, x='Pclass')

plt.subplot(2, 1, 2)
sns.countplot(data=titanic_df, x='Pclass', hue='Survived')

In [None]:
# Passenger counts per SibSp value (top) and the same counts split by survival (bottom)
plt.figure(figsize=(15, 10))

plt.subplot(2, 1, 1)
sns.countplot(data=titanic_df, x='SibSp')

plt.subplot(2, 1, 2)
sns.countplot(data=titanic_df, x='SibSp', hue='Survived')

In [None]:
# Passenger counts per Parch value (top) and the same counts split by survival (bottom).
# Passengers with a small family onboard (Parch = 1 or 2) appear to have a higher
# chance of survival than those travelling without one (Parch = 0).
plt.figure(figsize=(15, 10))

plt.subplot(2, 1, 1)
sns.countplot(data=titanic_df, x='Parch')

plt.subplot(2, 1, 2)
sns.countplot(data=titanic_df, x='Parch', hue='Survived')

In [None]:
# Passenger counts per sex (top) and the same counts split by survival (bottom).
# Female passengers have a higher chance of survival than male passengers.
plt.figure(figsize=(15, 10))

plt.subplot(2, 1, 1)
sns.countplot(data=titanic_df, x='Sex')

plt.subplot(2, 1, 2)
sns.countplot(data=titanic_df, x='Sex', hue='Survived')

In [None]:
 # Age Histogram 
plt.figure(figsize=(11, 6))
# 40-bin histogram of the Age column
age_values = titanic_df['Age']
age_values.hist(bins=40)

In [None]:
# Visualise which values are missing: blue cells in the heatmap mark missing entries
missing_mask = titanic_df.isna()
sns.heatmap(missing_mask, cmap="Blues", cbar=False, yticklabels=False)

In [None]:
# Drop the Cabin column because it has too many missing values.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run once the column is already gone.
titanic_df = titanic_df.drop(columns=['Cabin'], errors='ignore')

In [None]:
# Drop the Name, Ticket, Embarked and PassengerId columns as well.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run once the columns are already gone.
titanic_df = titanic_df.drop(
    columns=['Name', 'Ticket', 'Embarked', 'PassengerId'],
    errors='ignore',
)

In [None]:
# Display the current state of the passenger table as this cell's output
titanic_df

In [None]:
# Re-check the missing values in the current table (blue cells mark missing entries)
missing_mask = titanic_df.isna()
sns.heatmap(missing_mask, cmap="Blues", cbar=False, yticklabels=False)


In [None]:
# Box plot of Age by Sex, used to read off a typical age for males (~29) and
# females (~25). Note that the centre line of each box is the median, not the mean.
plt.figure(figsize=[15, 10])
sns.boxplot(data=titanic_df, x='Sex', y='Age')

In [None]:
def Fill_Age(data):
    """Return the passenger's age, imputing a sex-specific default when it is missing.

    Parameters
    ----------
    data : sequence or pandas.Series
        Row whose first element is the age and whose second element is the
        sex label (the shape produced by applying this function row-wise to
        the ['Age', 'Sex'] columns).

    Returns
    -------
    The original age when it is present; otherwise 29 when the sex label
    equals 'male' and 25 for any other label.
    """
    # Unpack by position. Iterating a Series yields its values, so this avoids
    # integer-key lookups on a label-indexed row (deprecated in pandas 2.x).
    age, sex, *_ = data

    if pd.isnull(age):
        # Compare by value: `is` tests object identity, which is not guaranteed
        # for equal strings and made the 'male' branch unreliable.
        return 29 if sex == 'male' else 25
    return age
        

In [None]:
# Fill the missing ages row by row from each passenger's (Age, Sex) pair
titanic_df['Age'] = titanic_df.loc[:, ['Age', 'Sex']].apply(Fill_Age, axis='columns')

In [None]:
# Re-check the missing values in the current table (blue cells mark missing entries)
missing_mask = titanic_df.isna()
sns.heatmap(missing_mask, cmap="Blues", cbar=False, yticklabels=False)


In [None]:
# Preview the one-hot encoding of Sex; a single column is enough to tell male from female
pd.get_dummies(data=titanic_df['Sex'])

In [None]:
# Keep a single indicator column by dropping the first category level
male = pd.get_dummies(data=titanic_df['Sex'], drop_first=True)


In [None]:
# Drop the original text Sex column now that it has been encoded.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run once the column is already gone.
titanic_df = titanic_df.drop(columns=['Sex'], errors='ignore')
titanic_df

In [None]:
# Append the encoded `male` column(s) to the table.
# Any copy left behind by a previous run of this cell is removed first, so
# re-running does not add a duplicate column that would become an extra feature.
titanic_df = pd.concat(
    [titanic_df.drop(columns=male.columns, errors='ignore'), male],
    axis = 1,
)
titanic_df

In [None]:
# Separate the feature matrix from the target column ahead of the train/test split
feature_frame = titanic_df.drop(columns='Survived')
X = feature_frame.values
y = titanic_df['Survived'].values


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the training split
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test split and display them
y_predict_test = classifier.predict(X=X_test)
y_predict_test

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted test labels, drawn as an annotated heatmap of integer counts
cm = confusion_matrix(y_true=y_test, y_pred=y_predict_test)
sns.heatmap(cm, fmt="d", annot=True)

In [None]:
from sklearn.metrics import classification_report

# Print the text classification report for the test-split predictions
print(classification_report(y_true=y_test, y_pred=y_predict_test))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian Naive Bayes classifier and train it on the training split
# (fit returns the estimator itself, so construction and training can be chained)
model = GaussianNB().fit(X_train, y_train)

# Class predictions for the held-out test split
predicted = model.predict(X_test)



In [None]:
# Evaluate the Gaussian Naive Bayes model on the held-out test split.
cm_naive = confusion_matrix(y_test, predicted)
# Plot the Naive Bayes matrix itself (not the logistic-regression `cm`);
# fmt="d" shows the cells as integer counts, matching the other heatmaps.
sns.heatmap(cm_naive, annot = True, fmt = "d")

# Report on the Naive Bayes predictions, not the logistic-regression `y_predict_test`.
print(classification_report(y_test, predicted))

In [None]:
# Approach 2: Multinomial Naive Bayes trained on the same training split.
# NOTE: this rebinds `classifier`, replacing whatever model the name held before.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict with the current `classifier` on the held-out test split
y_predict_test = classifier.predict(X_test)

# Annotated confusion-matrix heatmap showing integer counts
cm = confusion_matrix(y_true=y_test, y_pred=y_predict_test)
sns.heatmap(cm, fmt="d", annot=True)

print(classification_report(y_true=y_test, y_pred=y_predict_test))