## The Titanic Project

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Folder that holds train.csv and test.csv. It is a relative path, so it is
# resolved against the kernel's current working directory.
data_dir = "Titanic Data"

In [None]:
# Read the training split from the data folder and preview its first rows.
train_csv_path = os.path.join(data_dir, "train.csv")
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Read the test split from the data folder and preview its first rows.
test_csv_path = os.path.join(data_dir, "test.csv")
test_df = pd.read_csv(test_csv_path)
test_df.head()

### Primary data analysis


In [None]:
# Report the (rows, columns) shape of each split.
train_shape, test_shape = train_df.shape, test_df.shape
print("train dataset shape: {}".format(train_shape))
print("test dataset shape : {}".format(test_shape))

<br>The gender submission file contains only two columns.<br>
On closer inspection, it contains the <strong>survival status of the passengers</strong>, i.e. survived or passed away.<br>
It is also a subset of the <strong>test dataset</strong>.<br>
<br>
This implies that we will use it to evaluate our model's accuracy.


In [None]:
# Summary statistics for both splits: object-dtype columns first, then the
# default describe() view.
object_only = ['O']  # dtype selector for object columns

print("Training dataset description information")
display(train_df.describe(include=object_only), train_df.describe())
print("*" * 100)
print("Testing dataset decribe information")
display(test_df.describe(include=object_only), test_df.describe())

<h5>Quick Insights</h5>

We can already tell that there are <strong>missing values</strong> in both datasets.<br>
However, let's look at their counts in a more explicit way.

In [None]:
# Per-column count of missing entries in the train and test datasets.
train_missing = train_df.isna().sum()
test_missing = test_df.isna().sum()

print("Training data missing values")
display(train_missing)
print("*" * 100)
print("Testing data missing values")
display(test_missing)

##### Missing Values insights
In the <strong>train data</strong> columns with missing values are:
<ul>
    <li> Age : 177 values</li>
    <li> Cabin : 678 Values</li>
    <li> Embarked : 2 Values</li>
   </ul>
In the <strong>test data</strong> columns with missing values are:
<ul>
    <li> Age : 86 values</li>
    <li> Cabin : 327 Values</li>
</ul>

In [None]:
# Checking for any duplicate rows
# DataFrame.duplicated() flags every row that fully repeats an earlier row, so
# summing the boolean mask gives the number of redundant rows in each split.
print(f"Train data duplicate rows: {train_df.duplicated().sum()}")
print(f"Test data duplicate rows : {test_df.duplicated().sum()}")


The <strong>Cabin column</strong> has a lot of missing values, which means it cannot be an influential feature in training the model.<br>
Therefore the best option is to drop it.

In [None]:
# Dropping the Cabin column
# errors="ignore" keeps this cell idempotent: running it again after Cabin has
# already been removed is a no-op instead of raising KeyError.
train_df = train_df.drop(columns=["Cabin"], errors="ignore")
test_df = test_df.drop(columns=["Cabin"], errors="ignore")

### Exploratory Data Analysis (EDA)
EDA means carrying out an in-depth analysis of the data in order to discover hidden patterns, spot data anomalies, and test hypotheses/assumptions.<br>
We will use statistical methods and visual (graphical) tools.

We will first sort out the missing values

In [None]:
# Count rows with (True) and without (False) a missing Age in each split.
train_age_missing = train_df["Age"].isna().value_counts()
test_age_missing = test_df["Age"].isna().value_counts()

print('Train data: Missing age Values')
display(train_age_missing)

print('-' * 100)
print('Test data: Missing age Values')
display(test_age_missing)


There are so many values of age that are missing.<br>
<ol>
<li> Train data: 177</li>
<li> Test Data: 86</li>
</ol>
Since age is a critical factor in the prediction, we cannot drop the column.<br>
As a result we need to take care of the missing values in either of the following ways:
<ul>
<li> Use mean age values</li>
<li> Try to allocate values randomly around the mean i.e choose random values between 25 and 75 percentile</li>
</ul>
<br>

In [None]:
# Descriptive statistics of the Age column, train first and then test.
age_summaries = {
    'Train data: Age description': train_df["Age"].describe(),
    'Test data: Age description': test_df["Age"].describe(),
}

for position, (heading, summary) in enumerate(age_summaries.items()):
    if position:
        # separator line between the two reports
        print('-' * 100)
    print(heading)
    display(summary)

We need to <strong>drop</strong> some columns, as we do not need them.
<ul>
    <li>For the <strong>categorical</strong> data we need to drop the following columns</li>
    <ol>
        <li>Name</li>
        <li>Ticket</li>
    </ol>
    <li>For the <strong>numerical</strong> data we need to drop the following columns</li>
    <ol>
        <li>PassengerId</li>
        <li>Survived</li>
    </ol>
</ul> 

In [None]:
# Keep only the columns of interest, grouped by how they are plotted below:
# count plots for the categorical group, box plots for the numerical group.
categorical_columns = ["Sex", "Pclass", "Embarked", "Parch", "SibSp"]
numerical_columns = ["Age", "Fare", "Parch", "SibSp"]

cat_df = train_df[categorical_columns]
num_df = train_df[numerical_columns]

In [None]:
# plot numerical data: one box plot per numeric column, grouped by Survived

fig = plt.figure(figsize=(16, 8))

for position, column in enumerate(num_df.columns, start=1):
    # 2x2 grid; the subplot just added becomes the current axes seaborn draws on
    sub = fig.add_subplot(2, 2, position)
    chart = sns.boxplot(data=train_df, x='Survived', y=column, palette='PuBuGn')

In [None]:
# plot categorical data: one count plot per categorical column, coloured by Survived

fig = plt.figure(figsize=(16, 8))

for position, column in enumerate(cat_df.columns, start=1):
    # 2x3 grid; the subplot just added becomes the current axes seaborn draws on
    sub = fig.add_subplot(2, 3, position)
    chart = sns.countplot(data=train_df, y=column, hue='Survived', palette='PuBuGn')

In [None]:
# gender difference on board the titanic
total_passengers = len(train_df)
is_male = train_df.Sex == 'male'
is_female = train_df.Sex == 'female'

males_on_board = train_df[is_male]
percentage_males = (len(males_on_board) / total_passengers) * 100
print(f"Number of males on board = {len(males_on_board)} which is {round(percentage_males, 2)}%")

females_on_board = train_df[is_female]
percentage_females = (len(females_on_board) / total_passengers) * 100
print(f"Number of females on board = {len(females_on_board)} which is {round(percentage_females, 2)}%\n\n")

# Survival rate within each sex: survivors of that sex / all passengers of that sex
male_survival = males_on_board[males_on_board.Survived == 1]
male_survival_pct = len(male_survival) / len(males_on_board) * 100
print(f"% of male survival: {male_survival_pct} %")

female_survival = females_on_board[females_on_board.Survived == 1]
female_survival_pct = len(female_survival) / len(females_on_board) * 100
print(f"% of female survival: {female_survival_pct} %\n")


In [None]:
# plot sex distribution on board
# Layout: pie chart (top-left), violin plot (top-right), age histogram (bottom row).

fig = plt.figure(figsize=(10, 8))

# Slice labels come from the index of the counts themselves, so a slice can never
# be paired with the wrong label if the ordering of value_counts() changes.
sex_counts = train_df["Sex"].value_counts()
sex_labels = [str(sex).capitalize() for sex in sex_counts.index]

ax_pie = fig.add_subplot(221)
ax_pie.set_title('Sex Distribution on the titanic')
ax_pie.pie(sex_counts, labels=sex_labels, autopct='%.2f%%')

# Age distribution per sex, split by the Survived value
ax_violin = fig.add_subplot(222)
ax_violin.set_title("Survival distribution of both sex")
sns.violinplot(data=train_df, x="Sex", y="Age", hue="Survived",
               split=True, ax=ax_violin)

# Age histogram (with KDE overlay) coloured by the Survived value
ax_hist = fig.add_subplot(212)
ax_hist.set_title("Survival distribution of diffrent ages")
sns.histplot(data=train_df, x="Age", hue="Survived",
             kde=True, bins=16, stat='count', ax=ax_hist)

#### Survival distribution of both males and females

In [None]:
fig = plt.figure(figsize=(10, 5))

# x and hue are given as column NAMES so they are looked up in the filtered frame.
# Passing train_df["Age"] / train_df["Survived"] instead would bypass the filter:
# seaborn plots vectors exactly as given and uses `data` only to resolve string
# keys, so both panels would show every passenger.
male_df = train_df[train_df["Sex"] == "male"]
female_df = train_df[train_df["Sex"] == "female"]

# Survival rates for males
ax_male = fig.add_subplot(121)
ax_male.set_title("Survival rates for males")
sns.histplot(data=male_df, x="Age", hue="Survived", kde=True,
             bins=16, stat='count', ax=ax_male)

# Survival rates for females
ax_female = fig.add_subplot(122)
ax_female.set_title("Survival rates for females")
sns.histplot(data=female_df, x="Age", hue="Survived", kde=True,
             bins=12, stat='count', ax=ax_female)


## processing text data

In [None]:
# Preview the two text columns before processing them.
text_columns = ['Name', 'Ticket']
display(train_df[text_columns].head())

We can tell that a passenger's title is included in the Name.<br>
We need to extract the titles.<br>
Later, we will need to drop the Ticket column, as there is no meaningful information we can get from it.

### Extract title from name

In [None]:
# We will extract titles from both train and testing
import re  # not used in this cell; kept in case a later cell relies on it

# A title is the first run of letters immediately followed by a period,
# e.g. the pattern captures "Mr" from a string such as "Smith, Mr. John".
title_pattern = r'([A-Za-z]+)\.'

# expand=False returns a Series for the single capture group instead of a
# one-column DataFrame, which is the natural value for a new column.
train_df['Title'] = train_df['Name'].str.extract(title_pattern, expand=False)
test_df['Title'] = test_df['Name'].str.extract(title_pattern, expand=False)

# "\n" (not "/n") so a blank line is printed rather than the literal characters
print("Titles in train data\n")
display(train_df.Title.value_counts())
print('Titles in the test data')
display(test_df.Title.value_counts())
print('*'*100)

# Check missing title values (True = no title could be extracted)
print('Missing Title values in train data')
display(train_df.Title.isnull().value_counts())

print('Missing Title values in test data')
display(test_df.Title.isnull().value_counts())


All passengers had a title and, as shown, there are no missing values in the titles.<br><br>
We want to clean the titles so that we end up with the following categories:
<ul>
<li>Mr.</li>
<li>Mrs.</li>
<li>Miss.</li>
<li>Other.</li>
</ul>


In [None]:
# train data titles
# Collapse the raw titles into four groups. The replacements are applied one
# group at a time, in this order.
title_groups = {
    'Mr': ['Mr', 'Master', 'Capt', 'Sir', 'Don'],
    'Mrs': ['Mrs', 'Countess', 'Lady'],
    'Miss': ['Miss', 'Mlle', 'Ms'],
    'Other': ['Dr', 'Rev', 'Mme', 'Col', 'Dona','Jonkheer', 'Major'],
}

train_titles = train_df['Title']
for canonical_title, raw_titles in title_groups.items():
    train_titles = train_titles.replace(raw_titles, canonical_title)
train_df['Title'] = train_titles

print('New titles for passangers onboard --train data')
display(train_df['Title'].value_counts())

In [None]:
# test data titles
# Collapse the raw titles into four groups. The replacements are applied one
# group at a time, in this order.
title_groups = {
    'Mr': ['Mr', 'Master', 'Capt', 'Sir', 'Don'],
    'Mrs': ['Mrs', 'Countess', 'Lady'],
    'Miss': ['Miss', 'Mlle', 'Ms'],
    'Other': ['Dr', 'Rev', 'Mme', 'Col', 'Dona','Jonkheer', 'Major'],
}

test_titles = test_df['Title']
for canonical_title, raw_titles in title_groups.items():
    test_titles = test_titles.replace(raw_titles, canonical_title)
test_df['Title'] = test_titles

print('New titles for passangers onboard --test data')
display(test_df['Title'].value_counts())

#### Survival rates for titles

In [None]:
# Mean of Survived per title, indexed by Title (reads as a survival rate,
# assuming Survived is a 0/1 flag).
survival_by_title = train_df.groupby('Title')[['Survived']].mean()
display(survival_by_title)

As expected, Mrs and Miss have a high survival rate compared to Mr and Other.

Let's check the difference between the mean and median Age in the train and test data.

In [None]:
# For every category of the selected features, compare the mean and the median Age
# in the train and test data. A large mean - median gap hints at a skewed Age
# distribution inside that category.
summary_columns = ['Feature', 'train_mean', 'test_mean', 'train_median', 'test_median']

feature_list = ['Title', 'Embarked', 'Pclass', 'Sex']

# Collect one record per category and build the frame once, rather than growing
# a DataFrame with pd.concat on every iteration.
records = []
for feature in feature_list:
    # dropna(): NaN never compares equal to anything, so a missing category would
    # only contribute a row made entirely of NaN statistics.
    for category in train_df[feature].dropna().unique():
        train_age = train_df.loc[train_df[feature] == category, 'Age']
        test_age = test_df.loc[test_df[feature] == category, 'Age']
        records.append({
            'Feature': category,
            # means are kept numeric, rounded to one decimal place
            'train_mean': round(float(train_age.mean()), 1),
            'test_mean': round(float(test_age.mean()), 1),
            'train_median': train_age.median(),
            'test_median': test_age.median(),
        })

centralized_df = pd.DataFrame(records, columns=summary_columns)

centralized_df['train_diff'] = centralized_df['train_mean'] - centralized_df['train_median']
centralized_df['test_diff'] = centralized_df['test_mean'] - centralized_df['test_median']

centralized_df