# Titanic - Machine Learning from Disaster

## 1. Load datasets

In [1]:
# Import modules
import os
import re
import zipfile

import kaggle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Download the Titanic competition archive into the local '.kaggle' directory
# (the `path` argument is the download destination).
# NOTE(review): no API key is passed in this cell — presumably the kaggle client
# picks up credentials configured outside the notebook; confirm before sharing.
competition_name = "titanic"
kaggle.api.competition_download_files(competition_name, path='.kaggle')

In [3]:
# Unpack the downloaded archive into its own folder (created if it does not exist yet)
zip_file_path = '.kaggle/titanic.zip'
extracted_folder_path = '.kaggle/titanic_data'
os.makedirs(extracted_folder_path, exist_ok=True)
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extracted_folder_path)
print("Files extracted to:", extracted_folder_path)

In [4]:
from os import listdir
print(listdir(extracted_folder_path))

In [5]:
# Read the train and test splits from the extracted folder
train_df = pd.read_csv(f"{extracted_folder_path}/train.csv")
test_df = pd.read_csv(f"{extracted_folder_path}/test.csv")

In [6]:
# Viewing attributes
train_df.columns
# `Survived` is the target label: it records whether each passenger survived or died, and is what the model will learn to predict

In [7]:
test_df.columns

In [8]:
train_df.head() 

In [9]:
test_df.head()

In [10]:
# Index the rows by PassengerId (the column itself is kept here and dropped in a later cell).
# The guard keeps this cell safe to re-run once the PassengerId column is gone;
# without it the column lookup would raise.
if 'PassengerId' in train_df.columns:
    train_df.set_index(train_df['PassengerId'], inplace=True)
train_df.head()

In [11]:
# Drop the PassengerId column. axis=1 targets columns (axis=0 would target rows);
# inplace=True modifies train_df directly; errors='ignore' makes the cell safe to
# re-run after the column has already been removed.
train_df.drop('PassengerId', axis=1, inplace=True, errors='ignore')

In [12]:
train_df.head()

In [13]:
# Move PassengerId from a regular column into the row index.
# set_index with a column name removes that column by default, which matches the
# original set_index + drop pair. The guard makes the cell safe to re-run.
if 'PassengerId' in test_df.columns:
    test_df = test_df.set_index('PassengerId')
test_df.head()

## 2. Features classification

In [14]:
train_df.info()

In [15]:
test_df.info()

In [16]:
features= ['Pclass','Sex','SibSp','Parch','Embarked']
def convert_2category(df, features):
    """Cast the listed columns of ``df`` to the pandas ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; it is modified in place.
    features : list
        Names of the columns to convert.

    Returns
    -------
    None
    """
    for column_name in features:
        as_category = df[column_name].astype('category')
        df[column_name] = as_category
convert_2category(train_df,features)
convert_2category(test_df,features)

In [17]:
train_df.info()

### 2.1. Distribution of Numerical features

In [18]:
train_df.describe() # describe numerical features

### 2.2. Distribution of Categorical features

In [19]:
train_df.describe(include=['category'])

## 3. Exploratory Data Analysis (EDA)

### Correlating categorical features

Categorical: `Survived`, `Sex`, `Embarked`, `Pclass`(ordinal), `SibSp`, `Parch`

## `Survived`

In [20]:
train_df['Survived'].value_counts().to_frame()

In [21]:
train_df['Survived'].value_counts(normalize=True).to_frame()

Only about 38% of the passengers in the training set survived. The classes are therefore imbalanced, but not severely, so I will not apply resampling techniques to correct for it.

In [22]:
# Draw one count plot per categorical feature, split by survival outcome
cats = ['Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch']
rows, cols, expand = 2, 3, 3.5
fig, ax = plt.subplots(rows, cols, figsize=(cols * expand, rows * expand))
axes = ax.ravel()  # flat view of the grid so panels can be paired with features

# zip stops at the shorter sequence, so at most rows*cols features are plotted
for i, (ax_i, cat) in enumerate(zip(axes, cats)):
    sns.countplot(data=train_df, x=cat, hue='Survived', palette='Greens', ax=ax_i)
    ax_i.set_title(f"Figure {i+1}: Survival Rate vs {cat}")
    ax_i.legend(title='', loc='upper right', labels=['Not Survived', 'Survived'])

# Hide every panel that did not receive a plot (not just the last one), so the
# grid stays correct if the feature list or grid size changes
for unused_ax in axes[len(cats):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

## Observation
Survival by feature:
- Fig 1: Females had a higher survival rate than males.
- Fig 2: Most passengers embarked at Southampton, which also accounts for the largest number of passengers who did not survive.
- Fig 3: 1st-class passengers had a higher survival rate.
- Fig 4: Most passengers travelling with 0 `SibSp` did not survive. Passengers with 1-2 `SibSp` had a better chance of survival.
- Fig 5: Most passengers travelling with 0 `Parch` did not survive.

## 3.2. EDA for Numerical Features
- Numerical Features: `Age`, `Fare`

In [23]:
sns.histplot(data=train_df, x='Age', hue='Survived', bins= 60, kde= True);

## Fare

In [24]:
train_df["Fare"].describe()

In [25]:
sns.histplot(data=train_df, x='Fare', hue='Survived', bins=60, palette="Blues");

In [26]:
# Bin Fare into four quantile groups (0-25%, 25-50%, 50-75%, 75-100%) and give each a readable label
fare_categories = ['Economic', 'Standard', 'Expensive', 'Luxury']
quartile_data = pd.qcut(train_df['Fare'], q=4, labels=fare_categories)
sns.countplot(x=quartile_data, hue=train_df['Survived'], palette="Greens")

In [27]:
train_df['Fare']

Distribution of Fare
- Fare does not follow a normal distribution; most values are concentrated in the 0-100 dollar range.
- The distribution is skewed to the right (long right tail): `75%` of the fares paid are under `31 dollars`, while the maximum fare paid is `512 dollars`.

Quartile plot
- Passengers with Luxury and Expensive fares have a better chance of survival.

# Feature Engineering & Data Wrangling

## `Name`
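- Extract each passenger's title (e.g. `Mr`, `Mrs`) from `Name` using a regular expression (a tester such as regex101.com is handy for developing the pattern)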

In [28]:
train_df['Name'].tail(10)

In [29]:
# Captures the text between the first comma and the next period that is made up
# only of word characters and whitespace, e.g. "Braund, Mr. Owen Harris" -> " Mr".
# Compiled once here instead of on every call.
_TITLE_PATTERN = re.compile(r",([\w\s]+)\.")


def extract_title(name):
    """Return the title embedded in a passenger name.

    Parameters
    ----------
    name : str
        A name containing ", <title>." — for example "Braund, Mr. Owen Harris".

    Returns
    -------
    str
        The captured title with surrounding whitespace removed (e.g. "Mr").

    Raises
    ------
    ValueError
        If the name does not contain a ", <title>." segment.
    """
    match = _TITLE_PATTERN.search(name)
    if match is None:
        # Fail with a readable message rather than an AttributeError on None
        raise ValueError(f"No title found in name: {name!r}")
    return match.group(1).strip()
# Derive the title column for the training set from each passenger's name
train_df['title'] = train_df['Name'].apply(extract_title)

In [30]:
train_df['title'].value_counts()

In [31]:
# Derive the title column for the test set from each passenger's name
test_df['title'] = test_df['Name'].apply(extract_title)

In [32]:
test_df['title'].value_counts()

In [33]:
sns.countplot(data=train_df, x= 'title',hue='Survived');

In [34]:
# Preview only (the result is not assigned back): keep the common titles and show
# every other title as 'Other'. 'Miss' is listed here; the original list had 'Ms',
# which made every 'Miss' passenger appear as 'Other'.
train_df['title'].apply(lambda title: title if title in ['Mr','Mrs','Miss',"Master"] else 'Other')

In [35]:
train_df['title'].value_counts()

In [36]:
def group_title(title):
    """Collapse a raw title into one of the groups used as a feature.

    Parameters
    ----------
    title : str
        Raw title, e.g. "Mr", "Miss", "Ms", "Dr".

    Returns
    -------
    str
        "Mr", "Mrs", "Miss" or "Master" for those titles; "Miss" for "Ms";
        "Others" for anything else.
    """
    # Handle the alias first: previously 'Ms' sat in the keep-list, so this
    # mapping was unreachable and every 'Miss' fell through to 'Others'.
    if title == 'Ms':
        return 'Miss'
    if title in ('Mr', 'Mrs', 'Miss', 'Master'):
        return title
    return 'Others'
# Replace the raw titles with their grouped form in both data sets
train_df['title'] = train_df['title'].apply(group_title)
test_df['title'] = test_df['title'].apply(group_title)

In [37]:
train_df['title'].value_counts()

In [38]:
test_df['title'].value_counts()