# Titanic Survival Prediction - Kaggle Competition

## Links
- [Titanic Kaggle Competition](https://www.kaggle.com/c/titanic/data)
- [Titanic Interactive Deckplans](https://www.encyclopedia-titanica.org/titanic-deckplans/d-deck.html)
- [Aquarel: An Open-Source Library for Matplotlib styling](https://github.com/lgienapp/aquarel)

### 1. Data Preprocessing

Load the training data from CSV into a pandas DataFrame and preview the first rows.

In [79]:
import pandas as pd

# Read the training CSV into a DataFrame, then show its first five rows
# as the cell's output.
df_train = pd.read_csv(filepath_or_buffer="Data/train.csv")
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [80]:
# Feature engineering on the training frame.
#
# Column assignments are used instead of `df[col].fillna(..., inplace=True)`:
# an in-place call on a column selection operates on a temporary object, which
# raises a FutureWarning on pandas 2.x and has no effect under Copy-on-Write.

# Number of rows in the training set that share each ticket
ticket_count = df_train["Ticket"].value_counts()
df_train["TicketCount"] = df_train["Ticket"].map(ticket_count)

# Fare divided by the number of passengers on the same ticket
# (presumably Fare is the price of the whole ticket -- TODO confirm)
df_train["AdjustedFare"] = df_train["Fare"] / df_train["TicketCount"]

# Single family-size feature: SibSp + Parch
df_train["Family"] = df_train["SibSp"] + df_train["Parch"]

# Deck of residence for each passenger = first character of the cabin code;
# "N" marks passengers whose cabin is missing (NaN)
df_train["Deck"] = df_train["Cabin"].str[0].fillna("N")

# Drop the raw columns that the engineered features replace
df_train = df_train.drop(
    columns=["Name", "Ticket", "SibSp", "Parch", "Fare", "TicketCount", "Cabin"],
    errors="ignore",
)

# Replace missing ages with the mean age, truncated to a whole number
df_train["Age"] = df_train["Age"].fillna(int(df_train["Age"].mean()))

# NOTE(review): Embarked is not imputed here; df_train.info() reports
# 889 of 891 non-null values for it.

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,AdjustedFare,Family,Deck
0,1,0,3,male,22.0,S,7.25,1,N
1,2,1,1,female,38.0,C,71.2833,1,C
2,3,1,3,female,26.0,S,7.925,0,N
3,4,1,1,female,35.0,S,26.55,1,C
4,5,0,3,male,35.0,S,8.05,0,N


In [81]:
# Print column dtypes and non-null counts to check which columns still
# contain missing values after the preprocessing cell.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   891 non-null    int64  
 1   Survived      891 non-null    int64  
 2   Pclass        891 non-null    int64  
 3   Sex           891 non-null    object 
 4   Age           891 non-null    float64
 5   Embarked      889 non-null    object 
 6   AdjustedFare  891 non-null    float64
 7   Family        891 non-null    int64  
 8   Deck          891 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [82]:
from pathlib import Path

import matplotlib.pyplot as plt
from aquarel import load_theme

# Overview figure: a 2x3 grid with the distribution of each main feature.
theme = load_theme("boxy_dark")
theme.set_font(family="serif", serif="Times New Roman", size=12)
theme.apply()
fig, ([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2, 3, figsize=(10, 8))

# value_counts() orders by frequency, not by class code, so the counts are
# reindexed explicitly to line up with the bar labels
# (Kaggle data dictionary: Survived 1 = yes, 0 = no).
survived = df_train["Survived"].value_counts().reindex([1, 0], fill_value=0)
ax1.bar(["Survived", "Deceased"], survived.values, label="Survived")
ax1.set_title("Survival", loc="center")

# The count Series is used directly; wrapping it in a DataFrame and reading a
# "count" column only works on pandas >= 2.0.
pclass = df_train["Pclass"].value_counts()
ax2.bar(pclass.index, pclass.values, label="Passenger Class")
ax2.set_title("Passenger Class", loc="center")

gender = df_train["Sex"].value_counts()
ax3.bar(gender.index, gender.values, label="Gender")
ax3.set_title("Gender", loc="center")

ax4.hist(df_train["Age"], bins=20, label="Age")
ax4.set_title("Age", loc="center")

ax5.hist(df_train["AdjustedFare"], bins=15, label="Fare")
ax5.set_title("Fare", loc="center")

family = df_train["Family"].value_counts()
ax6.bar(family.index, family.values, label="Family")
ax6.set_title("Family Members Onboard", loc="center")

fig.tight_layout()
theme.apply_transforms()

# savefig does not create missing directories, so make sure the target exists
Path("Figures").mkdir(parents=True, exist_ok=True)
fig.savefig("Figures/demo.png", dpi=300)