# Titanic Survival Prediction - Kaggle Competition

# Links
- [Titanic Kaggle Competition](https://www.kaggle.com/c/titanic/data)
- [Titanic Interactive Deckplans](https://www.encyclopedia-titanica.org/titanic-deckplans/d-deck.html)
- [Aquarel: An Open-Source Library for Matplotlib styling](https://github.com/lgienapp/aquarel)

### 1. Data Preprocessing

Load the training data into a pandas DataFrame and preview a random sample of rows.

In [54]:
import pandas as pd

# Read the Kaggle training split from the local Data/ directory.
df_train = pd.read_csv("Data/train.csv")

# Preview a random subset of rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
df_train.sample(15, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
554,555,1,3,"Ohman, Miss. Velin",female,22.0,0,0,347085,7.775,,S
665,666,0,2,"Hickman, Mr. Lewis",male,32.0,2,0,S.O.C. 14879,73.5,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
62,63,0,1,"Harris, Mr. Henry Birkhardt",male,45.0,1,0,36973,83.475,C83,S
378,379,0,3,"Betros, Mr. Tannous",male,20.0,0,0,2648,4.0125,,C
411,412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q
352,353,0,3,"Elias, Mr. Tannous",male,15.0,1,1,2695,7.2292,,C
497,498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S
329,330,1,1,"Hippach, Miss. Jean Gertrude",female,16.0,0,1,111361,57.9792,B18,C
80,81,0,3,"Waelens, Mr. Achille",male,22.0,0,0,345767,9.0,,S


In [55]:
# NOTE: this cell consumes the raw "Ticket", "Fare", "SibSp" and "Parch" columns and
# then drops them, so it must be run exactly once on a freshly loaded df_train.

# Number of rows in the training set that share each ticket value
ticket_count = df_train["Ticket"].value_counts()
df_train["TicketCount"] = df_train["Ticket"].map(ticket_count)

# Per-passenger fare: the ticket fare divided by the number of passengers on that ticket
# (assumes "Fare" is the total paid for a shared ticket - TODO confirm against the data description)
df_train["AdjustedFare"] = df_train["Fare"] / df_train["TicketCount"]

# Single family-size feature: siblings/spouses plus parents/children aboard
df_train["Family"] = df_train["SibSp"] + df_train["Parch"]

# Keep only the deck letter (first character of the cabin code); "N" marks a missing cabin.
# Assign the result back instead of calling fillna(..., inplace=True) on a column
# selection: the chained in-place form warns on recent pandas and is a no-op under
# Copy-on-Write, which would silently leave the NaNs in place.
df_train["Cabin"] = df_train["Cabin"].str[0].fillna("N")

# Remove the columns that have been replaced by the engineered features above
df_train = df_train.drop(
    columns=["Name", "Ticket", "SibSp", "Parch", "Fare", "TicketCount"],
    errors='ignore',
)

# Replace missing ages with the mean age truncated to a whole number of years
df_train["Age"] = df_train["Age"].fillna(int(df_train["Age"].mean()))

# Seeded preview so the displayed rows are reproducible
df_train.sample(10, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Cabin,Embarked,AdjustedFare,Family
212,213,0,3,male,22.0,N,S,7.25,0
7,8,0,3,male,2.0,N,S,5.26875,4
77,78,0,3,male,29.0,N,S,8.05,0
594,595,0,2,male,37.0,N,S,26.0,1
391,392,1,3,male,21.0,N,S,7.7958,0
494,495,0,3,male,21.0,N,S,8.05,0
479,480,1,3,female,2.0,N,S,12.2875,1
16,17,0,3,male,2.0,N,Q,5.825,5
696,697,0,3,male,44.0,N,S,8.05,0
620,621,0,3,male,27.0,N,C,7.2271,1


In [56]:
# Print column dtypes and non-null counts after preprocessing to spot any columns
# that still contain missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   891 non-null    int64  
 1   Survived      891 non-null    int64  
 2   Pclass        891 non-null    int64  
 3   Sex           891 non-null    object 
 4   Age           891 non-null    float64
 5   Cabin         891 non-null    object 
 6   Embarked      889 non-null    object 
 7   AdjustedFare  891 non-null    float64
 8   Family        891 non-null    int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [57]:
import matplotlib.pyplot as plt
from aquarel import load_theme

# Configure and activate the aquarel theme, then build a 2x3 grid of axes:
# one panel per feature of the preprocessed training frame.
theme = load_theme("boxy_dark")
theme.set_font(family="serif", serif="Times New Roman", size=12)
theme.apply()
fig, ([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2, 3, figsize=(10, 8))

# value_counts() orders its rows by frequency, not by label value, so the bar names
# are looked up from the index. Hard-coding ["Survived", "Deceased"] attached the
# names to the wrong bars whenever the non-survivors were the larger group.
survived = pd.DataFrame(df_train["Survived"].value_counts())
survival_names = {0: "Deceased", 1: "Survived"}
ax1.bar(
    [survival_names[flag] for flag in survived.index],
    survived['count'].values,
    label="Survived",
)
ax1.set_title("Survival", loc="center")

pclass = pd.DataFrame(df_train["Pclass"].value_counts())
ax2.bar(pclass.index, pclass['count'].values, label="Passenger Class")
# Passenger class is a discrete code, so place ticks only on the classes present
ax2.set_xticks(pclass.index)
ax2.set_title("Passenger Class", loc="center")

gender = pd.DataFrame(df_train["Sex"].value_counts())
ax3.bar(gender.index, gender['count'].values, label="Gender")
ax3.set_title("Gender", loc="center")

ax4.hist(df_train['Age'], bins=20, label="Age")
ax4.set_title("Age", loc="center")

# The histogram shows the per-passenger (adjusted) fare, not the raw ticket fare
ax5.hist(df_train['AdjustedFare'], bins=15, label="Fare")
ax5.set_title("Fare", loc="center")

family = pd.DataFrame(df_train["Family"].value_counts())
ax6.bar(family.index, family['count'].values, label="Family")
ax6.set_title("Family Members Onboard", loc="center")

# Trailing semicolon suppresses the cell's text output
fig.tight_layout();
theme.apply_transforms()
# Writes into the existing Figures/ directory relative to the notebook
fig.savefig("Figures/demo.png", dpi=300)

In [60]:
# Ten highest per-passenger fares, largest first, to inspect the upper tail
(
    df_train["AdjustedFare"]
    .sort_values(ascending=False)
    .iloc[:10]
)

527    221.7792
377    211.5000
679    170.7764
258    170.7764
737    170.7764
742    131.1875
311    131.1875
118    123.7604
299    123.7604
835     83.1583
Name: AdjustedFare, dtype: float64