### Titanic - Machine Learning from Disaster

https://www.kaggle.com/competitions/titanic/data

In [102]:
import torch 
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelBinarizer

In [103]:
trainData = pd.read_csv("Data/train.csv")
testData = pd.read_csv("Data/test.csv")
len(trainData), len(testData)

(891, 418)

In [104]:
# Join both datasets in order to perform preprocessing
trainData["is_train"] = 1
testData["is_train"] = 0

data = trainData.append(testData, ignore_index=True); data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,0
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


#### Dropping Useless variables

In [105]:
data = data.drop("Name", axis=1)

#### Treat the rows with na/null values:

In [106]:
print(data.isna().sum())

PassengerId       0
Survived        418
Pclass            0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
is_train          0
dtype: int64


In [107]:
# NA values for "Age"
print("Age 'NA/Total' rate: ", data["Age"].isna().sum() / len(data))

Age 'NA/Total' rate:  0.20091673032849502


In [108]:
data["Age"] = data["Age"].fillna(data["Age"].mean())

In [109]:
print("Cabin 'NA/Total' rate: ", data["Cabin"].isna().sum() / len(data))

Cabin 'NA/Total' rate:  0.774637127578304


In [110]:
# Since most of the values for "Cabin" are unknown, we can omit this column
data = data.drop("Cabin", axis=1)

In [111]:
# Fullfilling Fare values with mean
data["Fare"] = data["Fare"].fillna(data["Fare"].mean())

In [112]:
# Since the unknown "Embarked" values are from the test set, we can't its rows. Instead, we fulfill them with the most
#  common class
data["Embarked"] = data["Embarked"].replace(to_replace=np.nan,
                         value=data["Embarked"].value_counts(sort=True).index[0]);

#### First look into the data

In [113]:
"""correlations = Full_dataset_standard.corr()
best_atributes_by_corr = abs(correlations["status"]).sort_values(ascending=False).index[1:16]
correlations = correlations[best_atributes_by_corr].transpose()


plt.figure()
ax = sns.heatmap(pd.DataFrame(correlations["status"]), annot=True, linewidths=.5)"""

'correlations = Full_dataset_standard.corr()\nbest_atributes_by_corr = abs(correlations["status"]).sort_values(ascending=False).index[1:16]\ncorrelations = correlations[best_atributes_by_corr].transpose()\n\n\nplt.figure()\nax = sns.heatmap(pd.DataFrame(correlations["status"]), annot=True, linewidths=.5)'

####  Encode categorical attributes

In [114]:
# Dropping Ticket column (categorical but with mostly unique values)
data = data.drop("Ticket", axis=1);

In [115]:
# Label Binarizer for Ticket the 3 different ticket types
labBin = LabelBinarizer()
embarked_lab = pd.DataFrame(labBin.fit_transform(data["Embarked"]),
                           columns = ["Port_of_" + i for i in labBin.classes_],
                           index=data.index)
data = data.join(embarked_lab).drop("Embarked", axis=1)

In [116]:
# Binarizing the genre attribute
data["Sex"] = data["Sex"].replace("male", 0)
data["Sex"] = data["Sex"].replace("female", 1)

#### Split dataset, create DataLoaders

In [132]:
trainData = data[data["is_train"] == 1].drop("is_train", axis=1)
testData = data[data["is_train"] == 0].drop(["is_train", "Survived"], axis=1).reset_index()

In [133]:
trainData.info(), testData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Port_of_C    891 non-null    int32  
 9   Port_of_Q    891 non-null    int32  
 10  Port_of_S    891 non-null    int32  
dtypes: float64(3), int32(3), int64(5)
memory usage: 73.1 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        418 non-null    int64  
 1   PassengerId  418 non-null    int64  
 2   Pclass    

(None, None)