In [1]:
import datetime as dp
import os

import numpy as np
import pandas as pd

In [2]:
# Folder holding train.csv / test.csv. Set the TITANIC_DATA_DIR environment
# variable to point elsewhere; the original local path remains the default.
dataset_path = os.environ.get('TITANIC_DATA_DIR', 'E:/Datasets/titanic/raw dataset')

In [3]:
# Read both raw splits from the dataset folder.
train_csv = f'{dataset_path}/train.csv'
test_csv = f'{dataset_path}/test.csv'
train_raw = pd.read_csv(train_csv)
test_raw = pd.read_csv(test_csv)

# Summary

In [4]:
# Preview the first rows of the training set.
train_preview = train_raw.head()
train_preview

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
def count_infs(series):
    """Return how many entries of ``series`` are +inf or -inf.

    Non-numeric series cannot hold infinities as far as this check is
    concerned, so they report 0 without being inspected.
    """
    if not pd.api.types.is_numeric_dtype(series):
        return 0
    inf_mask = np.isinf(series)
    return inf_mask.sum()

# One row per training-set column: its dtype, number of NaNs and number of infs.
summary = (
    pd.DataFrame({
        'dtype': train_raw.dtypes,
        'n_nans': train_raw.isna().sum(),
        'n_infs': train_raw.apply(count_infs),
    })
    .reset_index()
    .rename(columns={'index': 'column'})
)
print(summary)

         column    dtype  n_nans  n_infs
0   PassengerId    int64       0       0
1      Survived    int64       0       0
2        Pclass    int64       0       0
3          Name   object       0       0
4           Sex   object       0       0
5           Age  float64     177       0
6         SibSp    int64       0       0
7         Parch    int64       0       0
8        Ticket   object       0       0
9          Fare  float64       0       0
10        Cabin   object     687       0
11     Embarked   object       2       0


Same for the test set

In [13]:
# Same per-column overview (dtype, NaN count, inf count), this time for the test set.
test_dtypes = test_raw.dtypes
test_nan_counts = test_raw.isna().sum()
test_inf_counts = test_raw.apply(count_infs)

summary = pd.DataFrame({
    'dtype': test_dtypes,
    'n_nans': test_nan_counts,
    'n_infs': test_inf_counts,
})
# Move the column names out of the index into a regular 'column' field.
summary = summary.reset_index().rename(columns={'index': 'column'})
print(summary)

         column    dtype  n_nans  n_infs
0   PassengerId    int64       0       0
1        Pclass    int64       0       0
2          Name   object       0       0
3           Sex   object       0       0
4           Age  float64      86       0
5         SibSp    int64       0       0
6         Parch    int64       0       0
7        Ticket   object       0       0
8          Fare  float64       1       0
9         Cabin   object     327       0
10     Embarked   object       0       0


In [6]:
# Columns kept for modelling: everything except PassengerId and Name.
# NOTE: the target column 'Survived' is not excluded here.
excluded_cols = ['PassengerId', 'Name']
all_columns = train_raw.columns
feature_names = all_columns[~all_columns.isin(excluded_cols)]

In [7]:
# Numerical data: describe() summarises the numeric columns by default,
# so the columns of its result are the numerical feature names.
numeric_summary = train_raw[feature_names].describe()
numerical_cols = numeric_summary.columns
numeric_summary

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# Categorical data: restrict describe() to object-dtype columns so that
# categorical_cols holds the categorical feature names. A plain describe()
# would return the numeric columns again.
categorical_summary = train_raw[feature_names].describe(include='object')
categorical_cols = categorical_summary.columns
categorical_summary

Unnamed: 0,Sex,Ticket,Cabin,Embarked
count,891,891,204,889
unique,2,681,147,3
top,male,347082,G6,S
freq,577,7,4,644


In [34]:
# Number of fully duplicated rows in the training set.
n_train_duplicates = train_raw.duplicated().sum()
n_train_duplicates

np.int64(0)

In [35]:
# Number of fully duplicated rows in the test set.
n_test_duplicates = test_raw.duplicated().sum()
n_test_duplicates

np.int64(0)

No duplicated rows in either the training or the test set.

# Preprocessing

I'll start by wrangling the dataset a little.
Let's map the Sex column to numerical values:

In [24]:
# Build working copies of both splits with Sex encoded as 0 (male) / 1 (female);
# the raw frames are left untouched.
sex_codes = {'male': 0, 'female': 1}
train_w = train_raw.assign(Sex=train_raw['Sex'].map(sex_codes))
test_w = test_raw.assign(Sex=test_raw['Sex'].map(sex_codes))

Three columns in the training set have missing data (Age, Cabin and Embarked). I'll simply drop the two rows with a missing Embarked value; it won't make much difference.

In [25]:
# Keep only the training rows that have an Embarked value.
has_embarked = train_w['Embarked'].notna()
train_w = train_w[has_embarked]

In [26]:
# Missing cabins get the placeholder label 'unknown' in both working frames.
for frame in (train_w, test_w):
    frame.loc[:, 'Cabin'] = frame['Cabin'].fillna('unknown')

The Age column has a lot of NaNs, and I don't know how this could be affecting the results.

In [27]:
# Pairwise correlation between the numerical columns of the working training set.
corr_matrix = train_w[numerical_cols].corr()
print(corr_matrix)

          Survived    Pclass       Age     SibSp     Parch      Fare
Survived  1.000000 -0.335549 -0.082446 -0.034040  0.083151  0.255290
Pclass   -0.335549  1.000000 -0.365902  0.081656  0.016824 -0.548193
Age      -0.082446 -0.365902  1.000000 -0.307351 -0.187896  0.093143
SibSp    -0.034040  0.081656 -0.307351  1.000000  0.414542  0.160887
Parch     0.083151  0.016824 -0.187896  0.414542  1.000000  0.217532
Fare      0.255290 -0.548193  0.093143  0.160887  0.217532  1.000000


I am going to impute the missing Age values by training a small RandomForest predictor.

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Predictors for the age model. Fare is included even though its correlation
# with Age is small, since it could still be helpful.
features = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Sex']

# Split the training frame once on whether Age is known.
age_missing = train_w['Age'].isna()
train_age = train_w[~age_missing].copy()   # rows with a known Age: used to fit
test_age = train_w[age_missing].copy()     # rows without Age: to be predicted

X_train, y_train = train_age[features], train_age['Age']
X_test = test_age[features]

# Fixed random_state keeps the imputed ages reproducible across runs.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the missing ages and write them back into the working frame.
predicted_ages = model.predict(X_test)
train_w.loc[age_missing, 'Age'] = predicted_ages

In [29]:
#And I repeat for the test set, using the same predictor
pred_age = test_w[test_w['Age'].isna()].copy()
X_pred = pred_age[features]

predicted_ages = model.predict(X_pred)

test_w.loc[test_w['Age'].isna(), 'Age'] = predicted_ages

In [33]:
# A single test row has no Fare, so it is filled with the mean fare of the
# training set. The mean is computed from the data instead of hard-coding the
# rounded value 32.204208 copied from the describe() output.
fare_mean = train_raw['Fare'].mean()
test_w.loc[test_w['Fare'].isna(), 'Fare'] = fare_mean

# Univariate analysis