Link to challenge: https://www.kaggle.com/competitions/titanic/overview

In [1]:
import pandas as pd
import math
from sklearn.preprocessing import OneHotEncoder
import numpy as np


In [2]:
# Read the data
gender_df = pd.read_csv('data/gender_submission.csv')
test_df = pd.read_csv('data/test.csv')
train_df = pd.read_csv('data/test.csv')

In [3]:
# Exploring test_df
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [4]:
test_df.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# storing passenger names and id
passenger_names = train_df[['PassengerId','Name']]
passenger_names = passenger_names.set_index(passenger_names['PassengerId'])
passenger_names = passenger_names.drop('PassengerId', axis=1)
passenger_names.head()

Unnamed: 0_level_0,Name
PassengerId,Unnamed: 1_level_1
892,"Kelly, Mr. James"
893,"Wilkes, Mrs. James (Ellen Needs)"
894,"Myles, Mr. Thomas Francis"
895,"Wirz, Mr. Albert"
896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)"


In [7]:
# removing passenger names as OHC would assign each one a random value, might attrbute to overfitting.
test_df = test_df.drop(['Name'], axis=1)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,330911,7.8292,,Q
1,893,3,female,47.0,1,0,363272,7.0,,S
2,894,2,male,62.0,0,0,240276,9.6875,,Q
3,895,3,male,27.0,0,0,315154,8.6625,,S
4,896,3,female,22.0,1,1,3101298,12.2875,,S


In [8]:
# use an encoder to transform Sex, Cabin, and Embarked to a real number.
for row, val in test_df.iterrows():
    if val['Sex'] == 'male':
        test_df.Sex[row] = 0
    if val['Sex'] == 'female':
        test_df.Sex[row] = 1
        
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.Sex[row] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.Sex[row] = 1


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,0,34.5,0,0,330911,7.8292,,Q
1,893,3,1,47.0,1,0,363272,7.0,,S
2,894,2,0,62.0,0,0,240276,9.6875,,Q
3,895,3,0,27.0,0,0,315154,8.6625,,S
4,896,3,1,22.0,1,1,3101298,12.2875,,S


In [10]:
# Encoding Cabin column.
ohc = OneHotEncoder()
ohc.fit(test_df[['Cabin']])
encoded_cabin_categories = ohc.categories_
x = ohc.transform(test_df[['Cabin']]).toarray()
encoded_cabin = pd.DataFrame(x)
encoded_cabin.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Encoding Embarked column.
ohc = OneHotEncoder()
ohc.fit(test_df[['Embarked']])
encoded_embarked_categories = ohc.categories_
x = ohc.transform(test_df[['Embarked']]).toarray()
encoded_embarked = pd.DataFrame(x)
encoded_embarked.head()

Unnamed: 0,0,1,2
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [13]:
# replace Cabin and Embarked columns with encoded columns
encoded_cabin_categories

[array(['A11', 'A18', 'A21', 'A29', 'A34', 'A9', 'B10', 'B11', 'B24',
        'B26', 'B36', 'B41', 'B45', 'B51 B53 B55', 'B52 B54 B56',
        'B57 B59 B63 B66', 'B58 B60', 'B61', 'B69', 'B71', 'B78', 'C101',
        'C105', 'C106', 'C116', 'C130', 'C132', 'C22 C26', 'C23 C25 C27',
        'C28', 'C31', 'C32', 'C39', 'C46', 'C51', 'C53', 'C54', 'C55 C57',
        'C6', 'C62 C64', 'C7', 'C78', 'C80', 'C85', 'C86', 'C89', 'C97',
        'D', 'D10 D12', 'D15', 'D19', 'D21', 'D22', 'D28', 'D30', 'D34',
        'D37', 'D38', 'D40', 'D43', 'E31', 'E34', 'E39 E41', 'E45', 'E46',
        'E50', 'E52', 'E60', 'F', 'F E46', 'F E57', 'F G63', 'F2', 'F33',
        'F4', 'G6', nan], dtype=object)]

In [None]:
# Transform the data

In [None]:
# Load the data