In [1]:
# Importing required Libraries

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides any deprecation or
# chained-assignment warnings that later cells might raise — consider
# narrowing the filter once the notebook is stable.
warnings.simplefilter("ignore")

In [4]:
# Load the dataset from a path relative to the current working directory,
# then preview the first five rows to check the columns that were read.
df = pd.read_csv("titanic/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Extracting categorical features

In [10]:
# Names of the columns stored with the generic `object` dtype (the string
# columns of this dataset), in their original column order.
# select_dtypes replaces a per-column comparison against a list of
# hand-typed dtype strings, most of which were misspelled or invalid and
# only worked because the single valid entry 'O' happened to match.
categorical_features = df.select_dtypes(include="object").columns.tolist()

In [11]:
categorical_features

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [27]:
# Keep the two categorical predictors (Sex, Embarked) together with the
# Survived target. The explicit .copy() gives new_df its own data, so later
# edits to it neither touch df nor trigger pandas' chained-assignment checks.
new_df = df[["Sex", "Embarked", "Survived"]].copy()

In [28]:
new_df.head()

Unnamed: 0,Sex,Embarked,Survived
0,male,S,0
1,female,C,1
2,female,S,1
3,female,S,1
4,male,S,0


In [29]:
new_df.isnull().sum()

Sex         0
Embarked    2
Survived    0
dtype: int64

In [30]:
new_df["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [31]:
# Fill the two missing Embarked entries with "S", the most frequent value.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object and
# is not guaranteed to update new_df (it is a no-op under copy-on-write).
new_df["Embarked"] = new_df["Embarked"].fillna("S")

In [25]:
# train test split

In [26]:
from sklearn.model_selection import train_test_split

In [45]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. Predictors and target are selected by column name.
X_train, X_test, y_train, y_test = train_test_split(
    new_df[["Sex", "Embarked"]],
    new_df["Survived"],
    test_size=0.2,
    random_state=42,
)

In [46]:
df.shape, X_train.shape, X_test.shape

((891, 12), (712, 2), (179, 2))

In [47]:
X_train

Unnamed: 0,Sex,Embarked
331,male,S
733,male,S
382,male,S
704,male,S
813,female,S
...,...,...
106,female,S
270,male,S
860,male,S
435,female,S


# Methods

In [23]:
# Ordinal Encoder

In [24]:
from sklearn.preprocessing import OrdinalEncoder

In [62]:
# Fix the category order explicitly so the codes are Q -> 0, C -> 1, S -> 2.
# NOTE(review): this order looks arbitrary — confirm that Embarked really has
# a meaningful ranking, otherwise an ordinal code imposes one on the model.
ordinal_encoder = OrdinalEncoder(categories=[["Q", "C", "S"]])

In [65]:
# Fit on the training split only. The encoder is given a single-column 2-D
# array, so a trailing axis is added to the 1-D Embarked values.
X = np.asarray(X_train["Embarked"])
ordinal_encoder.fit(X[:, np.newaxis])

In [73]:
# Encode the training Embarked column. Selecting with a one-element list
# keeps the data 2-D, so no reshape is needed before the transform.
x_ordinal_transfer = ordinal_encoder.transform(np.asarray(X_train[["Embarked"]]))

In [76]:
x_ordinal_transfer[:10]

array([[2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [1.],
       [2.],
       [2.]])

In [77]:
# Apply the already-fitted encoder to the held-out Embarked column. The
# one-element list selection keeps the data 2-D, so no reshape is needed.
test_ordinal_transfer = ordinal_encoder.transform(np.asarray(X_test[["Embarked"]]))

In [78]:
test_ordinal_transfer[:10]

array([[1.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [0.],
       [2.],
       [0.],
       [2.]])

In [79]:
# Label Encoder

In [80]:
from sklearn.preprocessing import LabelEncoder

In [81]:
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values (y); it is applied to the Sex feature below — confirm this is
# intentional.
label_encoder = LabelEncoder()

In [82]:
# Learn the set of class labels from the training split's Sex column.
label_encoder.fit(X_train["Sex"])

In [85]:
label_encoder.classes_

array(['female', 'male'], dtype=object)

In [87]:
# Replace each training Sex value with its index in label_encoder.classes_.
x_label_encoded = label_encoder.transform(X_train["Sex"])

In [88]:
x_label_encoded[:10]

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1])

In [89]:
# Encode the held-out Sex column with the encoder fitted on the training split.
test_label_encoded = label_encoder.transform(X_test["Sex"])

In [90]:
test_label_encoded[:10]

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0])

In [91]:
# One Hot Encoding

In [93]:
from sklearn.preprocessing import OneHotEncoder

In [94]:
# Request dense output so the encoded arrays can be inspected directly.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to
# the legacy keyword on older releases that do not know `sparse_output`.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

In [95]:
# Learn the categories of every column in X_train (Sex and Embarked) from
# the training split only.
onehot_encoder.fit(X_train)

In [96]:
# Produce one indicator column per learned category: two for Sex followed by
# three for Embarked.
train_onehot_encode = onehot_encoder.transform(X_train)

In [101]:
train_onehot_encode[:10]

array([[0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1.]])

In [98]:
# Encode the held-out rows with the categories learned from the training split.
test_one_hot_encode = onehot_encoder.transform(X_test)

In [100]:
test_one_hot_encode[:10]

array([[0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1.]])