# Data cleansing

In [1]:
import warnings
warnings.filterwarnings("ignore")
    
import numpy as np
from scipy import io
import pandas as pd

# Load the raw training and testing splits from CSV files in the working directory.
data_train = pd.read_csv("titanic_training.csv")
data_test = pd.read_csv("titanic_testing_data.csv")

# Preview the first rows of the raw training frame (before any cleansing is applied).
data_train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0.0,3.0,male,,0.0,0.0,SOTON/OQ 392086,8.05,,S
1,0.0,1.0,male,22.0,0.0,0.0,PC 17760,135.6333,,C
2,0.0,2.0,male,23.0,0.0,0.0,SC/PARIS 2133,15.0458,,C
3,0.0,2.0,male,42.0,0.0,0.0,211535,13.0,,S
4,0.0,3.0,male,20.0,0.0,0.0,7534,9.8458,,S


In [2]:
# Column dtypes and non-null counts of the training frame, to see which columns have missing values.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
survived    999 non-null float64
pclass      999 non-null float64
sex         999 non-null object
age         804 non-null float64
sibsp       999 non-null float64
parch       999 non-null float64
ticket      999 non-null object
fare        998 non-null float64
cabin       227 non-null object
embarked    997 non-null object
dtypes: float64(6), object(4)
memory usage: 78.2+ KB


In [3]:
# Same summary for the testing frame, to compare its columns and missing values with the training frame.
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 9 columns):
pclass      310 non-null float64
sex         310 non-null object
age         242 non-null float64
sibsp       310 non-null float64
parch       310 non-null float64
ticket      310 non-null object
fare        310 non-null float64
cabin       68 non-null object
embarked    310 non-null object
dtypes: float64(5), object(4)
memory usage: 21.9+ KB


In [4]:
# find empty rows and drop them
def drop_empty_row(df):
    """Remove, in place, every row of ``df`` in which all values are missing.

    Rows that have at least one non-missing value are kept. Afterwards the
    index is renumbered 0..n-1 so that later steps can rely on a gap-free
    index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # how='all' only removes rows where *every* column is NaN. Doing this in
    # one vectorised call (instead of dropping inside a loop over the original
    # row count) avoids skipping the row that follows a dropped one and avoids
    # looking up labels that no longer exist after the index has shrunk.
    df.dropna(how='all', inplace=True)
    df.reset_index(drop=True, inplace=True)

In [5]:
# fill empty entries in dataframe
def fill_empty(df):
    """Fill the missing values of ``age``, ``fare``, ``cabin`` and ``embarked`` in place.

    * ``age``      -> mean of the column, rounded to the nearest integer
    * ``fare``     -> mean of the column
    * ``cabin``    -> missing values become ``'G'``; every value is then reduced
      to its first character (the deck letter), the cabin number is discarded
    * ``embarked`` -> missing values become ``'U'`` (unknown)

    The means are computed from the frame that is passed in, so each data set
    is imputed with its own statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns above. It is modified in place.

    Returns
    -------
    None
    """
    # Columns are re-assigned on the frame itself instead of calling
    # ``df[col].fillna(..., inplace=True)``: the latter is a chained assignment
    # that operates on a temporary object and does not reliably update ``df``.

    # mean value for age (skipped when the column has no values at all,
    # because a NaN mean cannot be rounded)
    if df['age'].notna().any():
        mean_age = round(df['age'].mean())
        df['age'] = df['age'].fillna(mean_age)

    # mean value for fare
    df['fare'] = df['fare'].fillna(df['fare'].mean())

    # cabin is an alphanumeric code: a deck letter followed by a number.
    # Unknown cabins get the letter G; only the leading letter is kept.
    # ``.str[0]`` does this for the whole column at once and does not depend
    # on the index labels being 0..n-1.
    df['cabin'] = df['cabin'].fillna('G').str[0]

    # fill unknown embarkation with letter U (for unknown)
    df['embarked'] = df['embarked'].fillna('U')

In [6]:
# since almost all ticket numbers are different and seem to be random I will delete this feature
def drop_ticket(df):
    """Remove the ``ticket`` column from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``ticket`` column. It is modified in place.

    Returns
    -------
    None
    """
    df.drop(labels='ticket', axis=1, inplace=True)

In [7]:
# one-hot encode categorical values
def vectorize(df, columns=('sex', 'embarked', 'cabin', 'pclass')):
    """One-hot encode categorical columns of ``df`` in place.

    For each listed column, in order, the column is replaced by one indicator
    column per distinct value, named ``<column>_<value>`` (for example
    ``sex_male`` or ``pclass_1.0``). The new indicator columns are appended at
    the end of the frame.

    ``pclass`` is numeric, but it is encoded as well because it is treated
    here as a category rather than as a quantity.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``. Modified in place.
    columns : iterable of str, optional
        Columns to encode. Defaults to ``sex``, ``embarked``, ``cabin`` and
        ``pclass``.

    Returns
    -------
    None
    """
    for column in columns:
        dummies = pd.get_dummies(df[column])
        df.drop(columns=column, inplace=True)

        for level in dummies.columns:
            # str() so that non-string category values (e.g. the float
            # pclass values) can be concatenated into the column name
            df[column + '_' + str(level)] = dummies[level]

In [8]:
# put all the defined functions into one single cleansing function
def cleansing(df):
    """Run the complete cleaning pipeline on ``df`` in place.

    The steps are applied in this order: drop all-empty rows, fill missing
    values, drop the ticket column, one-hot encode the categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    pipeline = (drop_empty_row, fill_empty, drop_ticket, vectorize)
    for step in pipeline:
        step(df)

In [10]:
# cleanse training data and export
# cleansing() works in place and returns nothing, so data_train itself is the cleaned frame afterwards.
cleansing(data_train)

# Write the cleaned training frame to CSV without the index column.
data_train.to_csv('titanic_clean_train.csv', index=False)

In [11]:
# cleanse testing data and export
# Clean the testing frame in place.
cleansing(data_test)

# Give the testing frame exactly the columns of the cleaned training frame:
# every training column that the testing frame lacks (e.g. an indicator for a
# category that only occurs in the training data) is added, filled with zeros.
# NOTE(review): the training frame also carries the label column 'survived',
# which the testing data does not have, so the exported test file ends up with
# an all-zero 'survived' column. It is a placeholder, not a real label.
for column in data_train.columns:
    if column not in data_test.columns:
        data_test[column] = np.zeros(len(data_test))

# Select the training columns in training order; columns that exist only in
# the testing frame are dropped by this selection.
data_test = data_test.loc[:, list(data_train.columns)]

# Write the aligned testing frame to CSV without the index column.
data_test.to_csv('titanic_clean_test.csv', index=False)