# 1. Load data

In [1]:
import pandas as pd

In [2]:
# Read the training CSV into the working dataframe.
df_passengers = pd.read_csv(filepath_or_buffer="./data/train.csv")

In [7]:
# Preview the first four rows.
df_passengers.head(n=4)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [4]:
# Report the dataframe dimensions (shape is a (rows, columns) pair).
print("There is {} rows and {} columns in the dataframe".format(*df_passengers.shape))

There is 891 rows and 12 columns in the dataframe


# 2. Exploration and preprocessing

## 2.1. Passenger id

PassengerId is nothing more than a row identifier in the dataframe; it does not bring any information, so we can drop it.

In [5]:
# Remove the identifier column from the dataframe in place.
df_passengers.drop(columns=["PassengerId"], inplace=True)

## 2.2. P class

Pclass is the ticket class, which can be seen as a proxy for socio-economic status.

In [6]:
# List the distinct ticket classes.
df_passengers["Pclass"].unique()

array([3, 1, 2], dtype=int64)

There are 3 different categories, so let's create dummy variables from them.

In [8]:
# Replace Pclass with dummy columns; drop_first=True omits the first category.
df_passengers = pd.get_dummies(
    df_passengers,
    columns=["Pclass"],
    drop_first=True,
)

## 2.3. Sex

In [11]:
# List the distinct values of the Sex column.
df_passengers["Sex"].unique()

array(['male', 'female'], dtype=object)

There are 2 different categories, so let's create dummy variables from them.

In [12]:
# Replace Sex with dummy columns; drop_first=True omits the first category.
df_passengers = pd.get_dummies(
    df_passengers,
    columns=["Sex"],
    drop_first=True,
)

## 2.4. Age

Age is given in whole years. For children less than one year old, Age is fractional. If the Age is estimated, it is in the form xx.5.

In [14]:
# List the distinct ages (NaN included when values are missing).
df_passengers["Age"].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

First, let's count the missing (NaN) values.

In [46]:
# Count the rows whose Age is missing.
print("There is {} rows without age".format(df_passengers["Age"].isnull().sum()))

There is 0 rows without age


Second, let's look at the number of passengers with an estimated age.

In [47]:
# Count the non-missing ages whose fractional part is exactly 0.5.
print(
    "There is {} rows with estimated age".format(
        df_passengers["Age"]
        .dropna()
        .apply(lambda age: (age - int(age)) == 0.5)
        .sum()
    )
)

There is 18 rows with estimated age


For the moment, we will fill the missing values with the mean of the ages that are present in the dataframe (estimated ages, of the form xx.5, are left out of the mean).

In [42]:
# Mean of the ages that are present and whose fractional part is not 0.5;
# missing values are dropped before the fractional-part check is applied.
mean_age = (
    df_passengers["Age"]
    .dropna()
    .loc[lambda ages: ages.apply(lambda age: (age - int(age)) != 0.5)]
    .mean()
)

In [45]:
# Replace every missing Age with mean_age.
df_passengers["Age"] = df_passengers["Age"].fillna(value=mean_age)

## 2.5. Sib sp

SibSp is the number of siblings / spouses aboard the Titanic

In [53]:
# List the distinct sibling/spouse counts.
df_passengers["SibSp"].unique()

array([1, 0, 3, 4, 2, 5, 8], dtype=int64)

## 2.6. Parch

Parch is the number of parents / children aboard the Titanic

In [54]:
# List the distinct parent/child counts.
df_passengers["Parch"].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

## 2.7. Ticket

Ticket represents the ticket number.

In [73]:
# List the distinct ticket numbers.
df_passengers["Ticket"].unique()

array(['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450',
       '330877', '17463', '349909', '347742', '237736', 'PP 9549',
       '113783', 'A/5. 2151', '347082', '350406', '248706', '382652',
       '244373', '345763', '2649', '239865', '248698', '330923', '113788',
       '347077', '2631', '19950', '330959', '349216', 'PC 17601',
       'PC 17569', '335677', 'C.A. 24579', 'PC 17604', '113789', '2677',
       'A./5. 2152', '345764', '2651', '7546', '11668', '349253',
       'SC/Paris 2123', '330958', 'S.C./A.4. 23567', '370371', '14311',
       '2662', '349237', '3101295', 'A/4. 39886', 'PC 17572', '2926',
       '113509', '19947', 'C.A. 31026', '2697', 'C.A. 34651', 'CA 2144',
       '2669', '113572', '36973', '347088', 'PC 17605', '2661',
       'C.A. 29395', 'S.P. 3464', '3101281', '315151', 'C.A. 33111',
       'S.O.C. 14879', '2680', '1601', '348123', '349208', '374746',
       '248738', '364516', '345767', '345779', '330932', '113059',
       'SO/C 14885', '31012

Ticket does not seem to bring any information, so let's drop it.

In [75]:
# Remove the Ticket column from the dataframe in place.
df_passengers.drop(columns=["Ticket"], inplace=True)

## 2.8. Fare

Fare is the fare paid for the ticket.

In [65]:
# List the distinct fares.
df_passengers["Fare"].unique()

array([  7.25  ,  71.2833,   7.925 ,  53.1   ,   8.05  ,   8.4583,
        51.8625,  21.075 ,  11.1333,  30.0708,  16.7   ,  26.55  ,
        31.275 ,   7.8542,  16.    ,  29.125 ,  13.    ,  18.    ,
         7.225 ,  26.    ,   8.0292,  35.5   ,  31.3875, 263.    ,
         7.8792,   7.8958,  27.7208, 146.5208,   7.75  ,  10.5   ,
        82.1708,  52.    ,   7.2292,  11.2417,   9.475 ,  21.    ,
        41.5792,  15.5   ,  21.6792,  17.8   ,  39.6875,   7.8   ,
        76.7292,  61.9792,  27.75  ,  46.9   ,  80.    ,  83.475 ,
        27.9   ,  15.2458,   8.1583,   8.6625,  73.5   ,  14.4542,
        56.4958,   7.65  ,  29.    ,  12.475 ,   9.    ,   9.5   ,
         7.7875,  47.1   ,  15.85  ,  34.375 ,  61.175 ,  20.575 ,
        34.6542,  63.3583,  23.    ,  77.2875,   8.6542,   7.775 ,
        24.15  ,   9.825 ,  14.4583, 247.5208,   7.1417,  22.3583,
         6.975 ,   7.05  ,  14.5   ,  15.0458,  26.2833,   9.2167,
        79.2   ,   6.75  ,  11.5   ,  36.75  ,   7.7958,  12.5

## 2.9. Cabin

Cabin represents the cabin number.

In [70]:
# List the distinct cabin values (NaN included when values are missing).
df_passengers["Cabin"].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

## 2.10. Embarked

Embarked is the port of embarkation

In [92]:
# List the distinct values of the Embarked column.
df_passengers["Embarked"].unique()

array(['S', 'C', 'Q', 'M'], dtype=object)

In [78]:
# Count the rows whose Embarked value is missing.
df_passengers["Embarked"].isnull().sum()

2

For the moment, let's fill the missing embarkation values with the placeholder 'M'.

In [90]:
# Fill the missing values in the Embarked column only. Assigning "M" to the
# boolean-masked dataframe (df[mask] = "M") would overwrite every column of
# the matching rows (Survived, Name, Age, Fare, ...) and turn the numeric
# columns into object dtype. fillna is also safe to re-run: once no value is
# missing it leaves the column unchanged.
df_passengers["Embarked"] = df_passengers["Embarked"].fillna("M")

# 3. Feature engineering

In [None]:
# Preview the first five rows of the preprocessed dataframe.
df_passengers.head(n=5)

Unnamed: 0,Survived,Name,Age,SibSp,Parch,Fare,Cabin,Embarked,Pclass_2,Pclass_3,Sex_male
0,0,"Braund, Mr. Owen Harris",22.0,1,0,7.25,,S,0,1,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,C85,C,0,0,0
2,1,"Heikkinen, Miss. Laina",26.0,0,0,7.925,,S,0,1,0
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1,C123,S,0,0,0
4,0,"Allen, Mr. William Henry",35.0,0,0,8.05,,S,0,1,1
