# Data preprocess notebook

## importing libraries

In [1]:
import numpy as np 
import pandas as pd 
import sklearn
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

## INTRODUCTION

In [2]:
X_full=pd.read_csv("titanic/train.csv",index_col="PassengerId")
X=X_full.copy()

In [3]:
print(X_full.shape)

(891, 11)


In [4]:
X_full.dtypes

Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

### List of missing values per features

In [5]:
"""
list of missing values per features
"""
X_full.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

### numerical features

In [6]:
num_features=[col for col in X_full.columns if X_full.dtypes[col]!=object]
print("numerical features : ",num_features)


numerical features :  ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


### categorical features

In [7]:
cat_features=[col for col in X_full.columns if X_full.dtypes[col]==object]
print("categorical features : ",cat_features)

categorical features :  ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


### categorical features with missing values

In [8]:
cat_nan=[col for col in cat_features if X_full.isnull()[col].sum()!=0]
print("categorical features with nan : ",cat_nan)

categorical features with nan :  ['Cabin', 'Embarked']


### Numerical features with missing values

In [9]:
num_nan=[col for col in num_features if X_full.isnull()[col].sum()!=0]
print("numerical features with nan : ",num_nan)

numerical features with nan :  ['Age']


## Dealing with missing values

### Dropping the data

#### Dropping the rows

In [10]:
"""
dropping rows with missing values
"""
X=X_full.copy()
X.dropna(axis=0,inplace=True)
print("Old shape : ",X_full.shape[0])
print("New shape : ",X.shape[0])

Old shape :  891
New shape :  183


#### Dropping the features

In [11]:
"""
dropping features with missing values
"""
X=X_full.copy()

X.drop(num_nan+cat_nan,axis=1,inplace=True)
print("Deleted columns : ",num_nan+cat_nan)

Deleted columns :  ['Age', 'Cabin', 'Embarked']


### Imputing the data

#### Filling with 0,-9999

In [12]:
"""
filling with 0 the age column
"""
X=X_full.copy()
X["Age"].fillna(0,inplace=True)
X["Age"].value_counts().head()

0.0     177
24.0     30
22.0     27
18.0     26
28.0     25
Name: Age, dtype: int64

#### Filling with median

In [13]:
"""
filling with median the age column
"""
X=X_full.copy()
X["Age"].fillna(X["Age"].median(),inplace=True)
X["Age"].value_counts().head()

28.0    202
24.0     30
22.0     27
18.0     26
19.0     25
Name: Age, dtype: int64

#### Filling with mean

In [14]:
"""
filling with mean the age column
"""
X=X_full.copy()
X["Age"].fillna(X["Age"].mean(),inplace=True)
X["Age"].value_counts().head()

29.699118    177
24.000000     30
22.000000     27
18.000000     26
28.000000     25
Name: Age, dtype: int64

#### Filling with mice ;)

In [15]:
"""
filling all missing values with MICE
"""

X=X_full.copy()
from fancyimpute import IterativeImputer as MICE
X_num = pd.DataFrame(MICE(verbose=False).fit_transform(X[num_features]) ,columns=X[num_features].columns,index=X[num_features].index)
X=X[cat_features].join(X_num)
X["Age"].isnull().sum()

Using TensorFlow backend.


0

#### Filling with mode

In [16]:
"""
filling Cabin with mode (most frequent value)
"""
X=X_full.copy()
X["Cabin"].fillna(X["Cabin"].mode()[0],inplace=True)
X["Cabin"].value_counts().head()

B96 B98        691
C23 C25 C27      4
G6               4
D                3
E101             3
Name: Cabin, dtype: int64

### Assignement: Dealing with missing values

-fill Age feature missing values with mean

-drop Cabin feature (high rate of missing values)

-fill Embarked feature missing values with mode

In [17]:
"""
    -fill Age feature missing values with mean
    -drop Cabin feature
    -fill Embarked feature missing values with mode
    """
X_full.drop("Cabin",axis=1,inplace=True)
X_full['Age'] = X_full["Age"].fillna(X_full['Age'].mean())
X_full["Embarked"]=X_full["Embarked"].fillna(X_full["Embarked"].mode()[0])
cat_features.remove("Cabin")
X_full.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

## Encoding categorical features

### Label encoder

In [18]:
"""
we are going to use label encoder for all the categorical features
"""
X=X_full.copy()
for col in cat_features:
    lb=preprocessing.LabelEncoder()
    X[col]=lb.fit_transform(X[col])
print(X.dtypes)
X.head()

Survived      int64
Pclass        int64
Name          int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Ticket        int64
Fare        float64
Embarked      int64
dtype: object


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,108,1,22.0,1,0,523,7.25,2
2,1,1,190,0,38.0,1,0,596,71.2833,0
3,1,3,353,0,26.0,0,0,669,7.925,2
4,1,1,272,0,35.0,1,0,49,53.1,2
5,0,3,15,1,35.0,0,0,472,8.05,2


### One hot encoder

In [19]:
"""
we are going to use one hot encoder for all the categorical features
"""
X=X_full.copy()
from sklearn.preprocessing import OneHotEncoder
oh_enc=OneHotEncoder(handle_unknown="ignore")
oh_X=oh_enc.fit_transform(X[cat_features])
X1=pd.DataFrame(oh_X.toarray(),index=X.index)
X=X.drop(cat_features,axis=1).join(X1)
X.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,...,1567,1568,1569,1570,1571,1572,1573,1574,1575,1576
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,22.0,1,0,7.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,1,38.0,1,0,71.2833,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,3,26.0,0,0,7.925,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1,35.0,1,0,53.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0,3,35.0,0,0,8.05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Target encoder

In [20]:
"""
we are going to use target encoder for all the categorical features
"""
from category_encoders import TargetEncoder
X=X_full.copy()
for col in cat_features:
    t_e=TargetEncoder()
    X[col]=X[col].apply(lambda x:str(x))
    X[col]=t_e.fit_transform(X[col],X["Survived"])
print(X["Sex"].value_counts())

0.188908    577
0.742038    314
Name: Sex, dtype: int64


### Assignement: Encoding categorical features

-one hot encoding : embarked sex (low number of unique values non ordinal variable)

-ticket and name should be dropped because encoding them doesn't add any useful information 


In [21]:
oh_cols=["Embarked","Sex"]

oh_enc=OneHotEncoder(handle_unknown="ignore")
oh_X=oh_enc.fit_transform(X[oh_cols])
X1=pd.DataFrame(oh_X.toarray(),index=X.index)
X_full=X_full.drop(cat_features,axis=1).join(X1)
X_full.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,22.0,1,0,7.25,1.0,0.0,0.0,1.0,0.0
2,1,1,38.0,1,0,71.2833,0.0,0.0,1.0,0.0,1.0
3,1,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
4,1,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
5,0,3,35.0,0,0,8.05,1.0,0.0,0.0,1.0,0.0


## Scaling numerical features

### MinMaxScaler

In [22]:
X_full.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,22.0,1,0,7.25,1.0,0.0,0.0,1.0,0.0
2,1,1,38.0,1,0,71.2833,0.0,0.0,1.0,0.0,1.0
3,1,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
4,1,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
5,0,3,35.0,0,0,8.05,1.0,0.0,0.0,1.0,0.0


In [23]:
from sklearn.preprocessing import MinMaxScaler
X=X_full.copy()
mm_scaler = MinMaxScaler()
X['Age'] = mm_scaler.fit_transform(X[['Age']])
print("age max =" , X['Age'].max()," age min=" , X['Age'].min())
X.head()

age max = 1.0  age min= 0.0


Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,0.271174,1,0,7.25,1.0,0.0,0.0,1.0,0.0
2,1,1,0.472229,1,0,71.2833,0.0,0.0,1.0,0.0,1.0
3,1,3,0.321438,0,0,7.925,1.0,0.0,0.0,0.0,1.0
4,1,1,0.434531,1,0,53.1,1.0,0.0,0.0,0.0,1.0
5,0,3,0.434531,0,0,8.05,1.0,0.0,0.0,1.0,0.0


### StandardScaler

In [24]:
from sklearn.preprocessing import StandardScaler
X=X_full.copy()
ss_scaler = StandardScaler()
X['Age'] = ss_scaler.fit_transform(X[['Age']])
print("mean =" , X['Age'].mean()," std=" , X['Age'].std())
X.head()

mean = 2.5627959662152535e-16  std= 1.0005616400330466


Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,-0.592481,1,0,7.25,1.0,0.0,0.0,1.0,0.0
2,1,1,0.638789,1,0,71.2833,0.0,0.0,1.0,0.0,1.0
3,1,3,-0.284663,0,0,7.925,1.0,0.0,0.0,0.0,1.0
4,1,1,0.407926,1,0,53.1,1.0,0.0,0.0,0.0,1.0
5,0,3,0.407926,0,0,8.05,1.0,0.0,0.0,1.0,0.0


## Train Test Split

In [25]:
y = X_full['Survived']
X = X_full.drop('Survived',axis = 1)

In [29]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [30]:
print(X_train.shape , y_train.shape)
print(X_test.shape,y_test.shape)

(623, 10) (623,)
(268, 10) (268,)
