In [1]:
# import all libraries needed for this model

import pandas as pd # for dataframe computaion
import numpy as np # vector computation
import matplotlib.pyplot as plt # for plot
import seaborn as sns # for plot
from sklearn.preprocessing import LabelEncoder, StandardScaler # for data preprocessing
from sklearn.linear_model import LogisticRegression  # for logistic regression
from sklearn.model_selection import train_test_split # for splitting and train and test datastet randomly
from sklearn.metrics import classification_report # for metrics and model evaluation
from sklearn.impute import SimpleImputer # for data preprocessing

# 1. Import all libraries and datasets

In [3]:
# Relative paths of the train and test CSV files.
url_train, url_test = "datasets/train.csv", "datasets/test.csv"

In [5]:
# Load both CSV files into DataFrames (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (url_train, url_test))

# 2. Missing Values

In [6]:
# Number of missing entries in each column of the training frame.
null_counts = train_df.isna().sum()
null_counts

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## 2.1 Find the fraction of null values per column

In [17]:
# Share of missing entries per column (missing count / row count), rounded to 2 decimals.
n_rows = train_df.shape[0]
(train_df.isna().sum() / n_rows).round(2)

PassengerId    0.00
Survived       0.00
Pclass         0.00
Name           0.00
Sex            0.00
Age            0.20
SibSp          0.00
Parch          0.00
Ticket         0.00
Fare           0.00
Cabin          0.77
Embarked       0.00
dtype: float64

## 2.2 Columns to drop

In [25]:
# Names of the columns listed for removal.
removed_col = [
    'PassengerId',
    'Cabin',
    'Name',
    'Survived',
    'Ticket',
]

## 2.3  Preselected Features

In [26]:
# Keep every column of train_df except PassengerId and Cabin, in original order.
excluded_cols = ('PassengerId', 'Cabin')
preselected_cols = list(filter(lambda name: name not in excluded_cols, train_df.columns))
preselected_cols

['Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Embarked']

## 2.4  Apply Simple Imputer

In [36]:
# SimpleImputer Embarked
# Keep only the preselected columns. The explicit .copy() makes train_df an
# independent frame, so the column assignment below does not write into a
# slice of the previously loaded frame (which triggers SettingWithCopyWarning).
train_df = train_df[preselected_cols].copy()

# Fill missing Embarked entries with the most frequent value of the column.
simple_1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
values = simple_1.fit_transform(train_df[['Embarked']].values)

# The imputer is fed a single-column 2-D array and returns one; flatten it so
# a plain 1-D sequence is assigned to the column.
train_df['Embarked'] = values.ravel()

In [37]:
# SimpleImputer Age
# Replace missing Age entries with the median of the column.
age_matrix = train_df[['Age']].values
simple_1 = SimpleImputer(strategy='median', missing_values=np.nan)
values = simple_1.fit(age_matrix).transform(age_matrix)
train_df['Age'] = values

In [38]:
# Re-count missing entries per column after the imputation steps.
remaining_nulls = train_df.isna().sum()
remaining_nulls

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

# 3. Verify data types

In [46]:
# Print the dtype and non-null count of every column in train_df.
# NOTE(review): the saved output lists Sex and Embarked as int64 although they
# are only encoded in the cells that follow (In [41] / In [42]); this cell
# (In [46]) appears to have been executed out of order. Confirm with
# Restart Kernel -> Run All.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(6), object(2)
memory usage: 69.7+ KB


# 4. Find categorical values

In [39]:
# Print the distinct values of each column treated as categorical.
col_cat = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for col, column_values in train_df[col_cat].items():
    print(col, column_values.unique())

Survived [0 1]
Pclass [3 1 2]
Sex ['male' 'female']
SibSp [1 0 3 4 2 5 8]
Parch [0 1 2 5 3 4 6]
Embarked ['S' 'C' 'Q']


In [41]:
# Encode Sex as an integer code: male -> 1, female -> 0.
map_sex = {
    'male': 1,
    'female': 0,
}
sex_encoded = train_df['Sex'].replace(map_sex)
train_df['Sex'] = sex_encoded

In [42]:
# Encode Embarked as an integer code: S -> 0, C -> 1, Q -> 2.
embarked_map = {
    'S': 0,
    'C': 1,
    'Q': 2,
}
embarked_encoded = train_df['Embarked'].replace(embarked_map)
train_df['Embarked'] = embarked_encoded

In [43]:
# Print the distinct values of the categorical columns again, after encoding.
col_cat = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
unique_by_col = {name: train_df[name].unique() for name in col_cat}
for col, uniques in unique_by_col.items():
    print(col, uniques)

Survived [0 1]
Pclass [3 1 2]
Sex [1 0]
SibSp [1 0 3 4 2 5 8]
Parch [0 1 2 5 3 4 6]
Embarked [0 1 2]


# 5. Find outliers

In [48]:
# Quartiles of the columns summarised by describe(); the summary table is
# computed once and reused for both quartiles.
summary = train_df.describe().T
Q1 = summary['25%']
Q3 = summary['75%']

# Interquartile range per column.
train_IQR = Q3 - Q1

# Outlier bounds from the 1.5 * IQR rule. These previously referenced an
# undefined name `IQR`, which raised NameError; the range computed above is
# `train_IQR`.
MIN_RANGE = Q1 - 1.5 * train_IQR
MAX_RANGE = Q3 + 1.5 * train_IQR

# Last expression of the cell: display the IQR of each column.
train_IQR

Survived     1.0000
Pclass       1.0000
Sex          1.0000
Age         13.0000
SibSp        1.0000
Parch        0.0000
Fare        23.0896
Embarked     1.0000
dtype: float64