In [193]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import SGDClassifier

In [194]:
titanic = pd.read_csv('titanic-passengers.csv', sep=';')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,343,No,2,"Collander, Mr. Erik Gustaf",male,28.0,0,0,248740,13.0,,S
1,76,No,3,"Moen, Mr. Sigurd Hansen",male,25.0,0,0,348123,7.65,F G73,S
2,641,No,3,"Jensen, Mr. Hans Peder",male,20.0,0,0,350050,7.8542,,S
3,568,No,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S
4,672,No,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0,B71,S


In [195]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

horrible method when we use directly

In [210]:
#model = make_pipeline(StandardScaler(), SGDClassifier()) #StandardScaler can not work with categorical value
#model.fit(X, y)

ValueError: could not convert string to float: 'Collander, Mr. Erik Gustaf'

# Solution(make_column_transformer)

In [220]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelBinarizer

In [229]:
y = titanic['Survived']
X = titanic.drop('Survived', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [230]:
numerical_features = ['Pclass', 'Age', 'Fare']
categorical_features = ['Sex', 'Embarked']

In [231]:
#définir une pipeline pour chaque type de variable
numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())

In [232]:
#injection pipeline dans make_column_transformer
preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                   (categorical_pipeline, categorical_features))

In [233]:
model = make_pipeline(preprocessor, SGDClassifier())
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7847533632286996

# make_column_selector

In [235]:
y = titanic['Survived']
X = titanic.drop('Survived', axis=1)

#tri variable numérique & catégorique(np.number & object)
numerical_features = make_column_selector(dtype_include=np.number) #['Pclass','Age','Fare'....]
categorical_features = make_column_selector(dtype_exclude=np.number) #['Sex','Embarked'....]

numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())

preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                   (categorical_pipeline, categorical_features))
model.fit(X, y)
model.score(X, y)

0.7795275590551181

# Data cleaning

In [205]:
titanic=titanic.drop('Cabin', axis=1)

In [206]:
titanic['Age'].fillna(titanic['Age'].median(),inplace=True)

In [207]:
titanic['Embarked'].fillna(titanic['Embarked'].mode(),inplace=True)

In [208]:
titanic.dropna(axis=0,how='any',thresh=None,inplace=True)

In [209]:
titanic.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

# make_union

permet de créer des pipelines paralléles. les résultats sont concaténés à la sortie du transformer.

In [236]:
numerical_features = X[['Age', 'Fare']]

In [237]:
from sklearn.pipeline import make_union

In [238]:
pipeline = make_union(StandardScaler(), Binarizer())

In [239]:
#2 column for StanderScaler & 2 column for Binarizer; Dataset should be cleaned
pipeline.fit_transform(numerical_features).shape  

(889, 4)