In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Create a few ColumnTransformers and train them on the Titanic data. Here are a few ideas to work on:

- Impute the missing values in the Age column.
- One-hot-encode the Embarked column.
- Bin the imputed Age column into 3 bins (young, middle-aged, old).
- Scale the Fare column.


In [61]:
# Load the Titanic training set, using PassengerId as the row index.
df = pd.read_csv(
    'train.csv',
    index_col='PassengerId',
)
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [62]:
# Count the missing entries in every column.
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [63]:
# Distinct values of the Embarked column (NaN is reported as a value too).
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [64]:
# Predictor columns fed to the model, and the Survived label to predict.
X = df.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
y = df.loc[:, 'Survived']

In [65]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [66]:
# Sanity-check the split: features and targets must stay aligned row-for-row
# within each partition. Uses the variables actually produced by
# train_test_split (the previously commented-out names were never defined).
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [67]:
# One-hot encode the Sex column.
# - drop="if_binary": a feature with exactly two categories is encoded as a
#   single indicator column instead of two redundant ones.
# - handle_unknown="ignore": categories not seen during fit are encoded as
#   all zeros instead of raising at transform time.
# - sparse_output=False: return a dense array. This replaces the `sparse`
#   keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4
#   (requires scikit-learn >= 1.2).
pipeline_sex = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="if_binary")),
])

In [68]:
# Age preprocessing: fill missing ages with the most frequent value, then cut
# the range into ten equal-width bins, each emitted as a dense indicator column.
# NOTE(review): the exercise notes at the top of the notebook suggest 3 age
# bins; 10 are used here - confirm this is intentional.
pipeline_age = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy='most_frequent')),
        (
            "binning",
            KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='uniform'),
        ),
    ]
)

In [69]:
# Embarked preprocessing: fill the missing entries with the most frequent
# value, then one-hot encode.
# - handle_unknown="ignore": categories not seen during fit are encoded as
#   all zeros instead of raising at transform time.
# - sparse_output=False: return a dense array. This replaces the `sparse`
#   keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4
#   (requires scikit-learn >= 1.2).
pipeline_embarked = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

In [70]:
# Fare preprocessing: four equal-width bins, one dense indicator column per bin.
# NOTE(review): the exercise notes at the top of the notebook suggest scaling
# Fare; it is binned here instead - confirm this is intentional.
pipeline_fare = Pipeline(
    steps=[
        ("binning", KBinsDiscretizer(n_bins=4, encode='onehot-dense', strategy='uniform')),
    ]
)

In [71]:
# One-hot encode Pclass so each class value gets its own indicator column.
# - handle_unknown="ignore": categories not seen during fit are encoded as
#   all zeros instead of raising at transform time.
# - sparse_output=False: return a dense array. This replaces the `sparse`
#   keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4
#   (requires scikit-learn >= 1.2).
pipeline_class = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

In [72]:
# Route each raw column through its own preprocessing pipeline; the resulting
# feature blocks are concatenated in the order listed below.
transformer = ColumnTransformer(
    transformers=[
        ("sex", pipeline_sex, ['Sex']),
        ('fare', pipeline_fare, ['Fare']),
        ("age", pipeline_age, ['Age']),
        ("embarked", pipeline_embarked, ['Embarked']),
        ("class", pipeline_class, ['Pclass']),
    ],
)

In [73]:
# Fit the preprocessing steps on the training split only, and encode it in
# the same pass.
train_FE = transformer.fit_transform(X=X_train)

In [74]:
# Encode the held-out split with the already-fitted transformer (no refit).
test_FE = transformer.transform(X=X_test)

In [75]:
# Logistic-regression classifier, constructed with no arguments (library defaults).
clf = LogisticRegression()

In [76]:
# Train the classifier on the encoded training features and their labels.
clf.fit(X=train_FE, y=y_train)

LogisticRegression()

In [77]:
# Predicted class label for every row of the held-out split.
clf.predict(X=test_FE)

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0])

In [78]:
# Per-class probability estimates for every row of the held-out split
# (one column per class).
clf.predict_proba(X=test_FE)

array([[0.87389855, 0.12610145],
       [0.69399798, 0.30600202],
       [0.93300529, 0.06699471],
       [0.05748302, 0.94251698],
       [0.31972309, 0.68027691],
       [0.09029306, 0.90970694],
       [0.38950391, 0.61049609],
       [0.9228362 , 0.0771638 ],
       [0.35396084, 0.64603916],
       [0.15240105, 0.84759895],
       [0.65858614, 0.34141386],
       [0.93782812, 0.06217188],
       [0.52376974, 0.47623026],
       [0.79277064, 0.20722936],
       [0.77683086, 0.22316914],
       [0.13375386, 0.86624614],
       [0.71148786, 0.28851214],
       [0.38950391, 0.61049609],
       [0.69399798, 0.30600202],
       [0.71148786, 0.28851214],
       [0.93300529, 0.06699471],
       [0.65858614, 0.34141386],
       [0.52376974, 0.47623026],
       [0.93300529, 0.06699471],
       [0.93300529, 0.06699471],
       [0.9228362 , 0.0771638 ],
       [0.48977152, 0.51022848],
       [0.69399798, 0.30600202],
       [0.72358849, 0.27641151],
       [0.37777121, 0.62222879],
       [0.

In [79]:
# Mean accuracy on the data the model was trained on.
clf.score(X=train_FE, y=y_train)

0.8203592814371258

In [80]:
# Mean accuracy on the held-out split.
clf.score(X=test_FE, y=y_test)

0.8071748878923767

clf.score(train_FE, y_train)

0.8203592814371258

clf.score(test_FE, y_test)

0.8071748878923767
