In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import ColumnTransformer

In [2]:
# Load the Titanic train/test CSVs; index_col=0 turns the first CSV column
# (PassengerId, per the rendered table) into the row index.
train = pd.read_csv('data/titanic/train.csv', index_col=0)
test = pd.read_csv('data/titanic/test.csv', index_col=0)
# Bare last expression: render the training frame.
train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Summarize every column of a frame: missing-value count/percentage, plus unique values and unusual characters for object (categorical) columns
def check_overview(df):
    """Build a per-column overview of ``df``.

    For each column the result holds one row with:
      * ``Name``      - the column label,
      * ``Type``      - 'Object' when the column dtype compares equal to
                        'object', otherwise 'Numeric',
      * ``Total_na``  - number of null entries,
      * ``Percent``   - null entries as a percentage of the column length,
      * ``Unique``    - the column's unique values (object columns only,
                        '' otherwise),
      * ``Weird_obj`` - concatenated ``str(list)`` of the characters found in
                        each value that are not letters, digits, '.',
                        whitespace, '/' or ',' (object columns only).

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    unusual_chars = re.compile(r'[^A-Za-z\d\.\s\/\,]')
    rows = []

    for col in df.columns:
        series = df[col]

        if df.dtypes[col] == 'object':
            kind = 'Object'
            uniques = series.unique()
            # One findall result per cell; only non-empty results are kept.
            per_cell = (unusual_chars.findall(str(cell)) for cell in series)
            weird = ''.join(str(found) for found in per_cell if found)
        else:
            kind, uniques, weird = 'Numeric', '', ''

        n_missing = series.isnull().sum()
        pct_missing = n_missing / len(series) * 100
        rows.append([col, kind, n_missing, pct_missing, uniques, weird])

    return pd.DataFrame(
        rows,
        columns=['Name', 'Type', 'Total_na', 'Percent', 'Unique', 'Weird_obj'],
    )

# Run the overview on the test split only (the training split is not summarized here).
check = check_overview(test)
# Bare last expression: render the overview table.
check

Unnamed: 0,Name,Type,Total_na,Percent,Unique,Weird_obj
0,Pclass,Numeric,0,0.0,,
1,Name,Object,0,0.0,"[Kelly, Mr. James, Wilkes, Mrs. James (Ellen N...","['(', ')']['(', ')']['(', ')']['(', ')']['(', ..."
2,Sex,Object,0,0.0,"[male, female]",
3,Age,Numeric,86,20.574163,,
4,SibSp,Numeric,0,0.0,,
5,Parch,Numeric,0,0.0,,
6,Ticket,Object,0,0.0,"[330911, 363272, 240276, 315154, 3101298, 7538...",
7,Fare,Numeric,1,0.239234,,
8,Cabin,Object,327,78.229665,"[nan, B45, E31, B57 B59 B63 B66, B36, A21, C78...",
9,Embarked,Object,0,0.0,"[Q, S, C]",


In [5]:
def same_index_ticket_cabin(group, x):
    """Return the index label of the first row of ``group`` whose 'value' equals ``x``.

    ``group`` is a DataFrame with a 'value' column. When no row matches
    (including when ``x`` is NaN, which never compares equal), -1 is returned.
    """
    hits = group[group['value'] == x]
    if len(hits) == 0:
        return -1
    return hits.index[0]

def same_value_ticket_cabin(group, x):
    """Return the 'num' entry of the first row of ``group`` whose 'value' equals ``x``.

    ``group`` is a DataFrame with 'value' and 'num' columns. When no row
    matches, 0 is returned.
    """
    is_match = group['value'] == x
    if not is_match.any():
        return 0
    return group.loc[is_match, 'num'].values[0]

def _repeated_value_counts(values):
    """Return a frame of the values occurring more than once in ``values``.

    The result has two columns: 'value' (the repeated value) and 'num' (how
    many times it occurs), in ``value_counts`` order, with a fresh 0..n-1 index.

    The columns are named explicitly via ``rename_axis`` / ``reset_index(name=...)``
    rather than by renaming 'index' and the series name: the labels produced by
    ``value_counts().reset_index()`` differ between pandas versions, and the
    lookup helpers require columns called exactly 'value' and 'num'.
    """
    counts = values.value_counts()
    return counts[counts > 1].rename_axis('value').reset_index(name='num')


# Cabin strings from both splits, and the cabins shared by more than one passenger.
cabin_concat = pd.concat([train['Cabin'], test['Cabin']])
cabin_g = _repeated_value_counts(cabin_concat)

# Ticket strings from both splits, and the tickets shared by more than one passenger.
ticket_concat = pd.concat([train['Ticket'], test['Ticket']])
ticket_g = _repeated_value_counts(ticket_concat)

In [6]:
# Regex with one capture group: the text between a comma plus one whitespace
# character and the next '.', e.g. "Mr" in "Braund, Mr. Owen Harris".
name_path = r',\s([^.]*)\.'
# Regex with one capture group matching a single word character.
cabin_path = r'(\w)'

def Attributes_Add(X, name_path, cabin_path, ticket_g, cabin_g):
    """Add engineered feature columns to ``X`` and return it.

    NOTE: the columns are assigned on the frame that is passed in, so ``X`` is
    modified in place and the same object is returned.

    Parameters
    ----------
    X : DataFrame with 'Name', 'Cabin', 'Ticket', 'Sex' and 'Embarked' columns.
    name_path : regex with one capture group, extracted from 'Name' into
        'Name_title'.
    cabin_path : regex with one capture group, extracted from 'Cabin' (missing
        values replaced by 'U' first) into 'Cabin_class'.
    ticket_g, cabin_g : lookup frames passed to ``same_index_ticket_cabin`` and
        ``same_value_ticket_cabin`` for the 'Ticket' and 'Cabin' columns.

    Columns written: 'Name_title', 'Cabin_class', 'Same_ticket',
    'Num_same_ticket', 'Ticket_alone', 'Same_cabin', 'Num_same_cabin',
    'Cabin_alone', 'Sex_M', and 'Embarked' (nulls replaced by 'U').
    """
    def is_zero_flag(count):
        # 1 when the looked-up count is 0, else 0.
        return 1 if count == 0 else 0

    X['Name_title'] = X['Name'].str.extract(name_path)
    cabin_filled = X['Cabin'].fillna('U')
    X['Cabin_class'] = cabin_filled.str.extract(cabin_path)

    # (source column, lookup frame, index column, count column, "alone" flag column)
    shared_specs = (
        ('Ticket', ticket_g, 'Same_ticket', 'Num_same_ticket', 'Ticket_alone'),
        ('Cabin', cabin_g, 'Same_cabin', 'Num_same_cabin', 'Cabin_alone'),
    )
    for source, lookup, same_col, num_col, alone_col in shared_specs:
        X[same_col] = X[source].map(lambda v: same_index_ticket_cabin(lookup, v))
        X[num_col] = X[source].map(lambda v: same_value_ticket_cabin(lookup, v))
        X[alone_col] = X[num_col].map(is_zero_flag)

    X['Sex_M'] = X['Sex'].map(lambda sex: 1 if sex == 'male' else 0)
    X['Embarked'] = X['Embarked'].fillna('U')

    return X

# Build the engineered frames, then drop the raw columns the features were derived from.
# Attributes_Add assigns its columns on the frame it receives, so `train` and `test`
# also gain the new columns; only the *_new frames lose the dropped ones.
train_new = Attributes_Add(train, name_path, cabin_path, ticket_g, cabin_g).drop(columns=['Name', 'Ticket', 'Cabin', 'Sex'])
test_new = Attributes_Add(test, name_path, cabin_path, ticket_g, cabin_g).drop(columns=['Name', 'Ticket', 'Cabin', 'Sex'])
train_new.info()
# Author's note: the test frame was checked as well and looks OK.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Survived         891 non-null    int64  
 1   Pclass           891 non-null    int64  
 2   Age              714 non-null    float64
 3   SibSp            891 non-null    int64  
 4   Parch            891 non-null    int64  
 5   Fare             891 non-null    float64
 6   Embarked         891 non-null    object 
 7   Name_title       891 non-null    object 
 8   Cabin_class      891 non-null    object 
 9   Same_ticket      891 non-null    int64  
 10  Num_same_ticket  891 non-null    int64  
 11  Ticket_alone     891 non-null    int64  
 12  Same_cabin       891 non-null    int64  
 13  Num_same_cabin   891 non-null    int64  
 14  Cabin_alone      891 non-null    int64  
 15  Sex_M            891 non-null    int64  
dtypes: float64(2), int64(11), object(3)
memory usage: 118.3+ KB


In [7]:
# Column / non-null / dtype summary of the engineered test frame.
test_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Pclass           418 non-null    int64  
 1   Age              332 non-null    float64
 2   SibSp            418 non-null    int64  
 3   Parch            418 non-null    int64  
 4   Fare             417 non-null    float64
 5   Embarked         418 non-null    object 
 6   Name_title       418 non-null    object 
 7   Cabin_class      418 non-null    object 
 8   Same_ticket      418 non-null    int64  
 9   Num_same_ticket  418 non-null    int64  
 10  Ticket_alone     418 non-null    int64  
 11  Same_cabin       418 non-null    int64  
 12  Num_same_cabin   418 non-null    int64  
 13  Cabin_alone      418 non-null    int64  
 14  Sex_M            418 non-null    int64  
dtypes: float64(2), int64(10), object(3)
memory usage: 52.2+ KB


In [8]:
cate_col = ['Embarked', 'Name_title']

# The encoder is fitted on the train and test rows stacked together, so the
# categories of both splits are known to it before either one is transformed.
all_categories = pd.concat([train_new[cate_col], test_new[cate_col]], axis=0)
ordinal_encoder = OrdinalEncoder().fit(all_categories)

# Overwrite the categorical columns of both frames with their ordinal codes.
for frame in (train_new, test_new):
    frame[cate_col] = ordinal_encoder.transform(frame[cate_col])

In [9]:
# Render the training frame after ordinal encoding of 'Embarked' and 'Name_title'.
train_new

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Name_title,Cabin_class,Same_ticket,Num_same_ticket,Ticket_alone,Same_cabin,Num_same_cabin,Cabin_alone,Sex_M
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,22.0,1,0,7.2500,2.0,12.0,U,-1,0,1,-1,0,1,1
2,1,1,38.0,1,0,71.2833,0.0,13.0,C,168,2,0,77,2,0,0
3,1,3,26.0,0,0,7.9250,2.0,9.0,U,-1,0,1,-1,0,1,0
4,1,1,35.0,1,0,53.1000,2.0,13.0,C,197,2,0,56,2,0,0
5,0,3,35.0,0,0,8.0500,2.0,12.0,U,-1,0,1,-1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,27.0,0,0,13.0000,2.0,15.0,U,-1,0,1,-1,0,1,1
888,1,1,19.0,0,0,30.0000,2.0,9.0,B,-1,0,1,-1,0,1,0
889,0,3,,1,2,23.4500,2.0,9.0,U,31,4,0,-1,0,1,0
890,1,1,26.0,0,0,30.0000,0.0,12.0,C,-1,0,1,-1,0,1,1


In [10]:
scale_col = ['Age', 'Fare']

# The scaler is fitted on the training rows only; the test rows are
# transformed with that same fitted scaler.
std_scaler = StandardScaler().fit(train_new[scale_col])

# Overwrite the scaled columns in both frames.
for frame in (train_new, test_new):
    frame[scale_col] = std_scaler.transform(frame[scale_col])

train_new

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Name_title,Cabin_class,Same_ticket,Num_same_ticket,Ticket_alone,Same_cabin,Num_same_cabin,Cabin_alone,Sex_M
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,-0.530377,1,0,-0.502445,2.0,12.0,U,-1,0,1,-1,0,1,1
2,1,1,0.571831,1,0,0.786845,0.0,13.0,C,168,2,0,77,2,0,0
3,1,3,-0.254825,0,0,-0.488854,2.0,9.0,U,-1,0,1,-1,0,1,0
4,1,1,0.365167,1,0,0.420730,2.0,13.0,C,197,2,0,56,2,0,0
5,0,3,0.365167,0,0,-0.486337,2.0,12.0,U,-1,0,1,-1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,-0.185937,0,0,-0.386671,2.0,15.0,U,-1,0,1,-1,0,1,1
888,1,1,-0.737041,0,0,-0.044381,2.0,9.0,B,-1,0,1,-1,0,1,0
889,0,3,,1,2,-0.176263,2.0,9.0,U,31,4,0,-1,0,1,0
890,1,1,-0.254825,0,0,-0.044381,0.0,12.0,C,-1,0,1,-1,0,1,1


In [11]:
imputation_col = ['Pclass', 'Sex_M', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Name_title', 'Same_ticket']

# The imputer is fitted on the train and test rows stacked together.
imputer_input = pd.concat([train_new[imputation_col], test_new[imputation_col]], axis=0)
knn_imputer = KNNImputer(n_neighbors=3).fit(imputer_input)

# Missing at this point: 'Age' (both frames) and 'Fare' (test frame).
# 'Embarked' has no nulls left because Attributes_Add already filled them with 'U'.
# Every column listed in imputation_col is written back from the imputer's output.
for frame in (train_new, test_new):
    frame[imputation_col] = knn_imputer.transform(frame[imputation_col])

train_new

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Name_title,Cabin_class,Same_ticket,Num_same_ticket,Ticket_alone,Same_cabin,Num_same_cabin,Cabin_alone,Sex_M
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3.0,-0.530377,1.0,0.0,-0.502445,2.0,12.0,U,-1.0,0,1,-1,0,1,1.0
2,1,1.0,0.571831,1.0,0.0,0.786845,0.0,13.0,C,168.0,2,0,77,2,0,0.0
3,1,3.0,-0.254825,0.0,0.0,-0.488854,2.0,9.0,U,-1.0,0,1,-1,0,1,0.0
4,1,1.0,0.365167,1.0,0.0,0.420730,2.0,13.0,C,197.0,2,0,56,2,0,0.0
5,0,3.0,0.365167,0.0,0.0,-0.486337,2.0,12.0,U,-1.0,0,1,-1,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2.0,-0.185937,0.0,0.0,-0.386671,2.0,15.0,U,-1.0,0,1,-1,0,1,1.0
888,1,1.0,-0.737041,0.0,0.0,-0.044381,2.0,9.0,B,-1.0,0,1,-1,0,1,0.0
889,0,3.0,-1.697569,1.0,2.0,-0.176263,2.0,9.0,U,31.0,4,0,-1,0,1,0.0
890,1,1.0,-0.254825,0.0,0.0,-0.044381,0.0,12.0,C,-1.0,0,1,-1,0,1,1.0


In [12]:
# Check non-null counts and dtypes of the imputed columns in the training frame.
train_new[imputation_col].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       891 non-null    float64
 1   Sex_M        891 non-null    float64
 2   Age          891 non-null    float64
 3   SibSp        891 non-null    float64
 4   Parch        891 non-null    float64
 5   Fare         891 non-null    float64
 6   Embarked     891 non-null    float64
 7   Name_title   891 non-null    float64
 8   Same_ticket  891 non-null    float64
dtypes: float64(9)
memory usage: 69.6 KB


In [14]:
# Column labels of `train` without the target and the raw text columns.
# NOTE(review): this reads `train`, not `train_new`. Per the rendered output the
# list keeps the raw 'Sex' column next to 'Sex_M' and includes the engineered
# columns (present on `train` because Attributes_Add assigns them in place) -
# confirm this is the intended feature list.
feature_selected = train.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin']).columns
feature_selected

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
       'Name_title', 'Cabin_class', 'Same_ticket', 'Num_same_ticket',
       'Ticket_alone', 'Same_cabin', 'Num_same_cabin', 'Cabin_alone', 'Sex_M'],
      dtype='object')

In [None]:
# NOTE(review): unexecuted cell. `pi` is not defined or imported in the visible
# cells, so running it would raise NameError - looks like an unfinished cell;
# complete it or remove it so the notebook survives Restart & Run All.
pi