In [1]:
# Implement CustomTransformer
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# load data 
train = pd.read_csv('../Kaggle/Challenges/data/titanic_train.csv') # training data
test = pd.read_csv('../Kaggle/Challenges/data/titanic_test.csv') # test data

In [3]:
X = train.drop('Survived', axis=1)
y = train['Survived']

In [4]:
X_train, X_valid, y_train, y_valid = \
    train_test_split(X, y, random_state=42, train_size=0.75)

In [5]:
class CustomTransformer(BaseEstimator, TransformerMixin):

    def __init__(self,columns=None):
        self.columns = columns

    @staticmethod
    def split_ticket(ticket: str) -> pd.Series:
        """ Split Ticket with Destination and Ticket number """
        result = ['U', np.nan] # Default values
        if ' ' in ticket: 
            # sometimes we have 2 spaces, we need split only by second space
            if ticket.count(' ') > 1: # if there are more than 1 space (PC R 17757)
                result = ticket.rsplit(' ', 1)
            else:
                result = ticket.split(' ') # split by space (PC 17757)
        if ticket.isnumeric(): # if ticket is only numbers (12345)
            result = ['U', ticket]
        if ticket.isalpha(): # if ticket is only letters (LINE)
            result = [ticket, np.nan]
            
        result[0] = result[0][0] # get first letter of Destination

        return pd.Series(result)


    def fit(self, X, y=None):
        return self
    
    def transform(self, X,  y=None):
        # Transformation logic

        # Split Ticket with Destination and Ticket number
        X[['Destination', 'TicketNumber']] = X['Ticket'].apply(self.split_ticket)

        # Transform Destination into categorical
        X['Destination'] = X['Destination'].astype('category').cat.codes

        # Transform Cabin into Boolean
        X['Cabin'] = X['Cabin'].notna()

        # Cut Family Size into groups
        family_group = ['Alone', 'Small', 'Middle', 'Big']
        X['FamilySize'] = pd.cut(X['SibSp'] + X['Parch'] + 1, # Calculate Family Size
                                         [0, 1, 4, 7, 11], # Define Family Size Groups
                                         labels=family_group) # Assign Family Size Groups

        X['FamilySize'] = X['FamilySize'].astype('category').cat.codes

        return pd.DataFrame(X[self.columns], columns=self.columns)

In [6]:

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline

cust_cols = ['FamilySize', 'Destination', 'TicketNumber', 'Cabin', 'SibSp', 'Parch']
pass_through = ['']

cust_pipeline = Pipeline([                                        # This pipeline will fill missing numerical values with most frequent value
    ('custom', CustomTransformer())
])

preprocessor = make_column_transformer(
    (cust_pipeline, cust_cols),
)

# col_transformer = ColumnTransformer(transformers=
#     [
#         ('cust_pipeline',cust_pipeline, cust_cols), # Check if columns are need ?
#         ('passthrough', 'passthrough', pass_through)
#     ],
#     remainder='drop', n_jobs=-1
# )

X_train = preprocessor.fit_transform(X_train) # TODO: ValueError: A given column is not a column of the dataframe

df = pd.DataFrame(X_train)

ValueError: A given column is not a column of the dataframe