In [None]:
# Implement CustomTransformer
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer

In [None]:
# Read the two Titanic CSV files from the Kaggle challenge data folder.
train = pd.read_csv(filepath_or_buffer='../Kaggle/Challenges/data/titanic_train.csv')  # training rows (contain 'Survived')
test = pd.read_csv(filepath_or_buffer='../Kaggle/Challenges/data/titanic_test.csv')  # test rows

In [None]:
# Separate the target from the predictors.
y = train['Survived']               # label column
X = train.drop(columns='Survived')  # every remaining column is a feature

In [None]:
# Keep 75% of the rows for training and the rest for validation;
# the fixed seed makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.75, random_state=42
)

In [None]:
class OutlierCleaner(BaseEstimator, TransformerMixin):
    """Clip every numeric column to the range [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].

    The transformer is stateless: ``fit`` learns nothing, and the bounds are
    recomputed from whatever data is handed to ``transform``.
    NOTE(review): this means training and validation data are clipped with
    different bounds - confirm this is intended.
    """

    def fit(self, X: pd.DataFrame, y=None):
        """Do nothing and return ``self`` (present for the scikit-learn API)."""
        return self

    @staticmethod
    def get_max_min(s: pd.Series) -> tuple:
        """Return the clipping bounds of ``s`` as ``(maximum, minimum)``.

        maximum = Q3 + 1.5 * IQR and minimum = Q1 - 1.5 * IQR, where Q1 and Q3
        are the 25% and 75% quantiles of ``s`` and IQR = Q3 - Q1.
        Note the order of the returned pair: the upper bound comes first.
        """
        q1 = s.quantile(0.25)
        q3 = s.quantile(0.75)
        iqr = q3 - q1
        maximum = q3 + 1.5 * iqr
        minimum = q1 - 1.5 * iqr
        return maximum, minimum

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``X`` with outliers in numeric columns clipped.

        Values outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are replaced by the
        nearest bound; non-numeric columns are returned unchanged.

        Example for one column holding 1, 2, 3, 4, 100:
            q1 = 2, q3 = 4, iqr = 2  ->  bounds are -1 and 7
            the clipped values are 1, 2, 3, 4, 7

        Parameters
        ----------
        X : DataFrame (or anything ``pd.DataFrame`` accepts) to clean.
        y : ignored, accepted for scikit-learn API compatibility.
        """
        # Work on a private copy so the caller's data is never modified.
        df = pd.DataFrame(X).copy()
        for col in df.select_dtypes(include='number').columns:
            maximum, minimum = self.get_max_min(df[col])
            df[col] = df[col].clip(lower=minimum, upper=maximum)
        return df

In [None]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Derive extra features from the 'Ticket', 'Cabin', 'SibSp' and 'Parch' columns.

    ``transform`` returns a copy of the input with:
      * 'Destination'  - integer code of the first letter of the ticket prefix
      * 'TicketNumber' - the trailing part of the ticket (string or NaN)
      * 'Cabin'        - replaced by a boolean "cabin is known" flag
      * 'FamilySize'   - integer code of the family-size group
    """

    @staticmethod
    def _ticket_parts(ticket) -> list:
        """Return ``[destination_letter, ticket_number]`` for one raw ticket."""
        if not isinstance(ticket, str):  # e.g. NaN for a missing ticket
            return ['U', np.nan]
        if ' ' in ticket:
            # Split on the LAST space only, so a prefix that itself contains
            # spaces stays in one piece (PC R 17757 -> 'PC R', '17757').
            prefix, number = ticket.rsplit(' ', 1)
        elif ticket.isnumeric():  # ticket is only numbers (12345)
            prefix, number = 'U', ticket
        elif ticket.isalpha():  # ticket is only letters (LINE)
            prefix, number = ticket, np.nan
        else:  # anything else falls back to the default values
            prefix, number = 'U', np.nan
        # Keep only the first letter; an empty prefix (ticket starting with a
        # space) falls back to 'U' instead of raising IndexError.
        return [prefix[0] if prefix else 'U', number]

    @staticmethod
    def split_ticket(ticket: str) -> pd.Series:
        """ Split Ticket with Destination and Ticket number

        Returns a two-element Series: the first letter of the ticket prefix
        ('U' when there is none) and the ticket number (NaN when there is none).
        """
        return pd.Series(FeatureTransformer._ticket_parts(ticket))

    def _split_tickets(self, X) -> pd.DataFrame:
        """Split the whole 'Ticket' column into 'Destination' / 'TicketNumber'."""
        parts = [self._ticket_parts(ticket) for ticket in X['Ticket']]
        return pd.DataFrame(parts, index=X.index,
                            columns=['Destination', 'TicketNumber'])

    def fit(self, X, y=None):
        """Learn the set of destination letters so that codes stay stable.

        Stores the sorted letters seen in ``X['Ticket']`` in
        ``destination_categories_``; ``y`` is ignored.
        """
        destinations = self._split_tickets(X)['Destination']
        self.destination_categories_ = sorted(destinations.unique())
        return self

    def transform(self, X,  y=None):
        """ Custom Feature transformation

        Returns a modified copy of ``X``; the input frame is left untouched.
        ``y`` is ignored.
        """
        X = X.copy()  # never write into the caller's frame

        # Split Ticket with Destination and Ticket number
        tickets = self._split_tickets(X)
        # Transform Destination into categorical codes. Using the categories
        # learned in fit() gives the same letter the same code on every
        # dataset; letters not seen in fit() get the code -1. Without a prior
        # fit() the categories are inferred from X itself.
        categories = getattr(self, 'destination_categories_', None)
        X['Destination'] = pd.Categorical(tickets['Destination'],
                                          categories=categories).codes
        X['TicketNumber'] = tickets['TicketNumber']

        # Transform Cabin into Boolean
        X['Cabin'] = X['Cabin'].notna()

        # Cut Family Size into groups: (0, 1], (1, 4], (4, 7], (7, 11]
        family_group = ['Alone', 'Small', 'Middle', 'Big']
        family_size = X['SibSp'] + X['Parch'] + 1  # passenger counts as well
        # pd.cut already yields a categorical with all four groups, so the
        # codes are 0..3 (and -1 for a size outside the bins or missing).
        X['FamilySize'] = pd.cut(family_size,
                                 [0, 1, 4, 7, 11],
                                 labels=family_group).cat.codes

        return X


In [None]:

norm_columns = ['Age', 'Fare']

# Each entry is (name, transformer, columns). Every transformer receives only
# its own columns of the ORIGINAL input, and the results are concatenated side
# by side in the order listed, so 'Age' and 'Fare' appear twice in the output
# (once normalized, once clipped).
# NOTE(review): Normalizer rescales each ROW to unit norm, not each column -
# confirm that this is what is wanted for ['Age', 'Fare'].
col_transformer = ColumnTransformer(
    transformers=[
        ('normalizer', Normalizer(), norm_columns),
        ('outlier_cleaner', OutlierCleaner(), ['Age', 'Fare']),
        # 'passthrough' forwards the listed columns unchanged.
        ('pass', 'passthrough', ['Pclass', 'SibSp']),
    ],
    remainder='drop',  # columns not listed above are discarded
    n_jobs=-1,         # run the transformers in parallel on all cores
)

In [None]:
# FeatureTransformer looks up 'Ticket', 'Cabin', 'SibSp' and 'Parch' by label,
# so it must run BEFORE col_transformer: the ColumnTransformer keeps only the
# columns it was given and returns a NumPy array by default.
pipeline = make_pipeline(FeatureTransformer(),
                         col_transformer,
                         KNeighborsClassifier()
                         )

# The last step is a classifier, which has no transform(); fit_transform() is
# therefore only meaningful for the preprocessing steps (everything but the
# last one). The result goes into a new name so that X_train stays the raw
# frame and this cell can be re-run.
# NOTE(review): 'Age' presumably contains missing values in this dataset and
# Normalizer rejects NaN - an imputation step is probably needed; verify.
X_train_prepared = pipeline[:-1].fit_transform(X_train, y_train)

df = pd.DataFrame(X_train_prepared)

In [None]:
class FeatureScaler(BaseEstimator, TransformerMixin):
    """One-hot encode a frame with ``pd.get_dummies`` and min-max scale it.

    Attributes set by ``fit``:
      columns - the column index of the one-hot encoded training frame
      scaler  - the fitted ``MinMaxScaler``
    """

    def fit(self, df, *args, **kwargs):
        """Record the encoded column layout of ``df`` and fit the scaler on it.

        Extra positional/keyword arguments (such as ``y``) are accepted and
        ignored. Returns ``self``.
        """
        df = pd.get_dummies(df)
        self.columns = df.columns
        self.scaler = MinMaxScaler()
        self.scaler.fit(df)
        return self

    def transform(self, df):
        """Encode and scale ``df`` using the layout learned in ``fit``.

        Dummy columns missing from ``df`` are added filled with 0 and columns
        unknown to ``fit`` are dropped, so the result always has exactly
        ``self.columns``. The row index of ``df`` is preserved.
        """
        df = pd.get_dummies(df)
        df = df.reindex(columns=self.columns, fill_value=0)
        df_scaled = self.scaler.transform(df)
        # Keep the caller's row labels so the result still aligns with y.
        return pd.DataFrame(df_scaled, columns=self.columns, index=df.index)

    def inverse_transform(self, df):
        """Undo the scaling; the result is still one-hot encoded.

        ``df`` may be a DataFrame (its index is preserved) or a plain array
        (the result then gets a default integer index).
        """
        df_scaled = self.scaler.inverse_transform(df)
        return pd.DataFrame(df_scaled, columns=self.columns,
                            index=getattr(df, 'index', None))
    