In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [3]:
class DropHighLowNa(TransformerMixin, BaseEstimator):
    """Drop mostly-empty columns and rows that are missing rarely-missing columns.

    During ``fit`` the fraction of missing values is computed per column:

    * columns whose missing ratio is strictly greater than ``high_thresh`` are
      recorded in ``cols_to_drop`` and removed by ``transform``;
    * columns whose missing ratio is strictly less than ``low_thresh``
      (including columns with no missing values at all) are recorded in
      ``rows_to_drop``; ``transform`` removes every row that has a missing
      value in any of those columns.

    Parameters
    ----------
    high_thresh : float, default=0.85
        Columns with a missing ratio above this value are dropped.
    low_thresh : float, default=0.01
        Columns with a missing ratio below this value trigger row removal.

    Notes
    -----
    ``transform`` can return fewer rows than it receives and only ever sees
    ``X``; a target vector is not realigned here, so callers have to keep
    ``y`` in sync themselves (for example via the returned index).
    """

    def __init__(self, high_thresh=0.85, low_thresh=0.01):
        self.high_thresh = high_thresh
        self.low_thresh = low_thresh
        # Populated by fit(); initialised here so the attributes always exist.
        self.cols_to_drop = []
        self.rows_to_drop = []

    def fit(self, X, y=None):
        """Record which columns to drop and which columns drive row removal.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose per-column missing ratios are measured.
        y : ignored

        Returns
        -------
        self
        """
        missing_ratio = X.isna().mean()
        self.cols_to_drop = missing_ratio[missing_ratio > self.high_thresh].index.to_list()
        self.rows_to_drop = missing_ratio[missing_ratio < self.low_thresh].index.to_list()
        return self

    def transform(self, X):
        """Return a new frame with the recorded rows and columns removed.

        Parameters
        ----------
        X : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            A new object; the input frame is not modified.
        """
        # DataFrame.dropna has no ``errors`` keyword (passing one raises
        # TypeError), so tolerance for columns that are absent from X is
        # implemented by filtering the subset explicitly.
        subset = [col for col in self.rows_to_drop if col in X.columns]
        if subset:
            X = X.dropna(subset=subset)
        # drop() returns a new frame, so the caller's object is never mutated.
        return X.drop(columns=self.cols_to_drop)

class ImputeCatCols(TransformerMixin, BaseEstimator):
    """Impute missing values in categorical columns.

    * ``ord_cols``: missing values are replaced with the literal string ``'NA'``.
    * ``not_ord_cat_cols``: missing values are replaced with the most frequent
      value of the same column among rows sharing the same ``neighbor`` value.
      When no mode is available (the group has only missing values, or the
      row's ``neighbor`` value is itself missing) ``'NA'`` is used instead.

    Parameters
    ----------
    neighbor : label or list of labels
        Column(s) passed to ``DataFrame.groupby`` to form the groups.
    ord_cols : list of labels, optional
        Columns filled with the constant ``'NA'``.
    not_ord_cat_cols : list of labels, optional
        Columns filled with the per-group mode.

    Notes
    -----
    ``fit`` learns nothing: the group modes are computed from whatever frame
    is passed to ``transform``.
    """

    def __init__(self, neighbor, ord_cols=None, not_ord_cat_cols=None):
        self.neighbor = neighbor
        self.ord_cols = ord_cols
        self.not_ord_cat_cols = not_ord_cat_cols

    def fit(self, X, y=None):
        """No-op; present for scikit-learn API compatibility. Returns ``self``."""
        return self

    @staticmethod
    def _fill_with_mode(values):
        """Fill missing entries of one group with its mode, or 'NA' if it has none."""
        modes = values.mode()  # computed once per group
        return values.fillna(modes.iloc[0] if not modes.empty else 'NA')

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns imputed.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``neighbor`` and every configured column.

        Returns
        -------
        pandas.DataFrame
        """
        X = X.copy()
        if self.ord_cols:
            X[self.ord_cols] = X[self.ord_cols].fillna('NA')
        if self.not_ord_cat_cols:
            group_filled = X.groupby(self.neighbor)[self.not_ord_cat_cols].transform(
                self._fill_with_mode
            )
            # Rows whose neighbor key is null belong to no group, so the
            # groupby result holds NaN for them. Assigning that result back
            # directly would erase the values those rows already have; fill
            # only the missing cells and fall back to 'NA' where no group
            # mode exists.
            X[self.not_ord_cat_cols] = (
                X[self.not_ord_cat_cols].fillna(group_filled).fillna('NA')
            )
        return X

class CatEncoding(TransformerMixin, BaseEstimator):
    """Encode categorical columns with ordinal and frequency encoding.

    * ``ord_cols`` are encoded with a scikit-learn ``OrdinalEncoder`` fitted on
      the training frame.
    * ``freq_enc_cols`` are replaced by the relative frequency each value had
      in the training frame; values not seen during ``fit`` (and missing
      values) become ``0``.

    Parameters
    ----------
    ord_cols : list of labels, optional
        Columns to ordinal-encode. ``None`` or empty skips this encoding.
    freq_enc_cols : list of labels, optional
        Columns to frequency-encode. ``None`` or empty skips this encoding.
    """

    def __init__(self, ord_cols=None, freq_enc_cols=None):
        self.ord_cols = ord_cols
        self.freq_enc_cols = freq_enc_cols
        # Fitted state; (re)built by fit().
        self.freq_map = {}
        self.ord_enc = OrdinalEncoder()

    def fit(self, X, y=None):
        """Fit the ordinal encoder and learn per-column value frequencies.

        Parameters
        ----------
        X : pandas.DataFrame
        y : ignored

        Returns
        -------
        self
        """
        # Mirror the guards used in transform(): both column lists default to
        # None and must be skippable without raising.
        if self.ord_cols:
            self.ord_enc.fit(X[self.ord_cols])
        # Rebuilt from scratch so refitting never keeps frequencies from a
        # previous fit.
        self.freq_map = {
            col: X[col].value_counts(normalize=True).to_dict()
            for col in (self.freq_enc_cols or [])
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns encoded.

        Parameters
        ----------
        X : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
        """
        X = X.copy()
        if self.ord_cols:
            X[self.ord_cols] = self.ord_enc.transform(X[self.ord_cols])

        if self.freq_enc_cols:
            for col in self.freq_enc_cols:
                # Values absent from the learned map come back as NaN from
                # map() and are encoded as frequency 0.
                X[col] = X[col].map(self.freq_map[col]).fillna(0)
        return X
        

In [None]:
# Numeric branch: impute missing values with IterativeImputer, then standardise.
num_transformer_logreg = Pipeline(steps=[
    ('imputer', IterativeImputer()),
    ('scaler', StandardScaler())
])

# Categorical branch: custom imputation followed by ordinal / frequency encoding.
# FIXME(review): both steps are still constructed without arguments.
# ImputeCatCols declares `neighbor` as a required parameter, so
# `ImputeCatCols()` raises TypeError, and `CatEncoding()` leaves both of its
# column lists as None.
cat_transformer_logreg = Pipeline(steps=[
    ('imputer', ImputeCatCols()), # TODO: pass the neighbour column, the ordinal column list and the non-ordinal column list
    ('encoder', CatEncoding()) # TODO: pass the ordinal-encoding columns and the frequency-encoding columns
])

# Route column groups to their branch.
# `numerical_cols` and `categorical_cols` are not defined in the cells shown
# here; they must exist before this cell runs.
# FIXME(review): the 'clean_nans' entry has two elements, whereas the other
# entries follow the (name, transformer, columns) form -- confirm against the
# ColumnTransformer documentation; this presumably fails when fitted.
# NOTE(review): DropHighLowNa.transform can remove rows; the other branches do
# not, so the branch outputs would presumably not line up row-for-row --
# consider applying it to the data before this pipeline instead.
preprocessor_logreg = ColumnTransformer(
    transformers=[
        ('clean_nans', DropHighLowNa(high_thresh=0.85, low_thresh=0.01)),
        ('num', num_transformer_logreg, numerical_cols), 
        ('cat', cat_transformer_logreg, categorical_cols)
])

# Classifier: logistic regression, lbfgs solver, up to 1000 iterations.
logreg_model = LogisticRegression(max_iter=1000, solver='lbfgs')

# End-to-end estimator: preprocessing followed by the classifier.
Pipeline_logreg = Pipeline(steps=[
        ('preprocessor', preprocessor_logreg),
        ('model', logreg_model)
])