In [53]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import RobustScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.stats import skew, skewtest
from sklearn.preprocessing import PowerTransformer
from scipy.stats.mstats import winsorize
from sklearn.metrics import fbeta_score, make_scorer     
# add F2 metric
f2_score = make_scorer(fbeta_score, beta=2)

In [55]:
# Load dataset
ml_df = pd.read_excel('../data/E Commerce Dataset.xlsx', sheet_name=1)

In [56]:
# create function to inspect df
def inspect_dataframe(df):
    summary = {
        'ColumnName': df.columns.values.tolist(),
        'Nrow': df.shape[0],
        'DataType': df.dtypes.values.tolist(),
        'NAPct': (df.isna().mean() * 100).round(2).tolist(),
        'DuplicatePct': (df.duplicated().sum()/len(df)*100).round(2),
        'UniqueValue': df.nunique().tolist(),
        'Sample': [df[col].unique() for col in df.columns]
    }
    return pd.DataFrame(summary)

inspect_dataframe(ml_df)

Unnamed: 0,ColumnName,Nrow,DataType,NAPct,DuplicatePct,UniqueValue,Sample
0,CustomerID,5630,int64,0.0,0.0,5630,"[50001, 50002, 50003, 50004, 50005, 50006, 500..."
1,Churn,5630,int64,0.0,0.0,2,"[1, 0]"
2,Tenure,5630,float64,4.69,0.0,36,"[4.0, nan, 0.0, 13.0, 11.0, 9.0, 19.0, 20.0, 1..."
3,PreferredLoginDevice,5630,object,0.0,0.0,3,"[Mobile Phone, Phone, Computer]"
4,CityTier,5630,int64,0.0,0.0,3,"[3, 1, 2]"
5,WarehouseToHome,5630,float64,4.46,0.0,34,"[6.0, 8.0, 30.0, 15.0, 12.0, 22.0, 11.0, 9.0, ..."
6,PreferredPaymentMode,5630,object,0.0,0.0,7,"[Debit Card, UPI, CC, Cash on Delivery, E wall..."
7,Gender,5630,object,0.0,0.0,2,"[Female, Male]"
8,HourSpendOnApp,5630,float64,4.53,0.0,6,"[3.0, 2.0, nan, 1.0, 0.0, 4.0, 5.0]"
9,NumberOfDeviceRegistered,5630,int64,0.0,0.0,6,"[3, 4, 5, 2, 1, 6]"


In [57]:
# regularize labels
print('Before regularize labels')
print(ml_df['PreferredPaymentMode'].unique())
ml_df['PreferredPaymentMode'].replace({'CC': 'Credit Card', 'COD': 'Cash on Delivery'}, inplace=True)
print('\nAfter regularize labels')
print(ml_df['PreferredPaymentMode'].unique())

Before regularize labels
['Debit Card' 'UPI' 'CC' 'Cash on Delivery' 'E wallet' 'COD' 'Credit Card']

After regularize labels
['Debit Card' 'UPI' 'Credit Card' 'Cash on Delivery' 'E wallet']


In [58]:
# Check right/positively skewed columns
right_skewed_columns = {}
for col in ml_df.select_dtypes(include=['number']).columns:
    skewness = skew(ml_df[col].dropna())
    if skewness > 0.5:  # Adjust threshold as needed
        right_skewed_columns[col] = skewness

# Specify columns with missing values
cols_with_missingvalues = ml_df.columns[ml_df.isna().any()]

# Specify numerical columns
cols_numeric = ml_df.select_dtypes(include='number').columns

# Specify columns with right skew
cols_right_skewed = list(right_skewed_columns.keys())

# Specify categorical columns
cols_categorical = ml_df.select_dtypes(include='object').columns

# Specify columns with outliers
cols_with_outlier = ['Tenure', 'WarehouseToHome', 'HourSpendOnApp', 
                     'NumberOfDeviceRegistered', 'NumberOfAddress', 
                     'OrderAmountHikeFromlastYear', 'CouponUsed', 
                     'OrderCount', 'DaySinceLastOrder', 'CashbackAmount']

In [60]:
# Define a custom transformer for winsorization
class Winsorizer:
    def __init__(self, limits=(0.1, 0.9)):
        self.limits = limits
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(winsorize(X.values, limits=self.limits), columns=X.columns, index=X.index)
        else:
            return winsorize(X, limits=self.limits)

In [61]:
# preprocessing for missing values
na_handling= Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=42))
])

# preprocessing for outliers
outlier_handling = Pipeline(steps=[
    ('winsorizer', Winsorizer(limits=(0.1, 0.9)))
])

# preprocessing for numerical scaling
rescaling = Pipeline(steps=[
    ('scaler', RobustScaler())
])

# preprocessing for categorical encoding
encoding = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))
])

# combine preprocessing
custom_preprocessor = ColumnTransformer(
    transformers=[
        ('na_processor', na_handling, cols_with_missingvalues),
        ('outlier_processor', outlier_handling, cols_with_outlier),
        ('rescaler', rescaling, cols_numeric),
        ('encoder', encoding, cols_categorical)
    ])

# display preprocessor
display(preprocessor)

In [67]:
# Specify features and target
X = ml_df.drop(['Churn', 'CustomerID'], axis=1)
y = ml_df['Churn']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# check sample size for each set
print(f'The size of training set is {X_train.shape[0]} cols and {X_train.shape[1]} rows')
print(f'The size of testing set is {X_test.shape[0]} and {X_train.shape[1]} rows')

The size of training set is 4504 cols and 18 rows
The size of testing set is 1126 and 18 rows


In [69]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.model_selection import cross_validate, StratifiedKFold
from imblearn.pipeline import Pipeline as ImbPipeline

# Define a custom transformer for winsorization
class Winsorizer(BaseEstimator, TransformerMixin):
    def __init__(self, limits=(0.1, 0.9)):
        self.limits = limits
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(winsorize(X.values, limits=self.limits), columns=X.columns, index=X.index)
        else:
            return winsorize(X, limits=self.limits)

# Specify columns with missing values
cols_with_missingvalues = ml_df.columns[ml_df.isna().any()].tolist()

# Specify numerical columns
cols_numeric = ml_df.select_dtypes(include='number').columns.tolist()

# Specify columns with outliers
cols_with_outlier = ['Tenure', 'WarehouseToHome', 'HourSpendOnApp', 
                     'NumberOfDeviceRegistered', 'NumberOfAddress', 
                     'OrderAmountHikeFromlastYear', 'CouponUsed', 
                     'OrderCount', 'DaySinceLastOrder', 'CashbackAmount']

# Ensure that outlier columns and missing value columns are in the numeric columns list
cols_numeric = list(set(cols_numeric) - set(cols_with_missingvalues))

# Preprocessing for numerical columns with missing values
num_na_preprocessing = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=42))
])

# Preprocessing for numerical columns with outliers
num_outlier_preprocessing = Pipeline(steps=[
    ('winsorizer', Winsorizer(limits=(0.1, 0.9))),
    ('scaler', RobustScaler())
])

# Preprocessing for other numerical columns
num_preprocessing = Pipeline(steps=[
    ('scaler', RobustScaler())
])

# Preprocessing for categorical columns
cat_preprocessing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first'))
])

# Combine preprocessing for both numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num_na', num_na_preprocessing, cols_with_missingvalues),
        ('num_outlier', num_outlier_preprocessing, cols_with_outlier),
        ('num', num_preprocessing, cols_numeric),
        ('cat', cat_preprocessing, ml_df.select_dtypes(include='object').columns)
    ])

# Specify features and target
X = ml_df.drop(['Churn', 'CustomerID'], axis=1)
y = ml_df['Churn']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of models
models = [LogisticRegression(random_state=42), KNeighborsClassifier(), DecisionTreeClassifier(random_state=42),
          RandomForestClassifier(random_state=42), GradientBoostingClassifier(random_state=42), SVC(random_state=42, probability=True),
          XGBClassifier(random_state=42), LGBMClassifier(random_state=42, verbose=-100), QuadraticDiscriminantAnalysis(), GaussianNB()]

model_names = ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'Gradient Boosting', 'SVM', 'XGBoost', 'LightGBM', 'QDA', 'Naive Bayes']

roc_auc_scores = []
f2_scores = []
mean_roc_aucs = []
mean_f2s = []
std_roc_aucs = []
std_f2s = []

# Custom F2 scorer
f2_scorer = make_scorer(fbeta_score, beta=2)

# Evaluate each model using StratifiedKFold cross-validation
for model in models:
    skfold = StratifiedKFold(n_splits=5)
    estimator = ImbPipeline([
        ('preprocess', preprocessor),
        ('model', model)
    ])
    
    scores = cross_validate(estimator, X_train, y_train, cv=skfold, scoring={'roc_auc': 'roc_auc', 'f2': f2_scorer})
    roc_auc_scores.append(scores['test_roc_auc'])
    f2_scores.append(scores['test_f2'])
    mean_roc_aucs.append(scores['test_roc_auc'].mean())
    mean_f2s.append(scores['test_f2'].mean())
    std_roc_aucs.append(scores['test_roc_auc'].std())
    std_f2s.append(scores['test_f2'].std())

results_df_kfold = pd.DataFrame({
    'Model': model_names,
    'Mean ROC AUC': mean_roc_aucs,
    'ROC AUC Std Dev': std_roc_aucs,
    'Mean F2 Score': mean_f2s,
    'F2 Score Std Dev': std_f2s
})

# Sort the results by mean F2 Score in descending order
results_df_sorted_kfold = results_df_kfold.set_index('Model').sort_values(by='Mean F2 Score', ascending=False)

results_df_sorted_kfold


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'CustomerID'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/sklearn/utils/__init__.py", line 448, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3804, in get_loc
    raise KeyError(key) from err
KeyError: 'CustomerID'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/imblearn/utils/fixes.py", line 85, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/imblearn/pipeline.py", line 322, in fit
    Xt, yt = self._fit(X, y, routed_params)
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/imblearn/pipeline.py", line 248, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/imblearn/pipeline.py", line 1097, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 724, in fit_transform
    self._validate_column_callables(X)
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 426, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
  File "/Users/adeliajanuarto/.pyenv/versions/3.9.16/lib/python3.9/site-packages/sklearn/utils/__init__.py", line 456, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe
