# Model Building

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# (Re)load the autoreload extension and switch it to mode 2 so edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
from typing import List

Set a predetermined seed so all our results can be replicated

In [4]:
# Single seed reused as `random_state` further down (e.g. in the train/test split)
RANDOM_SEED = 1337

# Preprocessing

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Read the cleaned dataset, using the Customer_ID column as the row index
full_data = pd.read_csv(
    'Dataset/clean_data.csv',
    index_col='Customer_ID',
)

In [7]:
# Copy the target column into its own Series, then keep only the
# remaining (feature) columns in full_data
churn_col = full_data['churn'].copy()
full_data = full_data.drop(columns=['churn'])

Since we're finished with EDA in the other notebook, we can start splitting the data into training and testing sets. For models that need validation, we will utilize k-fold CV later on. 

In [8]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The preselected random seed keeps the split reproducible between runs.
raw_x_train, raw_x_test, y_train, y_test = train_test_split(
    full_data, churn_col, test_size=0.2, random_state=RANDOM_SEED
)

In [9]:
# Load the column descriptions (mapping of column name -> description text)
with open('columnDescriptions.json', 'r') as f:
    col_desc = json.load(f)

# Shortened descriptions with an ellipsis, for plot titles.
# Descriptions longer than 20 characters keep only their first 20 characters
# followed by '...'. Descriptions of 20 characters or fewer are kept as-is,
# so an ellipsis is only shown when text was actually cut off.
short_col_desc = {
    col: desc if len(desc) <= 20 else f'{desc[:20]}...'
    for col, desc in col_desc.items()
}

We will use several different algorithms and then compare their performance to determine which is the best to use. The algorithms we will use are:
- Logistic Regression
- K Nearest Neighbor Classifier
- Random Forest
- XGBoost
- LightGBM

Since the implementations selected for the above algorithms differ in how well they handle missing values and unnormalized features, we will need different preprocessing pipelines for the data. For example, XGBoost and LightGBM can handle NaN values (XGBoost learns whether to split NaN values during training, while LightGBM allocates NaN values to reduce loss afterwards), while the sklearn implementations of logistic regression, the KNN classifier, and random forest cannot.

In [46]:
from model_utils import PipelineFactory

In [47]:
# Build the preprocessing pipeline with the PCA, imputation and
# normalization options all switched on.
# NOTE(review): the factory receives the full (pre-split) frame — confirm it
# only inspects columns/dtypes and does not fit any statistics on it.
pf = PipelineFactory(full_data)
pca_pipe = pf.create_pipe(
    pca=True,
    impute=True,
    normalize=True,
)
# Other pipeline variants, currently disabled:
# impute_normalize_pipe = pf.create_pipe(impute=True,normalize=True)
# impute_pipe = pf.create_pipe(impute=True,normalize=False)
# ohe_pipe = pf.create_pipe(impute=False,normalize=False)

In [62]:
# Show the output feature names reported by the last step of the first
# transformer inside pca_pipe (displayed below as pca0, pca1, ...).
# NOTE(review): pca_pipe is only fit in a later cell (fit_transform on the
# training split) — confirm this cell still works on a fresh top-to-bottom run.
pca_pipe.transformer_list[0][1].steps[-1][1].get_feature_names_out()

array(['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7',
       'pca8', 'pca9', 'pca10', 'pca11', 'pca12', 'pca13', 'pca14',
       'pca15', 'pca16', 'pca17', 'pca18', 'pca19', 'pca20', 'pca21',
       'pca22', 'pca23', 'pca24', 'pca25', 'pca26', 'pca27', 'pca28',
       'pca29', 'pca30', 'pca31', 'pca32', 'pca33', 'pca34', 'pca35',
       'pca36', 'pca37', 'pca38', 'pca39', 'pca40', 'pca41', 'pca42',
       'pca43', 'pca44', 'pca45', 'pca46', 'pca47', 'pca48', 'pca49',
       'pca50', 'pca51', 'pca52', 'pca53', 'pca54', 'pca55', 'pca56',
       'pca57', 'pca58', 'pca59', 'pca60', 'pca61', 'pca62', 'pca63',
       'pca64', 'pca65', 'pca66', 'pca67', 'pca68', 'pca69', 'pca70',
       'pca71', 'pca72', 'pca73'], dtype=object)

# Model building

In [13]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgbm

## Logistic Regression

In [75]:
# Fit the preprocessing pipeline on the training split only, then apply that
# same fitted pipeline to the test split. Both splits must go through the
# identical transformation so they end up in the same feature space, and
# nothing is learned from the test rows.
x_train = pca_pipe.fit_transform(raw_x_train)
x_test = pca_pipe.transform(raw_x_test)

In [18]:
# L2 regularized logistic regression (the estimator's default penalty)
# Cs=5: search over 5 candidate regularization strengths
# cv=5: 5 fold CV (80% training 20% validation in each fold)
# class_weight='balanced': weight classes inversely to their frequency
log_reg = LogisticRegressionCV(
    Cs=5,
    cv=5,
    random_state=RANDOM_SEED,
    class_weight='balanced'
)