# Model Building

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
from typing import List

Set a predetermined seed so all our results can be replicated

In [3]:
RANDOM_SEED = 1337

# Preprocessing

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
full_data = pd.read_csv('Dataset/clean_data.csv', index_col='Customer_ID')

In [6]:
#Separate target column with rest of the data
churn_col = full_data['churn'].copy()
full_data = full_data.drop('churn',axis=1)

In [7]:
#Separate data for training and testing with 80% for training and 20% testing
#Uses our preselected random seed to results are reproducible 
x_train, x_test, y_train, y_test = train_test_split(
    full_data,
    churn_col,
    test_size=0.2,
    random_state=RANDOM_SEED
)

In [8]:
with open('columnDescriptions.json','r') as f:
    col_desc = json.load(f)
    
#Shortened descriptions with elipses for plot titles
#Only retains first 20 characters of description then appends with elipses
short_col_desc = dict(zip(
    col_desc.keys(),
    map(lambda desc: 
        desc if len(desc)<20 else f'{desc[:20]}...', col_desc.values()
    )
))

We will use several different algorithms then compare their performance afterwards to determine which is the best to use. The algorithm we will use are: 
- Logistic Regression
- K Nearest Neighbor Classifier
- Random Forest
- XGBoost
- LightGBM

In [9]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [20]:
class PipelineFactory:
    
    def create_pipe(
            self, impute:bool, scale:bool,
            cat_cols:List[str], num_cols:List[str])->Pipeline:
        
        if impute & scale:
            return self.impute_ohe_scale(cat_cols,num_cols)
    
    def impute_ohe_scale(
            self, cat_cols:List[str], 
            num_cols:List[str])->Pipeline:
        pipe = make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            ColumnTransformer([
                ('ohe',OneHotEncoder(),cat_cols),
                ('scale',StandardScaler(),num_cols)
            ])
        )
        return pipe
    
    def impute_ohe(
            self,cat_cols:List[str], 
            num_cols:List[str])->Pipeline:
        return
    
    def ohe(
            self, cat_cols:List[str], 
            num_cols:List[str])->Pipeline:
        return
    


In [21]:
pf = PipelineFactory()
pipe = pf.create_pipe(True,True,full_data.select_dtypes(np.number).columns,full_data.select_dtypes('object').columns)

In [30]:
pipe.steps[0][1].fit_transform(full_data)

array([[1, 4085.0, 1602, ..., 6.333333333, 2, 0.0],
       [1, 26367.0, 14624, ..., 61.33333333, 1, 9.1],
       [1, 24303.05, 7888, ..., 2.666666667, 1, 0.0],
       ...,
       [1, 9234.0, 1238, ..., 0.0, 1, 0.0],
       [1, 20488.0, 9921, ..., 117.3333333, 1, 0.0],
       [1, 2225.0, 1231, ..., 19.33333333, 1, 0.4083333335]], dtype=object)

In [40]:
pipe.steps[0][1].

AttributeError: 'SimpleImputer' object has no attribute 'get_feature_names_out'

In [14]:
full_data.select_dtypes(np.number).columns

Index(['actvsubs', 'adjmou', 'adjqty', 'adjrev', 'adults', 'asl_flag',
       'attempt_mean', 'avg3mou', 'avg3qty', 'avg3rev', 'avg6mou', 'avg6qty',
       'avg6rev', 'avgmou', 'avgqty', 'avgrev', 'blck_dat_mean',
       'blck_vce_mean', 'callfwdv_mean', 'callwait_mean', 'cc_mou_mean',
       'ccrndmou_mean', 'change_mou', 'change_rev', 'comp_dat_mean',
       'comp_vce_mean', 'complete_mean', 'creditcd', 'custcare_mean',
       'da_mean', 'datovr_mean', 'drop_blk_mean', 'drop_dat_mean',
       'drop_vce_mean', 'dwlltype', 'eqpdays', 'forgntvl', 'has_kid',
       'hnd_price', 'income', 'infobase', 'inonemin_mean', 'iwylis_vce_mean',
       'kid0_2', 'kid11_15', 'kid16_17', 'kid3_5', 'kid6_10', 'lor', 'models',
       'months', 'mou_cdat_mean', 'mou_cvce_mean', 'mou_mean', 'mou_opkd_mean',
       'mou_opkv_mean', 'mou_pead_mean', 'mou_peav_mean', 'mou_rvce_mean',
       'mouiwylisv_mean', 'mouowylisv_mean', 'new_cell', 'numbcars',
       'opk_dat_mean', 'opk_vce_mean', 'ovrmou_mean', 'o