In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.model_selection import (
    train_test_split,
    TimeSeriesSplit,
    KFold,
    StratifiedKFold,
    GroupKFold,
    StratifiedGroupKFold)

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Raw churn table, read from a CSV located relative to the notebook's working directory
telco_csv = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(telco_csv)

In [4]:
def remap_column(df):
    '''
    Encode a Yes/No column as integers ('Yes' -> 1, 'No' -> 0).

    df: the column to encode; despite the name, clean_data passes a single
        DataFrame column here rather than a whole frame.

    Returns a new integer column; the input is not modified.
    Entries other than 'Yes'/'No' have no mapping, so the integer cast fails on them.
    '''
    yes_no_codes = dict(Yes = 1, No = 0)
    return df.map(yes_no_codes).astype(int)

In [5]:
def clean_data(df):
    '''
    Return a cleaned copy of the churn frame.

    Steps:
      - drop rows whose TotalCharges is empty or whitespace-only,
      - convert TotalCharges from text to float,
      - remap the Yes/No columns (Partner, Dependents, PhoneService,
        PaperlessBilling, Churn) to 1/0 via remap_column,
      - drop the customerID column,
      - remap gender to 1 (Male) / 0 (Female).

    df: frame holding the columns named above, with TotalCharges stored as text.

    The caller's frame is left untouched; surviving rows keep their original index labels.
    '''
    # Strip first, then filter, so a whitespace-only entry of any width counts as blank
    total_charges = df['TotalCharges'].str.strip()
    has_charge = total_charges != ''

    # Explicit copy: the assignments below must write to an independent frame, not a slice
    df = df[has_charge].copy()
    df['TotalCharges'] = total_charges[has_charge].astype(float)

    for column in ('Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn'):
        df[column] = remap_column(df[column])

    df = df.drop('customerID', axis = 1)

    remap_gender = {'Male':1,'Female':0}
    df['gender'] = df['gender'].map(remap_gender).astype(int)

    return df

In [6]:
# Positional boundary between the weak-learner rows and the rows held back for the ensemble.
# Both slices share this one index so that no row falls between the two partitions.
split_at = 5634

test_set = df.iloc[split_at:] # for ensemble testing

# NOTE: df is rebound here, so re-running this cell needs the load cell re-run first
df = df.iloc[:split_at] # for weak learners

In [7]:
# Run both partitions through the same cleaning routine (hold-out first, then weak-learner rows)
test_set = test_set.pipe(clean_data)
df = df.pipe(clean_data)

In [8]:
df = pd.get_dummies(df)

# The hold-out is encoded on its own, so a category absent from one partition would give the
# two frames different columns. Reindex onto the weak-learner layout: same columns, same order,
# with 0 filled in for any dummy column the hold-out did not produce.
test_set = pd.get_dummies(test_set).reindex(columns = df.columns, fill_value = 0)

In [9]:
from sklearn.model_selection import train_test_split

# Both partitions are split the same way: 20% held out for testing, fixed seed
split_opts = {'test_size': 0.2, 'random_state': 7}

# test_set (ensemble set)
y1 = test_set['Churn']
X1 = test_set.drop(columns = 'Churn')

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, **split_opts)

# train/test one (weak learner set)
y = df['Churn']
X = df.drop(columns = 'Churn')

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_opts)

In [10]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the weak-learner training rows only, then apply it to both splits.
# Every feature column goes through the scaler, the one-hot dummies included.
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# The ensemble set has to go through this same fitted instance, i.e.
# "X_test_ensemble = scaler.transform(X_test_ensemble)", rather than a newly fitted scaler.

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: library-default hyperparameters with a pinned seed
rfc = RandomForestClassifier(random_state = 21).fit(X_train, y_train)

# Hard class predictions for the weak-learner test split
rfc_pred = rfc.predict(X_test)

In [12]:
# non-cross-validated baseline: accuracy of the forest on the single held-out test split

from sklearn import metrics

baseline_accuracy = metrics.accuracy_score(y_test, rfc_pred)
print('Accuracy:', baseline_accuracy)

Accuracy: 0.7973333333333333
