In [43]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

import env
import acquire

In [44]:
# Load the telco churn data through the project-local `acquire` module.
# NOTE(review): the helper's source is not shown here; the preview in the next
# cell is the only evidence of the returned columns.
df = acquire.get_telco_churn_data()

In [45]:
# Preview the first rows of the freshly loaded frame
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,No,Yes,Yes,No,2,Yes,2,65.6,593.3,No
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,No,No,No,Yes,1,No,2,59.9,542.4,No
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,Yes,No,No,No,1,Yes,1,73.9,280.85,Yes
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,Yes,No,Yes,Yes,1,Yes,1,98.0,1237.85,Yes
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,No,Yes,Yes,No,1,Yes,2,83.9,267.4,Yes


In [46]:
# Recode the target as a binary flag: 'Yes' -> 1, 'No' -> 0.
# Any other value is left as it is.
df['churn'] = df['churn'].replace({'Yes': 1, 'No': 0})

In [47]:
# Clean the raw frame without mutating it in place (no `inplace=True`), so the
# object handed back by `acquire` is left untouched and `df` is simply rebound.

# Blank / whitespace-only strings are treated as missing values
df = df.replace(r'^\s*$', np.nan, regex=True)

# total_charges can only be cast to float once the blanks have become NaN
df["total_charges"] = df["total_charges"].astype('float')

# Drop every row that has a missing value, then drop the customer_id column so
# it is not carried into the feature matrix
df = df.dropna().drop(columns="customer_id")

In [48]:
# Target vector is the churn flag; every remaining column is a feature
y = df['churn']
X = df.drop(columns='churn')

In [49]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    random_state=123,
)

# Preview the training features
X_train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges
463,Female,0,No,No,53,Yes,Yes,2,No,Yes,Yes,No,Yes,Yes,3,No,1,105.55,5682.25
5828,Male,0,No,No,1,Yes,No,3,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,1,No,2,20.25,20.25
1433,Male,0,No,No,1,Yes,No,3,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,1,No,2,21.1,21.1
2892,Male,1,No,No,71,Yes,Yes,2,No,Yes,No,Yes,Yes,Yes,3,Yes,1,106.8,7623.2
3915,Male,0,Yes,Yes,46,Yes,Yes,3,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,2,No,1,24.9,1174.8


In [50]:
def encode(X_train, X_test, col_name):
    """One-hot encode ``col_name`` and append the indicator columns to both frames.

    The encoders are fitted on ``X_train`` only and then applied to ``X_test``,
    so both frames receive the same indicator columns in the same order.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Train and test feature frames; both must contain ``col_name``.
        Neither input is modified; new frames are returned.
    col_name : str
        Name of the categorical column to encode. The column itself is kept.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)`` joined with one float indicator column per
        distinct value of ``col_name`` found in ``X_train``. The new columns
        are labelled with those values themselves (for an id column holding
        1, 2, 3 the labels are 1, 2, 3).

    Notes
    -----
    * A value that appears in ``X_test`` but not in ``X_train`` cannot be
      encoded; ``LabelEncoder.transform`` is expected to raise in that case.
    * ``DataFrame.join`` is expected to raise if a frame already has a column
      carrying one of the new labels, so rename or drop the indicator columns
      of one call before encoding another column with the same values.
    """
    # Integer-encode the labels, fitting on train only. The codes live in local
    # variables: assigning them as attributes on the frames
    # (``X_train.encoded = ...``) mutates the caller's objects and would
    # overwrite a real column if one happened to be named "encoded".
    int_encoder = LabelEncoder()
    train_codes = int_encoder.fit_transform(X_train[col_name]).reshape(-1, 1)
    test_codes = int_encoder.transform(X_test[col_name]).reshape(-1, 1)

    # classes_ lists the fitted labels in the order of their integer codes,
    # which is also the column order of the one-hot matrix built below.
    encoded_values = list(int_encoder.classes_)

    # One Hot Encoding. The dense-output keyword was renamed from `sparse` to
    # `sparse_output` across scikit-learn releases, so rely on the default
    # (sparse) output and densify explicitly; this works on every version.
    ohe = OneHotEncoder(categories='auto')
    X_train_ohe = ohe.fit_transform(train_codes).toarray()
    X_test_ohe = ohe.transform(test_codes).toarray()

    # Wrap the arrays in frames that share the index of train/test so the join
    # lines rows up by index label, then append them to the original frames.
    X_train_encoded = pd.DataFrame(data=X_train_ohe,
                                   columns=encoded_values, index=X_train.index)
    X_test_encoded = pd.DataFrame(data=X_test_ohe,
                                  columns=encoded_values, index=X_test.index)

    return X_train.join(X_train_encoded), X_test.join(X_test_encoded)

In [51]:
# One-hot encode internet_service_type_id; encode() appends one indicator
# column per distinct id found in X_train to both frames.
X_train, X_test = encode(X_train, X_test, 'internet_service_type_id')

In [52]:
# Check the result: the new indicator columns are appended on the right
X_train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,...,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,1,2,3
463,Female,0,No,No,53,Yes,Yes,2,No,Yes,...,Yes,Yes,3,No,1,105.55,5682.25,0.0,1.0,0.0
5828,Male,0,No,No,1,Yes,No,3,No internet service,No internet service,...,No internet service,No internet service,1,No,2,20.25,20.25,0.0,0.0,1.0
1433,Male,0,No,No,1,Yes,No,3,No internet service,No internet service,...,No internet service,No internet service,1,No,2,21.1,21.1,0.0,0.0,1.0
2892,Male,1,No,No,71,Yes,Yes,2,No,Yes,...,Yes,Yes,3,Yes,1,106.8,7623.2,0.0,1.0,0.0
3915,Male,0,Yes,Yes,46,Yes,Yes,3,No internet service,No internet service,...,No internet service,No internet service,2,No,1,24.9,1174.8,0.0,0.0,1.0


In [53]:
# Give the indicator columns produced by encode() readable names and drop the
# original id column. One mapping is applied to both train and test so the two
# frames cannot drift apart; renaming replaces the earlier copy-then-drop steps
# and leaves the columns in the same final order.
# NOTE(review): the id -> name mapping is carried over unchanged from the
# original cells; confirm it against the internet service lookup table.
internet_service_labels = {1: 'DSL', 2: 'Fiber Optic', 3: 'None'}

# errors='raise' keeps the KeyError the original cells produced when one of the
# expected indicator columns is missing.
X_train = (
    X_train
    .drop(columns='internet_service_type_id')
    .rename(columns=internet_service_labels, errors='raise')
)
X_test = (
    X_test
    .drop(columns='internet_service_type_id')
    .rename(columns=internet_service_labels, errors='raise')
)

Encode `contract_type_id` with the `encode` function defined above (label encoder followed by one-hot encoder)

In [57]:
# One-hot encode contract_type_id; encode() appends one indicator column per
# distinct id found in X_train to both frames.
X_train, X_test = encode(X_train, X_test, 'contract_type_id')

In [58]:
# Give the contract-type indicator columns readable names and drop the original
# id column. One mapping is applied to both train and test; renaming replaces
# the earlier copy-then-drop steps and leaves the columns in the same final
# order.
# NOTE(review): the id -> name mapping is carried over unchanged from the
# original cells; confirm it against the contract type lookup table.
contract_type_labels = {1: 'Month-to-Month', 2: 'One Year', 3: 'Two Year'}

# errors='raise' keeps the KeyError the original cells produced when one of the
# expected indicator columns is missing.
X_train = (
    X_train
    .drop(columns='contract_type_id')
    .rename(columns=contract_type_labels, errors='raise')
)
X_test = (
    X_test
    .drop(columns='contract_type_id')
    .rename(columns=contract_type_labels, errors='raise')
)

In [61]:
# Preview the training features after both id columns have been one-hot encoded
X_train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,paperless_billing,payment_type_id,monthly_charges,total_charges,DSL,Fiber Optic,None,Month-to-Month,One Year,Two Year
463,Female,0,No,No,53,Yes,Yes,No,Yes,Yes,...,No,1,105.55,5682.25,0.0,1.0,0.0,0.0,0.0,1.0
5828,Male,0,No,No,1,Yes,No,No internet service,No internet service,No internet service,...,No,2,20.25,20.25,0.0,0.0,1.0,1.0,0.0,0.0
1433,Male,0,No,No,1,Yes,No,No internet service,No internet service,No internet service,...,No,2,21.1,21.1,0.0,0.0,1.0,1.0,0.0,0.0
2892,Male,1,No,No,71,Yes,Yes,No,Yes,No,...,Yes,1,106.8,7623.2,0.0,1.0,0.0,0.0,0.0,1.0
3915,Male,0,Yes,Yes,46,Yes,Yes,No internet service,No internet service,No internet service,...,No,1,24.9,1174.8,0.0,0.0,1.0,0.0,1.0,0.0
