# Predicting Telco customer churn using a neural network
Here is a peak of the feature set at our disposal.  
*Image inserted here*  


In [1]:
import pandas as pd
import torch
from torch import optim, nn
from pathlib import Path

In [2]:
raw_data = pd.read_csv(Path('./WA_Fn-UseC_-Telco-Customer-Churn.csv'), low_memory=False)
raw_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
raw_data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [4]:
cleaned_data = raw_data.drop(labels=['customerID'], axis=1)

### Let's see the possible values

In [5]:
for col in cleaned_data:
    print(cleaned_data[col].unique())

['Female' 'Male']
[0 1]
['Yes' 'No']
['No' 'Yes']
[ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
['No' 'Yes']
['No phone service' 'No' 'Yes']
['DSL' 'Fiber optic' 'No']
['No' 'Yes' 'No internet service']
['Yes' 'No' 'No internet service']
['No' 'Yes' 'No internet service']
['No' 'Yes' 'No internet service']
['No' 'Yes' 'No internet service']
['No' 'Yes' 'No internet service']
['Month-to-month' 'One year' 'Two year']
['Yes' 'No']
['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
[29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
['No' 'Yes']


#### Let's handle the ordinal values under 'Contract'

In [6]:
contract_order = 'Month-to-month', 'One year', 'Two year'
cleaned_data['Contract'] = cleaned_data['Contract'].astype('category')
cleaned_data['Contract'].cat.set_categories(contract_order, ordered=True, inplace=True)
cleaned_data.head()

  res = method(*args, **kwargs)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


#### Now, let's one-hot encode

In [7]:
categoricals = cleaned_data.columns.drop(['tenure', 'MonthlyCharges', 'TotalCharges'])
categoricals

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'],
      dtype='object')

In [8]:
encoded_data = pd.get_dummies(cleaned_data[categoricals])
encoded_data.head()

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,...,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_No,Churn_Yes
0,0,1,0,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,1,0
1,0,0,1,1,0,1,0,0,1,1,...,1,0,1,0,0,0,0,1,1,0
2,0,0,1,1,0,1,0,0,1,1,...,0,0,0,1,0,0,0,1,0,1
3,0,0,1,1,0,1,0,1,0,0,...,1,0,1,0,1,0,0,0,1,0
4,0,1,0,1,0,1,0,0,1,1,...,0,0,0,1,0,0,1,0,0,1


In [9]:
encoded_data = pd.concat([encoded_data, pd.DataFrame(cleaned_data[['tenure', 'MonthlyCharges', 'TotalCharges']])], axis=1)
encoded_data

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,...,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_No,Churn_Yes,tenure,MonthlyCharges,TotalCharges
0,0,1,0,0,1,1,0,1,0,0,...,1,0,0,1,0,1,0,1,29.85,29.85
1,0,0,1,1,0,1,0,0,1,1,...,0,0,0,0,1,1,0,34,56.95,1889.5
2,0,0,1,1,0,1,0,0,1,1,...,1,0,0,0,1,0,1,2,53.85,108.15
3,0,0,1,1,0,1,0,1,0,0,...,0,1,0,0,0,1,0,45,42.30,1840.75
4,0,1,0,1,0,1,0,0,1,1,...,1,0,0,1,0,0,1,2,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,1,0,1,0,1,0,1,0,...,1,0,0,0,1,1,0,24,84.80,1990.5
7039,0,1,0,0,1,0,1,0,1,0,...,1,0,1,0,0,1,0,72,103.20,7362.9
7040,0,1,0,0,1,0,1,1,0,0,...,1,0,0,1,0,1,0,11,29.60,346.45
7041,1,0,1,0,1,1,0,0,1,0,...,1,0,0,0,1,0,1,4,74.40,306.6


#### Now, let's normalize
I'm normalizing the continuous values to between 0 and 1.  

In [10]:
final_data = encoded_data.copy()
final_data['tenure'] = final_data['tenure'] / final_data['tenure'].max()
final_data['tenure'].unique()

array([0.01388889, 0.47222222, 0.02777778, 0.625     , 0.11111111,
       0.30555556, 0.13888889, 0.38888889, 0.86111111, 0.18055556,
       0.22222222, 0.80555556, 0.68055556, 0.34722222, 0.95833333,
       0.72222222, 0.98611111, 0.29166667, 0.16666667, 0.41666667,
       0.65277778, 1.        , 0.23611111, 0.375     , 0.06944444,
       0.63888889, 0.15277778, 0.97222222, 0.875     , 0.59722222,
       0.20833333, 0.83333333, 0.25      , 0.91666667, 0.125     ,
       0.04166667, 0.43055556, 0.69444444, 0.88888889, 0.77777778,
       0.09722222, 0.58333333, 0.48611111, 0.66666667, 0.40277778,
       0.90277778, 0.52777778, 0.94444444, 0.44444444, 0.76388889,
       0.51388889, 0.5       , 0.56944444, 0.08333333, 0.05555556,
       0.45833333, 0.93055556, 0.31944444, 0.79166667, 0.84722222,
       0.19444444, 0.27777778, 0.73611111, 0.55555556, 0.81944444,
       0.33333333, 0.61111111, 0.26388889, 0.75      , 0.70833333,
       0.36111111, 0.        , 0.54166667])

In [11]:
final_data['MonthlyCharges'] = final_data.MonthlyCharges / final_data.MonthlyCharges.max()
final_data['MonthlyCharges'].unique()

array([0.25136842, 0.47957895, 0.45347368, ..., 0.53136842, 0.37221053,
       0.66273684])

In [12]:
final_data['TotalCharges'] = final_data['TotalCharges'].replace(' ', 0.)    # Get rid of ''
final_data['TotalCharges'] = final_data['TotalCharges'].astype(float)
final_data['TotalCharges'] = final_data.TotalCharges / final_data.TotalCharges.max()
final_data['TotalCharges'].unique()

array([0.00343704, 0.21756402, 0.01245279, ..., 0.03989153, 0.03530306,
       0.78810105])

In [13]:
final_data

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,...,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_No,Churn_Yes,tenure,MonthlyCharges,TotalCharges
0,0,1,0,0,1,1,0,1,0,0,...,1,0,0,1,0,1,0,0.013889,0.251368,0.003437
1,0,0,1,1,0,1,0,0,1,1,...,0,0,0,0,1,1,0,0.472222,0.479579,0.217564
2,0,0,1,1,0,1,0,0,1,1,...,1,0,0,0,1,0,1,0.027778,0.453474,0.012453
3,0,0,1,1,0,1,0,1,0,0,...,0,1,0,0,0,1,0,0.625000,0.356211,0.211951
4,0,1,0,1,0,1,0,0,1,1,...,1,0,0,1,0,0,1,0.027778,0.595368,0.017462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,1,0,1,0,1,0,1,0,...,1,0,0,0,1,1,0,0.333333,0.714105,0.229194
7039,0,1,0,0,1,0,1,0,1,0,...,1,0,1,0,0,1,0,1.000000,0.869053,0.847792
7040,0,1,0,0,1,0,1,1,0,0,...,1,0,0,1,0,1,0,0.152778,0.249263,0.039892
7041,1,0,1,0,1,1,0,0,1,0,...,1,0,0,0,1,0,1,0.055556,0.626526,0.035303


### Now, we split into training/validation and labels

In [14]:
train_idx = int(len(final_data) * 0.8)
train_idx

5634

In [15]:
x_train = final_data[:train_idx].drop(['Churn_No', 'Churn_Yes'], axis=1)
x_train

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,...,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
0,0,1,0,0,1,1,0,1,0,0,...,0,0,1,0,0,1,0,0.013889,0.251368,0.003437
1,0,0,1,1,0,1,0,0,1,1,...,0,1,0,0,0,0,1,0.472222,0.479579,0.217564
2,0,0,1,1,0,1,0,0,1,1,...,0,0,1,0,0,0,1,0.027778,0.453474,0.012453
3,0,0,1,1,0,1,0,1,0,0,...,0,1,0,1,0,0,0,0.625000,0.356211,0.211951
4,0,1,0,1,0,1,0,0,1,1,...,0,0,1,0,0,1,0,0.027778,0.595368,0.017462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,1,0,1,1,0,1,0,0,1,1,...,0,1,0,0,0,0,1,0.013889,0.168842,0.002309
5630,0,1,0,1,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0.541667,0.871158,0.459936
5631,0,0,1,0,1,1,0,1,0,0,...,0,1,0,0,0,0,1,0.041667,0.210526,0.009010
5632,0,0,1,1,0,1,0,0,1,1,...,1,1,0,0,1,0,0,0.805556,0.170947,0.130285


In [16]:
y_train = final_data[:train_idx][['Churn_No','Churn_Yes']]
y_train

Unnamed: 0,Churn_No,Churn_Yes
0,1,0
1,1,0
2,0,1
3,1,0
4,0,1
...,...,...
5629,0,1
5630,0,1
5631,1,0
5632,1,0


In [17]:
x_val = final_data[train_idx:].drop(['Churn_Yes', 'Churn_No'], axis=1)
x_val

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,...,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
5634,0,1,0,0,1,0,1,0,1,1,...,0,0,1,0,0,1,0,0.013889,0.167579,0.002291
5635,0,1,0,0,1,1,0,0,1,1,...,0,0,1,0,0,0,1,0.305556,0.460632,0.135726
5636,0,0,1,0,1,1,0,1,0,0,...,0,1,0,0,0,1,0,0.194444,0.390316,0.076881
5637,0,0,1,0,1,0,1,0,1,0,...,0,1,0,0,1,0,0,0.888889,0.760000,0.648161
5638,0,0,1,1,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0.013889,0.168000,0.002297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,1,0,1,0,1,0,1,0,...,0,0,1,0,0,0,1,0.333333,0.714105,0.229194
7039,0,1,0,0,1,0,1,0,1,0,...,0,0,1,0,1,0,0,1.000000,0.869053,0.847792
7040,0,1,0,0,1,0,1,1,0,0,...,0,0,1,0,0,1,0,0.152778,0.249263,0.039892
7041,1,0,1,0,1,1,0,0,1,0,...,0,0,1,0,0,0,1,0.055556,0.626526,0.035303


In [18]:
y_val = final_data[train_idx:][['Churn_Yes', 'Churn_No']]
y_val

Unnamed: 0,Churn_Yes,Churn_No
5634,1,0
5635,0,1
5636,1,0
5637,0,1
5638,1,0
...,...,...
7038,0,1
7039,0,1
7040,0,1
7041,1,0


# Tensor pre-processing pipeline
I won't be subclassing dataset/dataloaders. `[]`'s have __getitem__() and __len__() defined.

In [19]:
train_dataset = [(torch.tensor(example, device='cuda', dtype=torch.float), torch.tensor(label, device='cuda', dtype=torch.float)) for example, label in zip(x_train.values, y_train.values)]
train_dataset[:2]

[(tensor([0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 1.0000, 0.0000,
          0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0139, 0.2514, 0.0034],
         device='cuda:0'),
  tensor([1., 0.], device='cuda:0')),
 (tensor([0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000,
          1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
          1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.4722, 0.4796, 0.2176],
         device='cuda:0'),
  tensor([1., 0.], device='cuda:0'))]

In [20]:
val_dataset = [(torch.tensor(features, device='cuda', dtype=torch.float), torch.tensor(labels, device='cuda', dtype=torch.float)) for features, labels in zip(x_val.values, y_val.values)]
val_dataset[:2]

[(tensor([0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 1.0000,
          1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000,
          0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000,
          0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0139, 0.1676, 0.0023],
         device='cuda:0'),
  tensor([1., 0.], device='cuda:0')),
 (tensor([0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.0000, 1.0000,
          1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.3056, 0.4606, 0.1357],
         device='cuda:0'),
  tensor([0., 1.], device='cuda:0'))]

In [21]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=len(train_dataset) // 4, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=len(val_dataset) // 4, shuffle=True)

# Model Creation

In [220]:
model = nn.Sequential(
    nn.Linear(len(x_train.columns), len(x_train.columns) // 2, device='cuda'),
    nn.ReLU(),
    nn.Linear(len(x_train.columns) // 2, len(x_train.columns) // 4, device='cuda'),
    nn.ReLU(),
    nn.Linear(len(x_train.columns) // 4, 2, device='cuda')
)
loss_fcn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [61]:
def training_loop(epochs, model, loss_fcn, optimizer, train_dataloader, val_dataloader, metrics=True):
    """Training loop"""
    for epoch in range(epochs):
        for features, labels in train_dataloader:
            #features.unsqueeze_(1)
            #labels.unsqueeze_(1)
            train_predictions = model(features)
            if epoch == 0:
                print(f"Train_predictions : {train_predictions.shape}{train_predictions.dtype}\t labels: {labels.shape}{labels.dtype}")
            train_loss = loss_fcn(train_predictions, labels)
            
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
        
        total = 0
        correct = 0
        
        with torch.no_grad():
            for features, labels in val_dataloader:
                features.unsqueeze_(1)
                labels.unsqueeze_(1)
                val_predictions = model(features)
                val_loss = loss_fcn(val_predictions, labels)
                
                if metrics:
                    _, predicted = torch.max(val_predictions, dim=2)
                    predicted.squeeze_(1)
                    _, label_idx = torch.max(labels, dim=2)
                    label_idx.squeeze_(1)
                    total += labels.shape[0]
                    correct += int((predicted == label_idx).sum())
            
            print(f"\nEpoch: {epoch}\tTraining Loss: {train_loss}\tValidation loss: {val_loss}", end='')
            print(f"\tTotal: {total}, Correct: {correct}, Accuracy: {correct/total:%}", end='')
    
    return model


In [222]:
trained_model = training_loop(1, model, loss_fcn, optimizer, train_dataloader, val_dataloader)


Epoch: 0	Training Loss: 0.7008407711982727	Validation loss: 0.6639899611473083	Total: 1409, Correct: 1028, Accuracy: 72.959546%

In [223]:
torch.save(trained_model, './model_72_95')

## New model

In [66]:
in_count = len(x_train.columns)
model2 = nn.Sequential(
    nn.Linear(in_count, in_count // 2, device='cuda'),
    nn.ReLU(),
    nn.Linear(in_count//2, 2, device='cuda')
)

In [67]:
loss_fcn = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model2.parameters(), lr=1e-5)
trained_model2 = training_loop(100, model2, loss_fcn, optimizer, train_dataloader, val_dataloader)

Train_predictions : torch.Size([1408, 2])torch.float32	 labels: torch.Size([1408, 2])torch.float32
Train_predictions : torch.Size([1408, 2])torch.float32	 labels: torch.Size([1408, 2])torch.float32
Train_predictions : torch.Size([1408, 2])torch.float32	 labels: torch.Size([1408, 2])torch.float32
Train_predictions : torch.Size([1408, 2])torch.float32	 labels: torch.Size([1408, 2])torch.float32
Train_predictions : torch.Size([2, 2])torch.float32	 labels: torch.Size([2, 2])torch.float32

Epoch: 0	Training Loss: 0.7145718336105347	Validation loss: 0.6521032452583313	Total: 1409, Correct: 1029, Accuracy: 73.030518%
Epoch: 1	Training Loss: 0.6313053369522095	Validation loss: 0.834816575050354	Total: 1409, Correct: 1029, Accuracy: 73.030518%
Epoch: 2	Training Loss: 0.7558141946792603	Validation loss: 0.7003079652786255	Total: 1409, Correct: 1029, Accuracy: 73.030518%
Epoch: 3	Training Loss: 0.7601653337478638	Validation loss: 0.5916645526885986	Total: 1409, Correct: 1029, Accuracy: 73.030518%

# Let's pre-process the data differently

In [7]:
new_encoded_data = cleaned_data.copy()
new_encoded_data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
def print_unique(df):
    for col in df.columns:
        print(col, df[col].unique())
print_unique(new_encoded_data)

gender ['Female' 'Male']
SeniorCitizen [0 1]
Partner ['Yes' 'No']
Dependents ['No' 'Yes']
tenure [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService ['No' 'Yes']
MultipleLines ['No phone service' 'No' 'Yes']
InternetService ['DSL' 'Fiber optic' 'No']
OnlineSecurity ['No' 'Yes' 'No internet service']
OnlineBackup ['Yes' 'No' 'No internet service']
DeviceProtection ['No' 'Yes' 'No internet service']
TechSupport ['No' 'Yes' 'No internet service']
StreamingTV ['No' 'Yes' 'No internet service']
StreamingMovies ['No' 'Yes' 'No internet service']
Contract ['Month-to-month', 'One year', 'Two year']
Categories (3, object): ['Month-to-month' < 'One year' < 'Two year']
PaperlessBilling ['Yes' 'No']
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharge

In [9]:
new_encoded_data['TechSupport'].replace('No internet service', 'No', inplace=True)
new_encoded_data['StreamingTV'].replace('No internet service', 'No', inplace=True)
new_encoded_data['MultipleLines'].replace('No phone service', 'No', inplace=True)
new_encoded_data['OnlineSecurity'].replace('No internet service', 'No', inplace=True)
new_encoded_data['OnlineBackup'].replace('No internet service', 'No', inplace=True)
new_encoded_data['DeviceProtection'].replace('No internet service', 'No', inplace=True)
new_encoded_data['StreamingMovies'].replace('No internet service', 'No', inplace=True)
print_unique(new_encoded_data)

gender ['Female' 'Male']
SeniorCitizen [0 1]
Partner ['Yes' 'No']
Dependents ['No' 'Yes']
tenure [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService ['No' 'Yes']
MultipleLines ['No' 'Yes']
InternetService ['DSL' 'Fiber optic' 'No']
OnlineSecurity ['No' 'Yes']
OnlineBackup ['Yes' 'No']
DeviceProtection ['No' 'Yes']
TechSupport ['No' 'Yes']
StreamingTV ['No' 'Yes']
StreamingMovies ['No' 'Yes']
Contract ['Month-to-month', 'One year', 'Two year']
Categories (3, object): ['Month-to-month' < 'One year' < 'Two year']
PaperlessBilling ['Yes' 'No']
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn ['No' 'Yes']


In [10]:
new_encoded_data['gender'].replace({'Female': 0, 'Male': 1}, inplace=True)
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
           'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    new_encoded_data[col].replace({'Yes': 1, 'No': 0}, inplace=True)
print_unique(new_encoded_data)

gender [0 1]
SeniorCitizen [0 1]
Partner [1 0]
Dependents [0 1]
tenure [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService [0 1]
MultipleLines [0 1]
InternetService ['DSL' 'Fiber optic' 'No']
OnlineSecurity [0 1]
OnlineBackup [1 0]
DeviceProtection [0 1]
TechSupport [0 1]
StreamingTV [0 1]
StreamingMovies [0 1]
Contract ['Month-to-month', 'One year', 'Two year']
Categories (3, object): ['Month-to-month' < 'One year' < 'Two year']
PaperlessBilling [1 0]
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn [0 1]


In [11]:
dummy_cols = ['InternetService', 'Contract', 'PaymentMethod']
new_encoded_data = pd.get_dummies(new_encoded_data, columns=dummy_cols)
print_unique(new_encoded_data)

gender [0 1]
SeniorCitizen [0 1]
Partner [1 0]
Dependents [0 1]
tenure [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService [0 1]
MultipleLines [0 1]
OnlineSecurity [0 1]
OnlineBackup [1 0]
DeviceProtection [0 1]
TechSupport [0 1]
StreamingTV [0 1]
StreamingMovies [0 1]
PaperlessBilling [1 0]
MonthlyCharges [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn [0 1]
InternetService_DSL [1 0]
InternetService_Fiber optic [0 1]
InternetService_No [0 1]
Contract_Month-to-month [1 0]
Contract_One year [0 1]
Contract_Two year [0 1]
PaymentMethod_Bank transfer (automatic) [0 1]
PaymentMethod_Credit card (automatic) [0 1]
PaymentMethod_Electronic check [1 0]
PaymentMethod_Mailed check [0 1]


#### The TotalCharges column contains strings

In [12]:
new_encoded_data = new_encoded_data[new_encoded_data.TotalCharges != ' ']
new_encoded_data.shape

(7032, 27)

In [13]:
new_encoded_data.TotalCharges = pd.to_numeric(new_encoded_data.TotalCharges)
new_encoded_data.TotalCharges.dtype

dtype('float64')

In [14]:
print_unique(new_encoded_data)

gender [0 1]
SeniorCitizen [0 1]
Partner [1 0]
Dependents [0 1]
tenure [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
PhoneService [0 1]
MultipleLines [0 1]
OnlineSecurity [0 1]
OnlineBackup [1 0]
DeviceProtection [0 1]
TechSupport [0 1]
StreamingTV [0 1]
StreamingMovies [0 1]
PaperlessBilling [1 0]
MonthlyCharges [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges [  29.85 1889.5   108.15 ...  346.45  306.6  6844.5 ]
Churn [0 1]
InternetService_DSL [1 0]
InternetService_Fiber optic [0 1]
InternetService_No [0 1]
Contract_Month-to-month [1 0]
Contract_One year [0 1]
Contract_Two year [0 1]
PaymentMethod_Bank transfer (automatic) [0 1]
PaymentMethod_Credit card (automatic) [0 1]
PaymentMethod_Electronic check [1 0]
PaymentMethod_Mailed check [0 1]


In [15]:
from sklearn.preprocessing import MinMaxScaler

In [16]:
mm_scaler = MinMaxScaler()
scale_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
new_encoded_data[scale_cols] = mm_scaler.fit_transform(new_encoded_data[scale_cols])
print_unique(new_encoded_data)

gender [0 1]
SeniorCitizen [0 1]
Partner [1 0]
Dependents [0 1]
tenure [0.         0.46478873 0.01408451 0.61971831 0.09859155 0.29577465
 0.12676056 0.38028169 0.85915493 0.16901408 0.21126761 0.8028169
 0.67605634 0.33802817 0.95774648 0.71830986 0.98591549 0.28169014
 0.15492958 0.4084507  0.64788732 1.         0.22535211 0.36619718
 0.05633803 0.63380282 0.14084507 0.97183099 0.87323944 0.5915493
 0.1971831  0.83098592 0.23943662 0.91549296 0.11267606 0.02816901
 0.42253521 0.69014085 0.88732394 0.77464789 0.08450704 0.57746479
 0.47887324 0.66197183 0.3943662  0.90140845 0.52112676 0.94366197
 0.43661972 0.76056338 0.50704225 0.49295775 0.56338028 0.07042254
 0.04225352 0.45070423 0.92957746 0.30985915 0.78873239 0.84507042
 0.18309859 0.26760563 0.73239437 0.54929577 0.81690141 0.32394366
 0.6056338  0.25352113 0.74647887 0.70422535 0.35211268 0.53521127]
PhoneService [0 1]
MultipleLines [0 1]
OnlineSecurity [0 1]
OnlineBackup [1 0]
DeviceProtection [0 1]
TechSupport [0 1]
Stream

In [None]:
from imblearn.over_sampling import SMOTE
x = new_encoded_data.drop('Churn', axis=1)
y = new_encoded_data['Churn'].copy()

oversample = SMOTE()
x, y = oversample.fit_resample(x, y)
print(f"x len: {len(x)}\ty len: {len(y)}")

In [20]:
split_idx = int(len(new_encoded_data) * 0.8)
x_train = new_encoded_data[:split_idx].drop('Churn', axis=1)
y_train = new_encoded_data[:split_idx]['Churn']
x_val = new_encoded_data[split_idx:].drop('Churn', axis=1)
y_val = new_encoded_data[split_idx:]['Churn']