In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import accuracy_score

#library for privacy engine
from opacus import PrivacyEngine

#library for carbon emission
from codecarbon import EmissionsTracker


In [3]:
# Mini-batch size handed to get_dataloader; it is also the numerator of the
# Opacus sample_rate computed further down (batch_size / len(train_ds)).
batch_size = 50

In [4]:
class ChurnDataset(Dataset):
    """Customer-churn CSV exposed as a PyTorch ``Dataset``.

    The CSV is read with pandas, the identifier columns ``Surname``,
    ``CustomerId`` and ``RowNumber`` are dropped, the categorical columns
    (``Geography``, ``Gender``) are one-hot encoded and every predictor is
    standardised with ``StandardScaler``.

    Attributes:
        categorical: names of the columns that are one-hot encoded.
        target: name of the label column (``"Exited"``).
        churn_frame: encoded (unscaled) frame, label column included.
        X: standardised predictors as a DataFrame (positional column names).
        y: label column as a Series.
    """

    def __init__(self, csv_file):
        """Load ``csv_file`` and prepare the scaled predictors and the labels.

        Args:
            csv_file: path (or anything ``pd.read_csv`` accepts) of the CSV.
        """
        df = pd.read_csv(csv_file)

        # Identifier-like columns are not used as predictors.
        df = df.drop(["Surname", "CustomerId", "RowNumber"], axis=1)

        # Grouping variable names
        self.categorical = ["Geography", "Gender"]
        self.target = "Exited"

        # One-hot encoding of categorical variables. Naming the columns
        # explicitly keeps ``columns`` and ``prefix`` in lock-step instead of
        # relying on pandas' dtype-based auto-detection.
        self.churn_frame = pd.get_dummies(
            df, columns=self.categorical, prefix=self.categorical
        )

        # Save target and predictors
        self.X = self.churn_frame.drop(self.target, axis=1)
        self.y = self.churn_frame[self.target]

        # NOTE(review): the scaler is fit on the full table, i.e. before
        # get_dataloader() performs the train/test split, so test-set
        # statistics influence the training features.
        scaler = StandardScaler()
        X_array = scaler.fit_transform(self.X)
        self.X = pd.DataFrame(X_array)

    def __len__(self):
        """Return the number of rows in the encoded frame."""
        return len(self.churn_frame)

    def __getitem__(self, idx):
        """Return ``[features, label]`` for the row(s) at position ``idx``.

        Args:
            idx: integer position, list of positions, or a ``torch.Tensor``
                holding either (as produced when using ``random_split``).
        """
        # Convert idx from tensor to list due to pandas bug (that arises when using pytorch's random_split)
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()

        # Index both containers positionally so features and label always refer
        # to the same row (label-based ``self.y[idx]`` fails for e.g. idx=-1).
        return [self.X.iloc[idx].values, self.y.iloc[idx]]

In [5]:
def get_CHURN_model(input_dim=13, hidden_dim=64):
    """Build the feed-forward network used for churn prediction.

    The network is ``Linear -> ReLU -> Linear -> ReLU -> Linear`` and emits a
    single raw logit per sample (no sigmoid), which is what
    ``nn.BCEWithLogitsLoss`` expects.

    Args:
        input_dim: number of input features. Defaults to 13, the width that
            was previously hard-coded.
        hidden_dim: width of both hidden layers. Defaults to 64.

    Returns:
        An ``nn.Sequential`` mapping ``(batch, input_dim)`` to ``(batch, 1)``.
    """
    model = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, 1),
    )
    return model

In [6]:
def get_dataloader(csv_file, batch_size, train_fraction=0.8, seed=None):
    """Load the churn CSV and split it into train/test loaders.

    Args:
        csv_file: path of the CSV handed to ``ChurnDataset``.
        batch_size: mini-batch size for both loaders.
        train_fraction: share of rows assigned to the training subset; the
            remainder becomes the test subset. Defaults to 0.8.
        seed: optional integer. When given, the split and the training
            shuffle order are driven by a dedicated ``torch.Generator`` so the
            result is reproducible; when ``None`` the global RNG is used.

    Returns:
        ``(trainloader, testloader, trainset, testset)``.

    Raises:
        ValueError: if ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    # Load dataset
    dataset = ChurnDataset(csv_file)

    # Split into training and test
    train_size = int(train_fraction * len(dataset))
    test_size = len(dataset) - train_size

    generator = None
    if seed is None:
        trainset, testset = random_split(dataset, [train_size, test_size])
    else:
        generator = torch.Generator().manual_seed(seed)
        trainset, testset = random_split(
            dataset, [train_size, test_size], generator=generator
        )

    trainloader = DataLoader(
        trainset, batch_size=batch_size, shuffle=True, generator=generator
    )

    # Evaluation does not benefit from shuffling; a fixed order keeps
    # per-batch test results comparable between runs.
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

    return trainloader, testloader, trainset, testset

In [7]:
def train(trainloader, net, optimizer, n_epochs=100, device="cpu"):
    """Train ``net`` as a binary classifier and return it.

    Each batch is cast to float, passed through ``net`` and scored with
    ``nn.BCEWithLogitsLoss`` (so ``net`` must emit one raw logit per sample).
    The mean batch loss is printed once per epoch.

    Args:
        trainloader: iterable yielding ``(inputs, labels)`` batches; labels
            are 1-D and are reshaped to ``(batch, 1)`` to match the logits.
        net: the ``nn.Module`` to optimise (moved to ``device``).
        optimizer: optimizer already bound to ``net``'s parameters.
        n_epochs: number of passes over ``trainloader``.
        device: device on which to train. Defaults to ``"cpu"``.

    Returns:
        The trained ``net`` (same object that was passed in).
    """
    net = net.to(device)
    # Be explicit about the mode in case the caller left the module in eval().
    net.train()

    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(n_epochs):
        running_loss = 0.0
        n_batches = 0

        for inputs, labels in trainloader:
            inputs = inputs.to(device).float()
            # BCEWithLogitsLoss needs float targets shaped like the logits.
            targets = labels.to(device).float().unsqueeze(1)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        # An empty loader has no loss to average; report NaN instead of
        # dividing by zero.
        mean_loss = running_loss / n_batches if n_batches else float("nan")
        print("Epoch {} - Training loss: {}".format(epoch, mean_loss))

    return net

In [8]:
# Location of the churn CSV, relative to the notebook's working directory.
csv_file = "../data/churn.csv"

# Build the train/test loaders. The underlying subsets are kept as well:
# len(train_ds) is needed later to derive the Opacus sample_rate.
trainloader, testloader, train_ds, test_ds = get_dataloader(csv_file, batch_size)

In [9]:


# Baseline (non-private) run: fresh model, Adam with weight decay, 50 epochs.
net = get_CHURN_model()
optimizer = optim.Adam(net.parameters(), weight_decay=0.0001, lr=0.003)
model = train(trainloader, net, optimizer, 50)  # 4th positional argument is n_epochs

[2022-02-25 23:46:15.238 pytorch-1-8-gpu-py3-ml-g4dn-xlarge-60bd0d07a83be181dcf7335baae2:825 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-02-25 23:46:15.271 pytorch-1-8-gpu-py3-ml-g4dn-xlarge-60bd0d07a83be181dcf7335baae2:825 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
Epoch 0 - Training loss: 0.44410876017063855
Epoch 1 - Training loss: 0.3670849752612412
Epoch 2 - Training loss: 0.3505032489076257
Epoch 3 - Training loss: 0.34592474773526194
Epoch 4 - Training loss: 0.34161686031147837
Epoch 5 - Training loss: 0.3403101363219321
Epoch 6 - Training loss: 0.33542798142880204
Epoch 7 - Training loss: 0.33245804691687225
Epoch 8 - Training loss: 0.3322856665588915
Epoch 9 - Training loss: 0.33103214274160564
Epoch 10 - Training loss: 0.32561915488913656
Epoch 11 - Training loss: 0.3244193574879318
Epoch 12 - Training loss: 0.3228849684819579
Epoch 13 - Training loss: 0.3208110096864402
Epoch 14

In [10]:
# Differential-privacy settings handed to the Opacus PrivacyEngine in the next cell.
max_per_sample_grad_norm = 1.5  # passed as max_grad_norm (per-sample gradient clipping bound)
sample_rate = batch_size/len(train_ds)  # share of the training subset contained in one batch
noise_multiplier = 0.8  # passed as noise_multiplier; presumably noise std relative to the clipping bound — confirm in Opacus docs

In [11]:
# Measure the energy use / emissions of the differentially private run.
tracker = EmissionsTracker(project_name = "churn_prediction",
                           output_dir = "../output/",
                           measure_power_secs = 15,
                           save_to_file = True)

tracker.start()
try:
    # Fresh model and optimizer so this run does not start from the baseline weights.
    net = get_CHURN_model()

    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001, lr=0.003)

    # Wrap the model so that per-sample gradients are clipped and noised.
    privacy_engine = PrivacyEngine(
        net,
        max_grad_norm=max_per_sample_grad_norm,
        noise_multiplier = noise_multiplier,
        sample_rate = sample_rate,
    )

    privacy_engine.attach(optimizer)

    # The 4th argument of train() is the epoch count. The original call passed
    # ``batch_size`` here, which only worked because both values were 50;
    # spell the epoch count out so that changing the batch size does not
    # silently change the training length (50 matches the baseline run).
    model = train(trainloader, net, optimizer, n_epochs=50)
finally:
    # Always stop the tracker, even if training fails, so the measurement is
    # finalised and no tracker is left running in the kernel.
    emissions: float = tracker.stop()

  "A ``sample_rate`` has been provided."
  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "


Epoch 0 - Training loss: 0.5739273639395833
Epoch 1 - Training loss: 0.5450084384530782
Epoch 2 - Training loss: 0.5399808972142637
Epoch 3 - Training loss: 0.5380573300644755
Epoch 4 - Training loss: 0.5396806959062814
Epoch 5 - Training loss: 0.5379347333684563
Epoch 6 - Training loss: 0.5379639642313123
Epoch 7 - Training loss: 0.5341536544263363
Epoch 8 - Training loss: 0.5255999676883221
Epoch 9 - Training loss: 0.527379980077967
Epoch 10 - Training loss: 0.5191250344272703
Epoch 11 - Training loss: 0.5165148722939193
Epoch 12 - Training loss: 0.5069711779244244
Epoch 13 - Training loss: 0.5001096079126001
Epoch 14 - Training loss: 0.5008684289641678
Epoch 15 - Training loss: 0.4996961490251124
Epoch 16 - Training loss: 0.49414023593999445
Epoch 17 - Training loss: 0.4997370705939829
Epoch 18 - Training loss: 0.5062210355419665
Epoch 19 - Training loss: 0.5157987097278237
Epoch 20 - Training loss: 0.5177786984480918
Epoch 21 - Training loss: 0.5228384896647185
Epoch 22 - Training 

In [12]:
# Report the privacy budget consumed by the DP training run.
print("**** Differential Privacy *******")
# get_privacy_spent() yields the epsilon and the alpha for which it was obtained
# (presumably the best Rényi order — confirm in Opacus docs); only epsilon is printed.
epsilon, best_alpha = privacy_engine.get_privacy_spent()
print (f" ε = {epsilon:.2f}, δ = {privacy_engine.target_delta}")

# Value returned by tracker.stop() for the DP training cell.
print("**** Emissions Information*******")
print(emissions)

 ε = 6.39, δ = 1e-06
