In [48]:
import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# libraries for bias and explainability
import sagemaker
from sagemaker import Session
from sagemaker import get_execution_role
from sagemaker import clarify
from sagemaker.s3 import S3Uploader

# library for privacy engine
from opacus import PrivacyEngine

# library for carbon emission
from codecarbon import EmissionsTracker


In [49]:
# --- SageMaker context: session, region, execution role and an S3 client ---
session = Session()
region = session.boto_region_name
role = get_execution_role()
s3_client = boto3.client("s3")

# --- S3 layout: bucket plus key prefixes for the data and the Clarify reports ---
# NOTE(review): "chrun" looks like a misspelling of "churn". It is kept as-is
# because these strings are real S3 key prefixes; renaming them would point the
# notebook at different locations.
bucket = "responsibleai"
data_prefix = "chrun_prediction/data"
bias_prefix = "chrun_prediction/bias_explain"

input_data_path = "s3://{}/{}".format(bucket, data_prefix)
explainability_output_path = "s3://{}/{}/clarify-explainability".format(bucket, bias_prefix)
bias_report_output_path = "s3://{}/{}/clarify-bias".format(bucket, bias_prefix)

# --- Local input file and mini-batch size used by the loaders ---
csv_file = "../data/churn.csv"
batch_size = 50

# Echo the resolved S3 locations so they can be checked before any job runs.
print("Input data path = " + input_data_path)
print("Bias report path = " + bias_report_output_path)
print("Explainability report path = " + explainability_output_path)

Input data path = s3://responsibleai/chrun_prediction/data
Bias report path = s3://responsibleai/chrun_prediction/bias_explain/clarify-bias
Explainability report path = s3://responsibleai/chrun_prediction/bias_explain/clarify-explainability


In [53]:
# Load the raw churn table and drop the Surname, CustomerId and RowNumber columns.
df = pd.read_csv(csv_file)
df = df.drop(["Surname", "CustomerId", "RowNumber"], axis=1)

# Grouping variable names
categorical = ["Geography", "Gender"]
target = "Exited"

# One-hot encoding of categorical variables. Naming the columns explicitly keeps
# the prefix list aligned with exactly the columns being encoded.
training_data = pd.get_dummies(df, columns=categorical, prefix=categorical)

# Separate predictors from the label.
# (The label previously read from the misspelled name `training_date`, which
# raised NameError.)
churn_features = training_data.drop([target], axis=1)
churn_label = training_data[target]

# Standardise every predictor column and keep the original column names.
scaler = StandardScaler()
X_array = scaler.fit_transform(churn_features)
X = pd.DataFrame(X_array, columns=churn_features.columns)

# Show the raw and the standardised predictors side by side for a sanity check.
print(churn_features.head())
print(X.head())


   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          619   42       2       0.00              1          1   
1          608   41       1   83807.86              1          0   
2          502   42       8  159660.80              3          1   
3          699   39       1       0.00              2          0   
4          850   43       2  125510.82              1          1   

   IsActiveMember  EstimatedSalary  Geography_France  Geography_Germany  \
0               1        101348.88                 1                  0   
1               1        112542.58                 0                  0   
2               0        113931.57                 1                  0   
3               0         93826.63                 1                  0   
4               1         79084.10                 0                  0   

   Geography_Spain  Gender_Female  Gender_Male  
0                0              1            0  
1                1              1         

In [36]:
class ChurnDataSet(Dataset):
    """Map-style dataset pairing standardised feature rows with their labels.

    Args:
        df_features: Tabular predictors (one row per sample). They are
            standardised with a ``StandardScaler`` fitted on all rows passed in.
        df_target: Labels, one per row of ``df_features``, in the same order.

    Each item is a two-element list ``[features, label]`` where ``features`` is
    the row's values as a NumPy array.
    """

    def __init__(self, df_features, df_target):
        # NOTE(review): the scaler is fitted on every row given here, so if the
        # dataset is split afterwards the test rows influence the scaling.
        scaler = StandardScaler()
        X_array = scaler.fit_transform(df_features)
        self.X = pd.DataFrame(X_array)

        # Re-index labels positionally so row i of X always pairs with label i,
        # even when the incoming Series carries a non-default index.
        self.y = pd.Series(df_target).reset_index(drop=True)

    def __len__(self):
        # Number of samples; previously read a nonexistent `self.churn_frame`.
        return len(self.X)

    def __getitem__(self, idx):
        # Convert idx from tensor to list due to pandas bug (that arises when using pytorch's random_split)
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()

        # Positional lookup on both sides keeps features and label aligned.
        return [self.X.iloc[idx].values, self.y.iloc[idx]]

In [37]:
def PrepareDataSet(csv_file):
    """Read the churn CSV and return one-hot encoded predictors plus the label.

    Args:
        csv_file: Path (or any source accepted by ``pd.read_csv``) of the data.

    Returns:
        A tuple ``(X, y)``: ``X`` is the encoded frame without the "Exited"
        column and ``y`` is the "Exited" column.
    """
    # Grouping variable names
    categorical = ["Geography", "Gender"]
    target = "Exited"

    raw = pd.read_csv(csv_file)
    trimmed = raw.drop(["Surname", "CustomerId", "RowNumber"], axis=1)

    # One-hot encoding of categorical variables
    churn_frame = pd.get_dummies(trimmed, prefix=categorical)

    # Predictors first, label second
    return churn_frame.drop(target, axis=1), churn_frame["Exited"]

In [38]:
def get_CHURN_model(n_features=13, hidden_size=64):
    """Build the churn classifier: a two-hidden-layer MLP emitting one logit.

    Args:
        n_features: Width of the input layer. Defaults to 13, the value that
            was previously hard-coded.
        hidden_size: Width of both hidden layers. Defaults to 64.

    Returns:
        An ``nn.Sequential`` of Linear -> ReLU -> Linear -> ReLU -> Linear with
        a single output unit (no final activation).
    """
    model = nn.Sequential(
        nn.Linear(n_features, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, 1),
    )
    return model

In [39]:
def get_dataloader(csv_file, batch_size, dataset=None):
    """Create 80/20 train/test loaders for the churn data.

    Args:
        csv_file: Path of the churn CSV; used to build the dataset when
            ``dataset`` is not supplied.
        batch_size: Mini-batch size for both loaders.
        dataset: Optional ready-made dataset to split. When given, ``csv_file``
            is not read.

    Returns:
        ``(trainloader, testloader, trainset, testset)``.
    """
    # Previously this function relied on an undefined module-level `dataset`
    # and ignored `csv_file`; build the dataset here unless one is passed in.
    if dataset is None:
        features, labels = PrepareDataSet(csv_file)
        dataset = ChurnDataSet(features, labels)

    # Split into training and test
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    trainset, testset = random_split(dataset, [train_size, test_size])

    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
    # Shuffling is only useful for training; keep evaluation order stable.
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

    return trainloader, testloader, trainset, testset

In [40]:
def train(trainloader, net, optimizer, n_epochs=100, device="cpu"):
    """Train a binary classifier with ``BCEWithLogitsLoss``.

    Args:
        trainloader: Iterable of ``(inputs, labels)`` batches that also
            supports ``len()``.
        net: Model producing one logit per sample.
        optimizer: Optimizer already bound to ``net``'s parameters.
        n_epochs: Number of passes over ``trainloader``.
        device: Device the model and batches are moved to. Defaults to "cpu",
            the value that was previously hard-coded.

    Returns:
        The trained ``net`` (moved to ``device``).
    """
    net = net.to(device)
    net.train()

    criterion = nn.BCEWithLogitsLoss()

    # Loop-invariant: number of batches used to average the epoch loss.
    n_batches = len(trainloader)

    for epoch in range(n_epochs):
        running_loss = 0.0
        for inputs, labels in trainloader:
            inputs = inputs.to(device).float()
            # Labels arrive as a flat vector; the loss expects shape (batch, 1)
            # to match the single-logit output.
            labels = labels.to(device).float().unsqueeze(1)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # An empty loader previously raised ZeroDivisionError here.
        mean_loss = running_loss / n_batches if n_batches else float("nan")
        print("Epoch {} - Training loss: {}".format(epoch, mean_loss))

    return net

NameError: name 'boto3' is not defined

In [44]:

# prepare dataset
X, y = PrepareDataSet(csv_file)

# Wrap the predictors and labels in the PyTorch dataset.
dataset = ChurnDataSet(X, y)

# Build the train/test loaders and subsets. Later cells read `trainloader` and
# `train_ds`, which were never defined while this call was commented out.
trainloader, testloader, train_ds, test_ds = get_dataloader(csv_file, batch_size)

# NOTE(review): the Clarify cells further down expect `churn_train` and
# ../data/train_churn.csv; nothing in this notebook creates them as written.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.326221,0.293517,-1.04176,-1.225848,-0.911583,0.646092,0.970243,0.021886,0.997204,-0.578736,-0.573809,1.095988,-1.095988
1,-0.440036,0.198164,-1.387538,0.11735,-0.911583,-1.547768,0.970243,0.216534,-1.002804,-0.578736,1.74274,1.095988,-1.095988
2,-1.536794,0.293517,1.032908,1.333053,2.527057,0.646092,-1.03067,0.240687,0.997204,-0.578736,-0.573809,1.095988,-1.095988
3,0.501521,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.03067,-0.108918,0.997204,-0.578736,-0.573809,1.095988,-1.095988
4,2.063884,0.388871,-1.04176,0.785728,-0.911583,0.646092,0.970243,-0.365276,-1.002804,-0.578736,1.74274,1.095988,-1.095988


In [18]:

# Baseline (non-private) run: fresh network, Adam with a small weight decay,
# trained for 50 epochs.
net = get_CHURN_model()
optimizer = optim.Adam(
    net.parameters(),
    lr=0.003,
    weight_decay=0.0001,
)
model = train(trainloader, net, optimizer, n_epochs=50)

[2022-02-28 15:21:54.261 pytorch-1-8-gpu-py3-ml-g4dn-xlarge-60bd0d07a83be181dcf7335baae2:1222 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-02-28 15:21:54.294 pytorch-1-8-gpu-py3-ml-g4dn-xlarge-60bd0d07a83be181dcf7335baae2:1222 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
Epoch 0 - Training loss: 0.4253511395305395
Epoch 1 - Training loss: 0.3550588250160217
Epoch 2 - Training loss: 0.3438827622681856
Epoch 3 - Training loss: 0.3400544809643179
Epoch 4 - Training loss: 0.3367278458550572
Epoch 5 - Training loss: 0.3320686207152903
Epoch 6 - Training loss: 0.33003695383667947
Epoch 7 - Training loss: 0.3285841692239046
Epoch 8 - Training loss: 0.32846663082018496
Epoch 9 - Training loss: 0.32431351514533163
Epoch 10 - Training loss: 0.3216641909442842
Epoch 11 - Training loss: 0.3224272786639631
Epoch 12 - Training loss: 0.3198192465119064
Epoch 13 - Training loss: 0.31969332657754423
Epoch 14 

In [10]:
# Differential-privacy settings handed to the PrivacyEngine in the next cell.
noise_multiplier = 0.8              # passed as `noise_multiplier`
max_per_sample_grad_norm = 1.5      # passed as `max_grad_norm`
sample_rate = batch_size / len(train_ds)  # one batch as a fraction of the training subset

In [11]:
# Track the carbon footprint of the differentially-private training run.
tracker = EmissionsTracker(project_name="churn_prediction",
                           output_dir="../output/",
                           measure_power_secs=15,
                           save_to_file=True)

tracker.start()
try:
    net = get_CHURN_model()

    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001, lr=0.003)

    # Attach the privacy engine to the optimizer, configured with the clipping
    # norm, noise multiplier and sample rate defined in the previous cell.
    privacy_engine = PrivacyEngine(
        net,
        max_grad_norm=max_per_sample_grad_norm,
        noise_multiplier=noise_multiplier,
        sample_rate=sample_rate,
    )
    privacy_engine.attach(optimizer)

    # The fourth argument of train() is the epoch count. It used to receive
    # `batch_size`, which only worked because both happened to equal 50.
    model = train(trainloader, net, optimizer, n_epochs=50)
finally:
    # Always stop the tracker, even if training fails part-way.
    emissions: float = tracker.stop()

  "A ``sample_rate`` has been provided."
  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "


Epoch 0 - Training loss: 0.5739273639395833
Epoch 1 - Training loss: 0.5450084384530782
Epoch 2 - Training loss: 0.5399808972142637
Epoch 3 - Training loss: 0.5380573300644755
Epoch 4 - Training loss: 0.5396806959062814
Epoch 5 - Training loss: 0.5379347333684563
Epoch 6 - Training loss: 0.5379639642313123
Epoch 7 - Training loss: 0.5341536544263363
Epoch 8 - Training loss: 0.5255999676883221
Epoch 9 - Training loss: 0.527379980077967
Epoch 10 - Training loss: 0.5191250344272703
Epoch 11 - Training loss: 0.5165148722939193
Epoch 12 - Training loss: 0.5069711779244244
Epoch 13 - Training loss: 0.5001096079126001
Epoch 14 - Training loss: 0.5008684289641678
Epoch 15 - Training loss: 0.4996961490251124
Epoch 16 - Training loss: 0.49414023593999445
Epoch 17 - Training loss: 0.4997370705939829
Epoch 18 - Training loss: 0.5062210355419665
Epoch 19 - Training loss: 0.5157987097278237
Epoch 20 - Training loss: 0.5177786984480918
Epoch 21 - Training loss: 0.5228384896647185
Epoch 22 - Training 

In [12]:
# --- Privacy budget spent by the DP training run ---
print("**** Differential Privacy *******")
epsilon, best_alpha = privacy_engine.get_privacy_spent()
print(f" ε = {epsilon:.2f}, δ = {privacy_engine.target_delta}")

# --- Value returned by the emissions tracker when it was stopped ---
print("**** Emissions Information*******")
print(emissions)

 ε = 6.39, δ = 1e-06


In [None]:
# Upload the training CSV under the data prefix and keep its S3 URI for Clarify.
# `input_data_path` is the same "s3://<bucket>/<data_prefix>" string defined in
# the configuration cell; the existing SageMaker session is reused.
# NOTE(review): no cell in this notebook writes ../data/train_churn.csv (the
# to_csv call is commented out) — confirm the file exists before running.
train_uri = S3Uploader.upload(
    "../data/train_churn.csv",
    input_data_path,
    sagemaker_session=session,
)

In [None]:
# Processor that runs the Clarify jobs: one ml.m5.xlarge instance, using the
# execution role and session created in the configuration cell.
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=session,
)

In [None]:
# Where Clarify reads the dataset and writes the bias report.
# NOTE(review): `churn_train` is not defined by any visible cell (its creation
# is commented out) — confirm where the header list should come from.
bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=bias_report_output_path,
    label="Exited",
    headers=churn_train.columns.to_list(),
    dataset_type="text/csv",
)

# Model that Clarify calls for predictions, exchanging CSV both ways.
# NOTE(review): `model_name` is not defined in the visible notebook — TODO
# confirm which deployed model it refers to.
model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    accept_type="text/csv",
    content_type="text/csv",
)

# Probability threshold of 0.8 for the predicted label.
predictions_config = clarify.ModelPredictedLabelConfig(
    probability_threshold=0.8,
)

# Bias is measured for label value 1 over the "Gender" facet with value 0.
# NOTE(review): the preprocessing one-hot encodes Gender into Gender_Female /
# Gender_Male, so verify that a "Gender" column exists in the uploaded CSV.
bias_config = clarify.BiasConfig(
    label_values_or_threshold=[1],
    facet_name="Gender",
    facet_values_or_threshold=[0],
)

In [None]:
# Launch the Clarify bias job with all pre-training and post-training methods.
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    pre_training_methods="all",
    post_training_methods="all",
)