## Data Preprocessing

In [1]:
# Imports
import numpy as np
import pandas as pd
# import dcMinMaxFunctions as dc
# import dcor
from scipy.misc import derivative
from sklearn.model_selection import train_test_split
import math
import torch
from scipy import stats
import wandb
import torch

In [2]:
df=pd.read_csv("data/Churn_Modelling.csv")

In [3]:
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [4]:
df=pd.read_csv("data/Churn_Modelling.csv")
# df=df.drop(['duration', 'pdays'],axis=1) # duration gives away the answer, and pdays has too much missing info

X = df.loc[:, df.columns != 'Exited'].replace(dict(yes=True, no=False))
Y = df.loc[:, ['Exited']].replace(dict(yes=True, no=False))

In [5]:
categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
outputs = ['Exited']

In [6]:
for category in categorical_columns:
    df[category] = df[category].astype('category')

In [7]:
def OHE(x):
    dim = np.max(x)
    y = np.zeros((len(x),dim+1))
    for i in range(len(x)):
        y[i][x[i]] = 1
    return(y)

In [8]:
geo = OHE(df['Geography'].cat.codes.values)
gen =  np.asarray(df['Gender'].cat.codes.values)
hcc =  np.asarray(df['HasCrCard'].cat.codes.values)
iam =  np.asarray(df['IsActiveMember'].cat.codes.values)

categorical_data = np.stack(( gen, hcc, iam), axis=1)
# categorical_data = torch.tensor(categorical_data, dtype=torch.int64)
numerical_data = np.stack([df[col].values for col in numerical_columns], 1)
# numerical_data = torch.tensor(numerical_data, dtype=torch.float)
X = np.concatenate((numerical_data, categorical_data,geo), axis=1)
Y = df[outputs].values
# outputs = torch.tensor(df[outputs].values).flatten()

In [9]:
def normalize(x):
    x_normed = x / x.max(0, keepdim=True)[0]
    return x_normed



In [10]:
X = torch.Tensor(X)
Y = torch.Tensor(Y)
X = normalize(X)

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [22]:
def accuracy(net,X_test,Y_test):
    correct = 0
    total = 0
    with torch.no_grad():
        outputs = net(X_test)
        predicted = (outputs > 0.5).float()
        total += Y_test.size(0)
        correct += (predicted == Y_test).sum().item()
    return(100 * correct / total)


def train_model(net,trainloader,optimizer,epochs,rate = 10,device= torch.device('cpu'),print_cond = True,privacy_engine = None):
    criterion = nn.BCELoss(reduction= 'none')
    counter =0
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
    
        # data = data.to(device)
        net = net.to(device)
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            
            inputs = data[0].to(device)
            labels = data[1].to(device)
          
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs,labels)
            loss.backward(torch.ones_like(loss))
            optimizer.step()

            # print statistics
            # print(loss.sum().shape)
            running_loss += loss.sum()
            counter+=1
        DELTA = 1e-4
        if (epoch+1) % rate == 0:
            if(privacy_engine != None):
                epsilon = privacy_engine.get_epsilon(DELTA)
            else:
                epsilon = 0
            print(
                f"\tTrain Epoch: {epoch} \t"
                f"Loss: {(running_loss/counter):.6f} "
                # f"Acc@1: {np.mean(top1_acc) * 100:.6f} "
                f"(ε = {epsilon:.2f}, δ = {DELTA})"
                
                )
            print(accuracy(net,X_train.to('cuda'),Y_train.to('cuda')))
            print(accuracy(net,X_test.to('cuda'),Y_test.to('cuda')))
            print(accuracy(net,X.to('cuda'),Y.to('cuda')))
            
            counter = 0
            # if i % 100 == 99:    # print every 2000 mini-batches
        # if(epoch%rate==rate-1):
        #     if(print_cond):
        #         print('[%d, %5d] loss: %.10f' %
        #                 (epoch + 1,0, running_loss / 100))
        #         running_loss = 0.0
            # if(privacy_engine != None):
                
            #     eps_val.append(privacy_engine.get_epsilon(delta))
            #     acc.append(accuracy(net,X_test,Y_test))


    print('Finished Training')


def load_data(path):
    df=pd.read_csv("data/Churn_Modelling.csv")
    X = df.loc[:, df.columns != 'Exited'].replace(dict(yes=True, no=False))
    Y = df.loc[:, ['Exited']].replace(dict(yes=True, no=False))
    categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
    numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    outputs = ['Exited']
    for category in categorical_columns:
        df[category] = df[category].astype('category')
    geo = OHE(df['Geography'].cat.codes.values)
    gen =  np.asarray(df['Gender'].cat.codes.values)
    hcc =  np.asarray(df['HasCrCard'].cat.codes.values)
    iam =  np.asarray(df['IsActiveMember'].cat.codes.values)

    categorical_data = np.stack(( gen, hcc, iam), axis=1)
    # categorical_data = torch.tensor(categorical_data, dtype=torch.int64)
    numerical_data = np.stack([df[col].values for col in numerical_columns], 1)
    # numerical_data = torch.tensor(numerical_data, dtype=torch.float)
    X = np.concatenate((numerical_data, categorical_data,geo), axis=1)
    Y = df[outputs].values
    # outputs = torch.tensor(df[outputs].values).flatten()
    X = torch.Tensor(X)
    Y = torch.Tensor(Y)
    X = normalize(X)
    return X,Y
def normalize(x):
    x_normed = x / x.max(0, keepdim=True)[0]
    return x_normed



def OHE(x):
    dim = max(x)
    y = np.zeros((len(x),dim+1))
    for i in range(len(x)):
        y[i][x[i]] = 1
    return(y)

In [13]:
X,Y = load_data("data/Churn_Modelling.csv")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

train = torch.utils.data.TensorDataset(X_train,Y_train)

trainloader = torch.utils.data.DataLoader(train, batch_size=1000,
                                        shuffle=True, num_workers=2)

In [18]:
from opacus.validators import ModuleValidator
from opacus import PrivacyEngine
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

        
model2 = nn.Sequential(
    nn.Linear(12, 48),
    nn.ReLU(),
    nn.Linear(48, 64),
    nn.ReLU(),
    nn.Linear(64, 1),
    nn.Sigmoid(

    )
)
optim2 = torch.optim.Adam(model2.parameters(),lr=0.01,weight_decay=1e-4)

train_model(model2,trainloader,optim2,1000,100,device=torch.device('cuda'))

86.8375
86.4
86.75
87.8125
87.1
87.67
88.0625
86.35
87.72
88.4
85.8
87.88
88.75
85.85
88.17
88.45
85.4
87.84
89.225
85.6
88.5
89.225
85.95
88.57
89.8
85.45
88.93
90.5125
84.95
89.4
Finished Training


In [25]:
model= nn.Sequential(
            nn.Linear(12, 32),
            nn.ReLU(),
            # nn.Linear(48, 32),
            # nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(

            )
        )
model.to('cuda')
print(accuracy(model,X_test.to('cuda'),Y_test.to('cuda')))
print(accuracy(model,X_train.to('cuda'),Y_train.to('cuda')))
from opacus import PrivacyEngine
privacy_engine = PrivacyEngine()


target_eps = 1.0
optim = torch.optim.Adam(model.parameters(),lr=0.01,weight_decay=1e-4)
model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
    
    module=model,
    optimizer=optim,
    data_loader=trainloader,
    epochs=100,
    target_epsilon=target_eps,
    target_delta= 1e-4,
    max_grad_norm=3.0,
)

train_model(model,train_loader,optimizer,100,10,device=torch.device('cuda'),privacy_engine=privacy_engine)


61.7
59.825


  z = np.log((np.exp(t) + q - 1) / q)


	Train Epoch: 9 	Loss: 178.096024 (ε = 0.28, δ = 0.0001)
79.55
79.95
79.63
	Train Epoch: 19 	Loss: 189.982193 (ε = 0.41, δ = 0.0001)
79.55
79.95
79.63
	Train Epoch: 29 	Loss: 184.547592 (ε = 0.51, δ = 0.0001)
79.55
79.95
79.63
	Train Epoch: 39 	Loss: 184.139572 (ε = 0.60, δ = 0.0001)
79.55
79.95
79.63


KeyboardInterrupt: 