<a href="https://colab.research.google.com/github/MathildaAsemota/Privacy/blob/main/PrivacyPreserving_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split  # For splitting the data
from torch.utils.data import SubsetRandomSampler, DataLoader  # For sampling and DataLoader
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
n_new_classes = 2
n_epochs = 40
new_batch_size = 64
learning_rate = 0.0001
n_features = 28 * 28
n_hidden = 100


In [3]:
train_data = torchvision.datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

test_data = torchvision.datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 111] Connection refused>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 22.6MB/s]


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 111] Connection refused>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 651kB/s]


Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 111] Connection refused>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 5.61MB/s]


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 111] Connection refused>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 4.43MB/s]

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw






In [4]:
class YourSampler(torch.utils.data.sampler.Sampler):
    def __init__(self, mask, data_source):
        self.mask = mask
        self.data_source = data_source

    def __iter__(self):
        return iter([i.item() for i in torch.nonzero(mask)])

    def __len__(self):
        return len(self.data_source)

mnist = torchvision.datasets.MNIST(root="data", download=True, transform=transforms.ToTensor())
mask = [1 if mnist[i][1] == 0 or mnist[i][1] == 1 else 0 for i in range(len(mnist))]
mask = torch.tensor(mask)
sampler = YourSampler(mask, mnist)
new_data = torch.utils.data.DataLoader(mnist,sampler = sampler, shuffle=False)



In [5]:
all_images = []
all_labels = []

for images, labels in new_data:
    all_images.append(images)
    all_labels.append(labels)

all_images = torch.cat(all_images)
all_labels = torch.cat(all_labels)


In [6]:
import pandas as pd

data_list = []
for batch_data, batch_labels in new_data:
    batch_data = batch_data.view(batch_data.size(0), -1).numpy()
    batch_labels = batch_labels.numpy()
    data_list.extend(zip(batch_data, batch_labels))

full_data = pd.DataFrame(data_list, columns=['x', 'y'])


In [7]:
batch_size = 10
p = 0.7
q = 0.9

batch_index = 0

full_data['Batch'] = -1

while(True):
    prop = p if (batch_index % 2 == 0) else q
    num_class_a_elts = int(batch_size * prop)
    num_class_b_elts = batch_size - num_class_a_elts

    class_a_unassigned = full_data[(full_data['y'] == 0) & (full_data['Batch'] == -1)]
    class_b_unassigned = full_data[(full_data['y'] == 1) & (full_data['Batch'] == -1)]

    if class_a_unassigned.shape[0] < num_class_a_elts or \
        class_b_unassigned.shape[0] < num_class_b_elts:
        break
    else:
        full_data.loc[class_a_unassigned.sample(num_class_a_elts).index, 'Batch'] = batch_index
        full_data.loc[class_b_unassigned.sample(num_class_b_elts).index, 'Batch'] = batch_index
        batch_index += 1

In [8]:
test_size = 0.2
full_data = full_data[full_data['Batch'] != -1]

num_batches = full_data['Batch'].nunique()
print("Number of batches:", num_batches)

num_test_batches = int(num_batches * test_size)
num_train_batches = num_batches - num_test_batches

training_data = full_data[full_data['Batch'] < num_train_batches]
test_data = full_data[full_data['Batch'] >= num_train_batches]

num_batches = training_data['Batch'].nunique()
print("Number of training batches:", num_batches)

num_t_batches = test_data['Batch'].nunique()
print("Number of testing batches:", num_t_batches)

input_col = ['x']



Number of batches: 740
Number of training batches: 592
Number of testing batches: 148


In [9]:
training_data_batches = training_data.groupby('Batch')


In [10]:
class NeuralNetwork(nn.Module):
    def __init__(self, n_features: int, n_hidden: int, n_new_classes: int) -> None:
        super(NeuralNetwork, self).__init__()
        self.h1 = nn.Linear(n_features, n_hidden)
        self.h2 = nn.Linear(n_hidden, n_hidden)
        self.h3 = nn.Linear(n_hidden, n_hidden)
        self.out = nn.Linear(n_hidden, n_new_classes)

    def forward(self, x):
        out = torch.relu(self.h1(x))
        out = torch.relu(self.h2(out))
        out = torch.relu(self.h3(out))
        out = self.out(out)
        return out

In [11]:
class LossFunction(nn.Module):
    def __init__(self):
        super(LossFunction, self).__init__()

    def forward(self, predictions, mean_l):
        mean_p = torch.mean(predictions)
        mean_l = torch.tensor(mean_l, dtype=torch.float32)

        return torch.square(mean_p - mean_l)

In [12]:
device = torch.device('cpu')
model = NeuralNetwork(n_features, n_hidden, n_new_classes).to(device)

loss_fn = LossFunction()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [13]:
input_data = torch.tensor(test_data['x'].tolist(), dtype=torch.float32)


  input_data = torch.tensor(test_data['x'].tolist(), dtype=torch.float32)


In [14]:
predictions = model(input_data).detach().numpy()
predictions = np.where(predictions < 0.5, 1, 0)
predictions = predictions[:, 0]

predictions[0:5]

array([1, 1, 1, 1, 1])

In [15]:
class_0_data = test_data[test_data['y'] == 1]
input_data_class_0 = torch.tensor(class_0_data['x'].tolist(), dtype=torch.float32)
predictions_class_0 = model(input_data_class_0).detach().numpy()
mu = np.mean(predictions_class_0)


In [16]:
targets = test_data['y']

In [17]:
def train(epoch):

    model.train()
    loss_epoch = []

    for num, batch in training_data_batches:

        prop = batch['y'].value_counts()[0] / batch.shape[0]

        model_input = torch.tensor(batch['x'].tolist(), dtype=torch.float32)

        output = model(model_input)

        loss = loss_fn(output, prop)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loss_epoch.append(loss.detach().numpy())

    input_data = torch.tensor(test_data['x'].tolist(), dtype=torch.float32)

    predictions = model(input_data).detach().numpy()
    predictions = np.where(predictions < 0.5, 1, 0)
    predictions = predictions[:, 0]

    targets = test_data['y']
    test_acc = accuracy_score(targets, predictions)

    class_0_data = test_data[test_data['y'] == 1]
    input_data_class_0 = torch.tensor(class_0_data['x'].tolist(), dtype=torch.float32)
    predictions_class_0 = model(input_data_class_0).detach().numpy()
    mu = np.mean(predictions_class_0)

    class_1_data = test_data[test_data['y'] == 0]
    input_data_class_1 = torch.tensor(class_1_data['x'].tolist(), dtype=torch.float32)
    predictions_class_1 = model(input_data_class_1).detach().numpy()
    nu = np.mean(1 - predictions_class_1)

    return loss_epoch, test_acc, mu, nu





In [18]:
loss_history = []
test_history = []
mu_history = []
nu_history = []

for epoch in range(1, n_epochs + 1):
    loss_epoch, test_acc, mu, nu = train(epoch)
    print("Epoch:", epoch, "Loss:", np.mean(loss_epoch), "Test Accuracy:", test_acc, "mu, nu:", mu, nu)
    loss_history.append(loss_epoch)
    test_history.append(test_acc)
    mu_history.append(mu)
    nu_history.append(nu)

Epoch: 1 Loss: 0.055508144 Test Accuracy: 0.9736486486486486 mu, nu: 0.26675886 0.043991048
Epoch: 2 Loss: 0.004855336 Test Accuracy: 0.9871621621621621 mu, nu: 0.16907014 0.016604215
Epoch: 3 Loss: 0.004311548 Test Accuracy: 0.9905405405405405 mu, nu: 0.11838565 0.00975458
Epoch: 4 Loss: 0.0038837888 Test Accuracy: 0.9932432432432432 mu, nu: 0.100011386 0.011848529
Epoch: 5 Loss: 0.0034370385 Test Accuracy: 0.9945945945945946 mu, nu: 0.09048686 0.016035877
Epoch: 6 Loss: 0.0029478294 Test Accuracy: 0.995945945945946 mu, nu: 0.08165192 0.020407155
Epoch: 7 Loss: 0.002436045 Test Accuracy: 0.9966216216216216 mu, nu: 0.07294877 0.022174146
Epoch: 8 Loss: 0.0019336392 Test Accuracy: 0.9979729729729729 mu, nu: 0.068634726 0.020000666
Epoch: 9 Loss: 0.001515352 Test Accuracy: 0.9993243243243243 mu, nu: 0.06404601 0.016249107
Epoch: 10 Loss: 0.0011867757 Test Accuracy: 1.0 mu, nu: 0.05820618 0.012832006
Epoch: 11 Loss: 0.00094679015 Test Accuracy: 1.0 mu, nu: 0.051276494 0.010529151
Epoch: 1