<a href="https://colab.research.google.com/github/MathildaAsemota/Privacy/blob/main/Mnist3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Imports
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split  # For splitting the data
from torch.utils.data import SubsetRandomSampler, DataLoader  # For sampling and DataLoader
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd



# Device Config
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE: the model-setup cell further down reassigns `device` to the CPU, so this
# choice is overridden before the model is created.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
n_new_classes = 2   # output units of the network; only digits 0 and 1 are kept
n_epochs = 40       # number of calls to train() in the epoch loop
new_batch_size = 64  # NOTE(review): not referenced by any cell visible in this notebook
learning_rate = 0.0001  # step size passed to the Adam optimizer
n_features = 28 * 28    # input size of the first linear layer (one value per pixel of a 28x28 image)
n_hidden = 100          # width of the single hidden layer


In [None]:
# Both MNIST splits share the same settings: cache under ./data, download when
# missing, and run every image through ToTensor().
mnist_kwargs = dict(root="data", download=True, transform=transforms.ToTensor())

# Training split.
train_data = torchvision.datasets.MNIST(train=True, **mnist_kwargs)

# Test split.
test_data = torchvision.datasets.MNIST(train=False, **mnist_kwargs)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 39.1MB/s]


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 1.26MB/s]


Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 10.8MB/s]


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 8.07MB/s]

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw






In [None]:
class YourSampler(torch.utils.data.sampler.Sampler):
    """Sampler that yields only the dataset indices whose mask entry is non-zero.

    Args:
        mask: 1-D tensor (or sequence) with one entry per dataset item; a
            non-zero entry means "include this item".
        data_source: the dataset being sampled from (stored for reference).
    """

    def __init__(self, mask, data_source):
        self.mask = mask
        self.data_source = data_source

    def _selected_indices(self):
        """Return the positions of the non-zero mask entries as a list of ints."""
        # Read the instance's own mask (not a module-level variable) so the
        # sampler works no matter what other names exist in the notebook.
        return torch.nonzero(torch.as_tensor(self.mask)).flatten().tolist()

    def __iter__(self):
        return iter(self._selected_indices())

    def __len__(self):
        # Report how many samples are actually yielded, so that
        # len(DataLoader) matches the number of batches produced.
        return len(self._selected_indices())

mnist = torchvision.datasets.MNIST(root="data", download=True, transform=transforms.ToTensor())
# Build the 0/1 selection mask straight from the label tensor instead of
# indexing (and transforming) every image just to read its label.
mask = ((mnist.targets == 0) | (mnist.targets == 1)).long()
sampler = YourSampler(mask, mnist)
# The sampler decides which items are visited, so shuffling stays off.
new_data = torch.utils.data.DataLoader(mnist, sampler=sampler, shuffle=False)



In [None]:
# Peek at one batch only, to confirm the dtype of the image tensors
for first_images, _ in new_data:
    print("Images dtype:", first_images.dtype)
    break  # a single batch is enough

Images dtype: torch.float32


In [None]:
# Walk the filtered DataLoader once, keeping every batch of images and labels
image_batches, label_batches = [], []
for batch_images, batch_targets in new_data:
    image_batches.append(batch_images)
    label_batches.append(batch_targets)

# Join the per-batch tensors along the first (sample) dimension
all_images = torch.cat(image_batches)
all_labels = torch.cat(label_batches)

all_images.shape  # torch.Size([12665, 1, 28, 28]): [# of images, # of color channels, image height, image width]


torch.Size([12665, 1, 28, 28])

**Trying the split test**

First, convert the extracted `new_data` into a pandas DataFrame.

In [None]:
import pandas as pd

# Collect one (flattened image, label) pair per sample
data_list = []
for images, targets in new_data:
    flat_images = images.view(images.size(0), -1).numpy()  # one row of pixels per image
    data_list.extend(zip(flat_images, targets.numpy()))

# Column 'x' holds the flattened image array, column 'y' the digit label
full_data = pd.DataFrame(data_list, columns=['x', 'y'])
print(full_data)

# Column dtypes of the assembled frame
print(full_data.dtypes)



                                                       x  y
0      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0
1      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1
2      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1
3      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1
4      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1
...                                                  ... ..
12660  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0
12661  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1
12662  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1
12663  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0
12664  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1

[12665 rows x 2 columns]
x    object
y     int64
dtype: object


In [None]:
# Show the label column on its own to sanity-check the extracted classes
print(full_data['y'])

0        0
1        1
2        1
3        1
4        1
        ..
12660    0
12661    1
12662    1
12663    0
12664    1
Name: y, Length: 12665, dtype: int64


In [None]:
# Sort by label so each class is contiguous, then report how many samples each class has.
# (A bare `data.head()` in the middle of a cell is never displayed, so it is dropped.)
data = full_data.sort_values('y').reset_index(drop=True)

data['y'].value_counts()

Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
1,6742
0,5923


Now assign the samples to batches for the split test, giving each batch a deliberately unbalanced class proportion.

In [None]:
batch_size = 10
p = 0.7 # The proportion of class 0 instances for all even batches
q = 0.9 # The proportion of class 0 instances for all odd batches

# Fixed seed so that Restart & Run All reproduces the same batch assignment.
rng = np.random.RandomState(0)

full_data['Batch'] = -1   # -1 marks "not assigned to any batch"

# Shuffle each class's row labels once; batches are then carved off the front of
# each pool. This is a random draw without replacement, like sampling from the
# still-unassigned rows, but without re-filtering the whole frame per batch.
class_a_pool = rng.permutation(full_data.index[full_data['y'] == 0].to_numpy())
class_b_pool = rng.permutation(full_data.index[full_data['y'] == 1].to_numpy())
class_a_used = 0
class_b_used = 0

batch_index = 0

while True:
    prop = p if (batch_index % 2 == 0) else q
    num_class_a_elts = int(batch_size * prop)
    num_class_b_elts = batch_size - num_class_a_elts

    # Stop as soon as either class can no longer fill its share of a batch.
    if len(class_a_pool) - class_a_used < num_class_a_elts or \
        len(class_b_pool) - class_b_used < num_class_b_elts:
        print("Total number of batches:", batch_index)
        break

    class_a_rows = class_a_pool[class_a_used:class_a_used + num_class_a_elts]
    class_b_rows = class_b_pool[class_b_used:class_b_used + num_class_b_elts]
    full_data.loc[class_a_rows, 'Batch'] = batch_index
    full_data.loc[class_b_rows, 'Batch'] = batch_index

    class_a_used += num_class_a_elts
    class_b_used += num_class_b_elts
    batch_index += 1

Total number of batches: 740


In [None]:
# Train-Test Split (whole batches go to one side or the other)
test_size = 0.2
full_data = full_data.loc[full_data['Batch'] != -1]  # keep only rows that were assigned to a batch

num_batches = full_data['Batch'].nunique()
print("Number of batches:", num_batches)

num_test_batches = int(num_batches * test_size)
num_train_batches = num_batches - num_test_batches

# Batches numbered below the cut-off are used for training, the rest for testing.
# NOTE: `test_data` here replaces the torchvision dataset of the same name created earlier.
is_training_row = full_data['Batch'] < num_train_batches
training_data = full_data[is_training_row]
test_data = full_data[~is_training_row]

num_batches = training_data['Batch'].nunique()
print("Number of training batches:", num_batches)

num_t_batches = test_data['Batch'].nunique()
print("Number of testing batches:", num_t_batches)

input_col = ['x']


Number of batches: 740
Number of training batches: 592
Number of testing batches: 148


In [None]:
# Group the training rows by batch id; rows of one batch share the same 'Batch' value
training_data_batches = training_data.groupby('Batch')

# Display only the first batch of the training data
first_group = next(iter(training_data_batches), None)
if first_group is not None:
    num, batch = first_group
    print("Batch:", num)
    print(batch)

Batch: 0
                                                       x  y  Batch
700    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0      0
751    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0      0
5579   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0      0
6656   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0      0
6709   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1      0
7418   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0      0
7507   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1      0
9528   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0      0
11582  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1      0
12384  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0      0


define the batch statistics function

In [None]:
def batch_stats(data):
    """Compute per-batch class percentages and balancing weights.

    Args:
        data: DataFrame with a 'Batch' column (grouping key) and a 'y' column
            of labels; only labels 0 and 1 are counted.

    Returns:
        (batch_proportions, batch_weights): two lists with one entry per batch,
        in the order produced by ``groupby('Batch')``. Each proportions entry is
        ``[percent of 0s, percent of 1s]``; each weights entry is
        ``[weight for class 0, weight for class 1]``.
    """
    batch_proportions, batch_weights = [], []

    for _, rows in data.groupby('Batch'):
        labels = rows['y']
        n_zero = (labels == 0).sum()
        n_one = (labels == 1).sum()
        n_total = n_zero + n_one

        # Percentages of each class; a batch without any 0/1 labels reports 0.0 for both
        if n_total > 0:
            percentages = [(n_zero / n_total) * 100, (n_one / n_total) * 100]
        else:
            percentages = [0.0, 0.0]

        # Weights total / (2 * class count); fall back to 1.0 when a class is absent
        if n_zero > 0 and n_one > 0:
            weights = [n_total / (2 * n_zero), n_total / (2 * n_one)]
        else:
            weights = [1.0, 1.0]

        batch_proportions.append(percentages)
        batch_weights.append(weights)

    return batch_proportions, batch_weights





My neural network class

In [None]:
# NeuralNet Class
class NeuralNetwork(nn.Module):
    """Fully connected network with one ReLU hidden layer and a linear output layer.

    Args:
        n_features: size of each input vector.
        n_hidden: number of hidden units.
        n_new_classes: number of output units.
    """

    def __init__(self, n_features: int, n_hidden: int, n_new_classes: int) -> None:
        super().__init__()
        self.h1 = nn.Linear(n_features, n_hidden)       # input -> hidden
        self.out = nn.Linear(n_hidden, n_new_classes)   # hidden -> one raw score per output unit

    def forward(self, x):
        """Return the raw output-layer scores for ``x`` (no softmax or sigmoid is applied)."""
        hidden = self.h1(x).relu()
        return self.out(hidden)

define a custom loss

In [None]:
class LossFunction(nn.Module):
    """Squared error between the mean model output of a batch and a target mean.

    Only an aggregate target (one number per batch) is needed, not a label per
    sample.
    """

    def __init__(self):
        super().__init__()

    def forward(self, predictions, mean_l):
        """Return ``(mean(predictions) - mean_l) ** 2`` as a scalar tensor.

        Args:
            predictions: tensor of model outputs for one batch. The mean is
                taken over every element, so for an (n, k) output it averages
                across all k columns, not just one.
            mean_l: target mean for the batch; a Python/NumPy number or a tensor.
        """
        mean_p = torch.mean(predictions)   # mean of the model's outputs for a single batch
        # as_tensor accepts numbers and tensors alike (no copy-construct warning)
        # and puts the target on the same device/dtype as the predictions.
        mean_l = torch.as_tensor(mean_l, dtype=predictions.dtype, device=predictions.device)

        return torch.square(mean_p - mean_l)    # squared error

In [None]:
# Force the CPU here, overriding the earlier CUDA/CPU selection: the training and
# evaluation cells build their input tensors without moving them to `device` and
# call .numpy() directly on model outputs.
device = torch.device('cpu')
model = NeuralNetwork(n_features, n_hidden, n_new_classes).to(device)   #initialize my model

# Loss and optimizer
# LossFunction compares the mean model output of a batch with a target mean;
# Adam updates all model parameters with the configured learning rate.
loss_fn = LossFunction()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [None]:
# Stack the per-row image arrays into one ndarray first: building a tensor from a
# list of separate ndarrays is the slow path that PyTorch warns about.
input_data = torch.tensor(np.stack(test_data['x'].tolist()), dtype=torch.float32)

input_data

  input_data = torch.tensor(test_data['x'].tolist(), dtype=torch.float32)


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

Check where the error (multilabel-indicator targets) is coming from

- My predictions tensor has a shape of (n, 2)
- For each image, the model outputs raw logits for each class [logit score for class 0, logit score for class 1]
- It needs to hold one predicted label per image (a 1-D array of shape (n,)) so it can be compared properly with the actual target labels





In [None]:
# Raw model outputs for the test inputs, one row per image and one column per output unit
raw_outputs = model(input_data).detach().numpy()
# Decision rule on the first output column: below 0.5 -> label 1, otherwise label 0
predictions = np.where(raw_outputs[:, 0] < 0.5, 1, 0)

predictions[0:5]

array([1, 1, 1, 1, 1])

In [None]:
# NOTE: despite the `class_0` in these names, the filter selects the rows labelled 1.
class_0_data = test_data[test_data['y'] == 1]
# Stack into a single ndarray before converting (avoids the slow list-of-ndarrays path).
input_data_class_0 = torch.tensor(np.stack(class_0_data['x'].tolist()), dtype=torch.float32)
predictions_class_0 = model(input_data_class_0).detach().numpy()
# Mean over every raw output value produced for the label-1 rows
mu = np.mean(predictions_class_0)

mu

0.102287784

In [None]:
# True labels of the test rows; show the first five (by position)
targets = test_data['y']
targets.iloc[:5]

Unnamed: 0,y
12,0
13,1
17,0
22,1
35,0


In [None]:
def _frame_to_tensor(frame):
    """Stack the flattened images stored row-wise in ``frame['x']`` into one float32 tensor."""
    # One ndarray first: converting a list of separate ndarrays is PyTorch's slow path.
    return torch.tensor(np.stack(frame['x'].tolist()), dtype=torch.float32)


def train(epoch):
    """Train for one pass over ``training_data_batches`` and evaluate on ``test_data``.

    Training uses only each batch's class-0 proportion as the target (through
    ``loss_fn``), never the per-sample labels.

    Args:
        epoch: epoch number supplied by the caller; not used inside the function.

    Returns:
        (loss_epoch, test_acc, mu, nu):
            loss_epoch: list with the loss of every training batch.
            test_acc: accuracy of the thresholded predictions on ``test_data``.
            mu: mean raw model output over the test rows labelled 1.
            nu: mean of ``1 - raw model output`` over the test rows labelled 0.
    """
    model.train()
    loss_epoch = []

    for _, batch in training_data_batches:
        # Proportion of class-0 rows in the batch. Counting via a boolean mask
        # also works (giving 0.0) when a batch contains no class-0 row, where a
        # `value_counts()[0]` lookup would raise a KeyError.
        prop = (batch['y'] == 0).mean()

        model_input = _frame_to_tensor(batch)
        output = model(model_input)   # one row of raw scores per image

        # Loss on the batch aggregate, then a standard optimisation step
        optimizer.zero_grad()
        loss = loss_fn(output, prop)
        loss.backward()
        optimizer.step()

        # Track loss for the epoch
        loss_epoch.append(loss.detach().numpy())

    # ---- Evaluation on the held-out batches (no gradients needed) ----
    model.eval()
    with torch.no_grad():
        test_outputs = model(_frame_to_tensor(test_data)).numpy()

    # Decision rule on the first output column: below 0.5 -> label 1, otherwise label 0
    predictions = np.where(test_outputs[:, 0] < 0.5, 1, 0)

    targets = test_data['y']
    test_acc = accuracy_score(targets, predictions)  # Accuracy = correct predictions / total predictions

    is_class_1 = (targets == 1).to_numpy()
    is_class_0 = (targets == 0).to_numpy()

    # mu: average raw output on class-1 rows (these should be pushed towards 0)
    mu = np.mean(test_outputs[is_class_1])

    # nu: average of (1 - raw output) on class-0 rows (these should be pushed towards 1)
    nu = np.mean(1 - test_outputs[is_class_0])

    return loss_epoch, test_acc, mu, nu





In [None]:
# Per-epoch records: batch losses, test accuracy, and the two class-wise error means
loss_history, test_history, mu_history, nu_history = [], [], [], []
histories = (loss_history, test_history, mu_history, nu_history)

for epoch in range(1, n_epochs + 1):
    epoch_result = train(epoch)
    loss_epoch, test_acc, mu, nu = epoch_result
    print("Epoch:", epoch, "Loss:", np.mean(loss_epoch), "Test Accuracy:", test_acc, "mu, nu:", mu, nu)
    # Append each returned value to its matching history list
    for history, value in zip(histories, epoch_result):
        history.append(value)

Epoch: 1 Loss: 0.016057061 Test Accuracy: 0.9898648648648649 mu, nu: 0.15643258 0.081655376
Epoch: 2 Loss: 0.0038652308 Test Accuracy: 0.9912162162162163 mu, nu: 0.06065909 0.05175636
Epoch: 3 Loss: 0.0034738062 Test Accuracy: 0.9925675675675676 mu, nu: 0.044213854 0.04188663
Epoch: 4 Loss: 0.0031710896 Test Accuracy: 0.9932432432432432 mu, nu: 0.041161552 0.035543103
Epoch: 5 Loss: 0.0028468973 Test Accuracy: 0.9939189189189189 mu, nu: 0.040104523 0.030767724
Epoch: 6 Loss: 0.0025236995 Test Accuracy: 0.9939189189189189 mu, nu: 0.041266307 0.027371936
Epoch: 7 Loss: 0.0022148436 Test Accuracy: 0.9939189189189189 mu, nu: 0.04289672 0.024836455
Epoch: 8 Loss: 0.0019164982 Test Accuracy: 0.995945945945946 mu, nu: 0.04303621 0.021644443
Epoch: 9 Loss: 0.0016496099 Test Accuracy: 0.9966216216216216 mu, nu: 0.04328472 0.01946662
Epoch: 10 Loss: 0.001426054 Test Accuracy: 0.9979729729729729 mu, nu: 0.043185174 0.017955247
Epoch: 11 Loss: 0.0012394551 Test Accuracy: 0.9979729729729729 mu, nu:

Testing with a dataloader

In [None]:
from torch.utils.data import DataLoader, TensorDataset   # TensorDataset allows for the creation of a dataset from tensors

# Convert the 'x' and 'y' columns to tensors.
# Each 'x' entry is one flattened image array; stacking them into a single ndarray
# first avoids the slow list-of-ndarrays tensor construction.
test_images = torch.tensor(np.stack(test_data['x'].tolist()), dtype=torch.float32)
test_labels = torch.tensor(test_data['y'].tolist(), dtype=torch.int)

# Create a TensorDataset pairing every image with its label
test_dataset = TensorDataset(test_images, test_labels)

# Create a DataLoader; evaluation does not depend on sample order, so no shuffling
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Evaluate through the DataLoader with the SAME decision rule that train() uses
# for its test accuracy: first output column below 0.5 -> label 1, otherwise 0.
# Taking the argmax over the two output columns is a different rule and is why
# this cell previously reported a much lower accuracy than the training log.
model.eval()
with torch.no_grad():
    n_samples = 0
    n_correct = 0

    for images, labels in test_loader:
        images = images.reshape(-1, 28 * 28).to(device)
        labels = labels.to(device)

        outputs = model(images)
        predictions = (outputs[:, 0] < 0.5).to(labels.dtype)

        n_samples += labels.shape[0]
        n_correct += (predictions == labels).sum().item()

    accuracy = n_correct / n_samples
    print(f"Accuracy is: {accuracy}")

Accuracy is: 0.7972972972972973
