In [45]:
import torch
from torch import nn

In [46]:
# Global Neural Network Model
class SentimentAnalysisModel_global(nn.Module):
    """Small feed-forward sentiment classifier used as the global model.

    Architecture: Linear(input_size -> 2) -> ReLU -> GroupNorm(1 group,
    2 channels) -> Dropout(0.3) -> Linear(2 -> 3).

    Args:
        input_size: Number of input features per sample. Defaults to 10000,
            the width this model was previously hard-coded to, so calling
            the constructor without arguments builds the same network.
    """

    def __init__(self, input_size=10000):
        super().__init__()
        # Keep the creation order fc1 -> fc2: the Linear layers draw their
        # initial weights from the global RNG, so reordering them would
        # change the weights obtained for a given seed.
        self.fc1 = nn.Linear(input_size, 2)
        self.fc2 = nn.Linear(2, 3)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        # GroupNorm is used in place of BatchNorm (1 group, 2 channels)
        self.groupnorm = nn.GroupNorm(1, 2)

    def forward(self, x):
        """Compute the 3 output scores for a batch.

        Args:
            x: Float tensor whose last dimension is ``input_size``
                (presumably of shape (batch, input_size) - TODO confirm).

        Returns:
            Tensor of raw scores with last dimension 3; no softmax is applied.
        """
        x = self.relu(self.fc1(x))
        x = self.groupnorm(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [47]:
# Fix the torch RNG seed so the randomly initialised weights and biases are reproducible
torch.manual_seed(19)

# Initialise the global model with random weights and biases
model_global = SentimentAnalysisModel_global()

In [48]:
# Inspect the state dict that holds the random weights and biases of this global model
model_global.state_dict()

OrderedDict([('fc1.weight',
              tensor([[ 0.0094, -0.0060,  0.0076,  ..., -0.0070,  0.0011,  0.0049],
                      [-0.0077, -0.0098, -0.0039,  ..., -0.0005, -0.0024, -0.0059]])),
             ('fc1.bias', tensor([0.0017, 0.0014])),
             ('fc2.weight',
              tensor([[ 0.5814,  0.0490],
                      [-0.1220, -0.4694],
                      [ 0.4106, -0.6040]])),
             ('fc2.bias', tensor([-0.5295,  0.3521,  0.5361])),
             ('groupnorm.weight', tensor([1., 1.])),
             ('groupnorm.bias', tensor([0., 0.]))])

In [49]:
# Save the global model's weights and biases (its state dict) to 'global_parameters.pt';
# this file is later passed to submodel_one as the starting point for node training
torch.save(model_global.state_dict(), 'global_parameters.pt')

In [50]:
import pandas as pd

# Load the data of node 1; submodel_one below reads its 'Caption' and 'Sentiment' columns
node1_df = pd.read_csv('node1.csv')

In [51]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from opacus import PrivacyEngine

# Set the torch and NumPy global random seeds for reproducibility
# NOTE(review): these seeds presumably do not cover gensim's Word2Vec, which is
# trained below with workers=4 - confirm whether embeddings are reproducible.
torch.manual_seed(42)
np.random.seed(42)

# Function to check and balance the dataset
def balance_dataset(df):
    """Return ``df`` with every 'Sentiment' class equally represented.

    The class frequencies of the 'Sentiment' column are compared. When they
    are not all equal, the frame is resampled with
    ``RandomOverSampler(random_state=42)`` and the resampled frame is
    returned. Otherwise the input object itself is returned untouched.

    Args:
        df: DataFrame containing a 'Sentiment' column.

    Returns:
        The resampled DataFrame, or ``df`` itself when it is empty, has a
        single class, or is already balanced.
    """
    sentiment_counts = df['Sentiment'].value_counts()
    # More than one distinct count means at least two classes differ in size.
    # (Indexing the counts with ``[0]`` would be a *label* lookup on the
    # sentiment values, which fails for integer labels without a 0 and for
    # empty frames.)
    if sentiment_counts.nunique() > 1:
        ros = RandomOverSampler(random_state=42)
        df_balanced, _ = ros.fit_resample(df, df['Sentiment'])
        return df_balanced
    return df

# Text preprocessing steps
def preprocess_text(text):
    """Turn a raw caption into a list of cleaned tokens.

    The text is lower-cased, tokenised with ``word_tokenize``, each token is
    lemmatised with ``WordNetLemmatizer`` and English stop words are removed
    afterwards.

    Args:
        text: The caption as a string. Any non-string value (for example the
            NaN pandas uses for a missing cell, or None) is treated as an
            empty caption.

    Returns:
        List of lemmatised, stop-word-free tokens; ``[]`` for non-string input.
    """
    # Missing captions are not strings and have no .lower(); map them to an
    # empty token list instead of crashing the whole DataFrame.apply call.
    if not isinstance(text, str):
        return []

    # Text cleaning
    lowered = text.lower()

    # Tokenization
    tokens = word_tokenize(lowered)

    # Normalization (lemmatization)
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]

    # Stop-word removal is applied to the lemmatised form of each token
    stop_words = set(stopwords.words('english'))
    return [lemma for lemma in lemmas if lemma not in stop_words]

# Function to create embeddings
def create_embeddings(sentences, vector_size=100, window=5, min_count=1, workers=4):
    """Train a Word2Vec model on tokenised sentences.

    Args:
        sentences: Iterable of token lists.
        vector_size: Dimensionality of the word vectors (default 100).
        window: Context window size passed to Word2Vec (default 5).
        min_count: Minimum word frequency passed to Word2Vec (default 1).
        workers: Number of worker threads passed to Word2Vec (default 4).
            NOTE(review): multi-threaded training is presumably not
            bit-for-bit reproducible - pass workers=1 if that matters.

    Returns:
        The Word2Vec model returned by the constructor call.
    """
    return Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

# Function to do padding
def pad_or_truncate_sequences(sequences, fixed_length, vector_size):
    """Bring every sequence of vectors to exactly ``fixed_length`` rows.

    Sequences longer than ``fixed_length`` are cut off after the first
    ``fixed_length`` vectors; shorter ones are padded with zero vectors at
    the end. Empty sequences become all-zero rows.

    Args:
        sequences: List of sequences, each a list (or array) of vectors of
            length ``vector_size``.
        fixed_length: Number of vectors every output sequence has.
        vector_size: Length of each individual vector.

    Returns:
        Float NumPy array of shape (len(sequences), fixed_length, vector_size).
    """
    # Zero-filled 3D array: number of sequences x fixed_length x vector_size
    adjusted_sequences = np.zeros((len(sequences), fixed_length, vector_size))

    for i, sequence in enumerate(sequences):
        kept = min(len(sequence), fixed_length)
        if kept == 0:
            # Nothing to copy. An empty sequence converts to an array of
            # shape (0,), which cannot be broadcast into a (0, vector_size)
            # slice, so the row is simply left as zeros.
            continue
        adjusted_sequences[i, :kept, :] = np.asarray(sequence[:kept])

    return adjusted_sequences

# Custom Neural Network Model
class SentimentAnalysisModel_global(nn.Module):
    """Feed-forward sentiment classifier trained on each node.

    Architecture: Linear(input_size -> 2) -> ReLU -> GroupNorm(1 group,
    2 channels) -> Dropout(0.3) -> Linear(2 -> 3).

    Args:
        input_size: Number of input features per sample. Defaults to 10000
            so that the notebook's earlier no-argument construction of the
            global model keeps working after this cell has redefined the
            class.
    """

    def __init__(self, input_size=10000):
        super().__init__()
        # Creation order fc1 -> fc2 is kept so seeded initial weights and the
        # state-dict key order stay the same.
        self.fc1 = nn.Linear(input_size, 2)
        self.fc2 = nn.Linear(2, 3)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        # GroupNorm is used in place of BatchNorm (1 group, 2 channels)
        self.groupnorm = nn.GroupNorm(1, 2)

    def forward(self, x):
        """Compute the 3 output scores for a batch.

        Args:
            x: Float tensor whose last dimension is ``input_size``
                (presumably of shape (batch, input_size) - TODO confirm).

        Returns:
            Tensor of raw scores with last dimension 3; no softmax is applied.
        """
        x = self.relu(self.fc1(x))
        x = self.groupnorm(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

def submodel_one(global_parameters_path, df_input):
    """Train a node-local copy of the global model with differential privacy.

    Pipeline: balance the sentiment classes, tokenise the 'Caption' column,
    train a Word2Vec model on those tokens, convert every caption into a
    flattened 100 x 100 (= 10000) feature vector, load the global parameters
    and run 10 epochs of Adam under an Opacus ``PrivacyEngine``.

    Args:
        global_parameters_path: Path of a state dict written with
            ``torch.save``; it is loaded into the model before training.
        df_input: DataFrame with 'Caption' and 'Sentiment' columns. The
            frame is not modified; all work happens on a copy.

    Returns:
        The state dict of the model returned by ``make_private``. In the
        notebook's recorded output its keys carry a '_module.' prefix, so
        they do not match the key names of the plain global model.
    """
    # Step 1: Check and balance dataset.
    # balance_dataset may hand back the caller's own frame, so copy before
    # adding columns to avoid changing df_input as a side effect.
    df = balance_dataset(df_input).copy()

    # Step 2: Preprocess text data
    df['processed_caption'] = df['Caption'].apply(preprocess_text)
    sentences = df['processed_caption'].tolist()
    word2vec_model = create_embeddings(sentences)

    # One list of word vectors per caption; words unknown to the model are skipped
    vectorized_sentences = [
        [word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv]
        for sentence in sentences
    ]

    # Every caption is represented by exactly 100 word vectors of size 100
    # (the vector size create_embeddings uses by default).
    vector_size = 100
    fixed_input_size = 100
    modified_input_size = fixed_input_size * vector_size  # 10000
    adjusted_sequences = pad_or_truncate_sequences(vectorized_sentences, fixed_input_size, vector_size)

    # Flatten the sequences for input to the neural network
    adjusted_sequences = adjusted_sequences.reshape(len(adjusted_sequences), -1)

    # One-hot encode the labels.
    # NOTE(review): if this node's data lacks one of the three classes the
    # encoding presumably has fewer than 3 columns and will not match the
    # model's 3 outputs - confirm every node sees all classes.
    target = pd.get_dummies(df['Sentiment']).values

    # Split dataset; only the training part is used in this function
    X_train, _, y_train, _ = train_test_split(
        adjusted_sequences, target, test_size=0.25, random_state=33
    )

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

    # DataLoader
    train_data = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_data, batch_size=64)

    # Load model with the global (pretrained) parameters.
    # NOTE: torch.load unpickles the file - only load parameter files that
    # come from a trusted source.
    model = SentimentAnalysisModel_global(input_size=modified_input_size)
    model.load_state_dict(torch.load(global_parameters_path))
    model.train()

    # Step 3: Train model with differential privacy
    optimizer = optim.Adam(model.parameters(), lr=0.1)
    criterion = nn.CrossEntropyLoss()
    privacy_engine = PrivacyEngine()
    model, optimizer, train_loader = privacy_engine.make_private(
        module=model,
        optimizer=optimizer,
        data_loader=train_loader,
        noise_multiplier=1.0,
        max_grad_norm=1.0
    )

    # Training loop
    for epoch in range(10):
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

    # Return updated model parameters
    return model.state_dict()

# Example usage: train on node 1's data starting from the saved global parameters,
# then display the resulting state dict
updated_parameters = submodel_one('global_parameters.pt', node1_df)
updated_parameters




OrderedDict([('_module.fc1.weight',
              tensor([[ 0.3400,  0.1648, -0.0120,  ...,  0.2103,  0.2675,  0.1745],
                      [ 0.2891,  0.2614, -0.1232,  ..., -0.0202,  0.5096,  0.6269]])),
             ('_module.fc1.bias', tensor([-0.1213,  0.3632])),
             ('_module.fc2.weight',
              tensor([[ 0.5619,  0.0126],
                      [ 0.5260, -0.4331],
                      [ 0.6782, -0.5148]])),
             ('_module.fc2.bias', tensor([-0.4159,  0.0047,  0.6919])),
             ('_module.groupnorm.weight', tensor([0.6429, 1.2070])),
             ('_module.groupnorm.bias', tensor([ 0.2980, -0.3828]))])

In [52]:
# To check the type of object and the dimensions of (an example of) updated_parameters

from collections import OrderedDict
import torch

# Hand-written, shortened example with the same structure as the trained parameters.
# NOTE: this reassigns `updated_parameters`, replacing the value returned by submodel_one above.
updated_parameters = OrderedDict()
updated_parameters['_module.fc1.weight'] = torch.tensor([
    [0.3400, 0.1648, -0.0120, 0.2103, 0.2675, 0.1745],
    [0.2891, 0.2614, -0.1232, -0.0202, 0.5096, 0.6269],
])
updated_parameters['_module.fc1.bias'] = torch.tensor([-0.1213, 0.3632])
updated_parameters['_module.fc2.weight'] = torch.tensor([
    [0.5619, 0.0126],
    [0.5260, -0.4331],
    [0.6782, -0.5148],
])
updated_parameters['_module.fc2.bias'] = torch.tensor([-0.4159, 0.0047, 0.6919])
updated_parameters['_module.groupnorm.weight'] = torch.tensor([0.6429, 1.2070])
updated_parameters['_module.groupnorm.bias'] = torch.tensor([0.2980, -0.3828])

# Report what kind of container this is
object_type = type(updated_parameters)
print(f"Type of the object: {object_type}")

# Report the shape of every tensor stored in it, in insertion order
for key in updated_parameters:
    tensor = updated_parameters[key]
    print(f"Shape of {key}: {tensor.size()}")

Type of the object: <class 'collections.OrderedDict'>
Shape of _module.fc1.weight: torch.Size([2, 6])
Shape of _module.fc1.bias: torch.Size([2])
Shape of _module.fc2.weight: torch.Size([3, 2])
Shape of _module.fc2.bias: torch.Size([3])
Shape of _module.groupnorm.weight: torch.Size([2])
Shape of _module.groupnorm.bias: torch.Size([2])


In [None]:
# i.e. the object above is an OrderedDict whose values are PyTorch tensors

In [53]:
# Hint on how to combine multiple OrderedDict objects

# To keep the parameter sets of several nodes together, collect the OrderedDict objects themselves
# in a plain Python list, one entry per node. Each OrderedDict still maps a parameter name (a string)
# to its value (a PyTorch tensor), so the list can later be iterated to inspect or aggregate the
# parameters key by key.

from collections import OrderedDict
import torch

# Example parameter set of a first node
model_parameters1 = OrderedDict({
    '_module.fc1.weight': torch.tensor([
        [0.3400, 0.1648, -0.0120, 0.2103, 0.2675, 0.1745],
        [0.2891, 0.2614, -0.1232, -0.0202, 0.5096, 0.6269],
    ]),
    '_module.fc1.bias': torch.tensor([-0.1213, 0.3632]),
    '_module.fc2.weight': torch.tensor([
        [0.5619, 0.0126],
        [0.5260, -0.4331],
        [0.6782, -0.5148],
    ]),
    '_module.fc2.bias': torch.tensor([-0.4159, 0.0047, 0.6919]),
    '_module.groupnorm.weight': torch.tensor([0.6429, 1.2070]),
    '_module.groupnorm.bias': torch.tensor([0.2980, -0.3828]),
})

# Example parameter set of a second node (same keys and shapes, different values)
model_parameters2 = OrderedDict({
    '_module.fc1.weight': torch.tensor([
        [0.3462, 0.1719, -0.0215, 0.2117, 0.2747, 0.1678],
        [0.3037, 0.2656, -0.1243, -0.0284, 0.5138, 0.6400],
    ]),
    '_module.fc1.bias': torch.tensor([-0.2414, 0.3605]),
    '_module.fc2.weight': torch.tensor([
        [-0.4925, -0.2702],
        [0.0939, -0.7728],
        [0.0884, -0.3885],
    ]),
    '_module.fc2.bias': torch.tensor([-0.3595, 0.0375, 0.5166]),
    '_module.groupnorm.weight': torch.tensor([0.7792, 1.2446]),
    '_module.groupnorm.bias': torch.tensor([-0.1800, -0.4572]),
})

# Collect both parameter dictionaries in one list (one entry per node)
combined_list = [model_parameters1, model_parameters2]

# If you want an array, you can use numpy to convert the list to an array
# import numpy as np
# combined_array = np.array(combined_list)


In [54]:
# Display the list holding both parameter dictionaries
combined_list

[OrderedDict([('_module.fc1.weight',
               tensor([[ 0.3400,  0.1648, -0.0120,  0.2103,  0.2675,  0.1745],
                       [ 0.2891,  0.2614, -0.1232, -0.0202,  0.5096,  0.6269]])),
              ('_module.fc1.bias', tensor([-0.1213,  0.3632])),
              ('_module.fc2.weight',
               tensor([[ 0.5619,  0.0126],
                       [ 0.5260, -0.4331],
                       [ 0.6782, -0.5148]])),
              ('_module.fc2.bias', tensor([-0.4159,  0.0047,  0.6919])),
              ('_module.groupnorm.weight', tensor([0.6429, 1.2070])),
              ('_module.groupnorm.bias', tensor([ 0.2980, -0.3828]))]),
 OrderedDict([('_module.fc1.weight',
               tensor([[ 0.3462,  0.1719, -0.0215,  0.2117,  0.2747,  0.1678],
                       [ 0.3037,  0.2656, -0.1243, -0.0284,  0.5138,  0.6400]])),
              ('_module.fc1.bias', tensor([-0.2414,  0.3605])),
              ('_module.fc2.weight',
               tensor([[-0.4925, -0.2702],
            

In [55]:
# Report the type of the container that holds the parameter dictionaries
print("Type of the outer object:", type(combined_list))

# Walk through every dictionary in turn and report each tensor's shape
for od in combined_list:
    for key in od:
        tensor = od[key]
        print(f"Key: {key}, Shape: {tensor.shape}")


Type of the outer object: <class 'list'>
Key: _module.fc1.weight, Shape: torch.Size([2, 6])
Key: _module.fc1.bias, Shape: torch.Size([2])
Key: _module.fc2.weight, Shape: torch.Size([3, 2])
Key: _module.fc2.bias, Shape: torch.Size([3])
Key: _module.groupnorm.weight, Shape: torch.Size([2])
Key: _module.groupnorm.bias, Shape: torch.Size([2])
Key: _module.fc1.weight, Shape: torch.Size([2, 6])
Key: _module.fc1.bias, Shape: torch.Size([2])
Key: _module.fc2.weight, Shape: torch.Size([3, 2])
Key: _module.fc2.bias, Shape: torch.Size([3])
Key: _module.groupnorm.weight, Shape: torch.Size([2])
Key: _module.groupnorm.bias, Shape: torch.Size([2])
