In [None]:
import pandas as pd
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
import os
import matplotlib
import matplotlib.pyplot as plt
import copy
import random
import time

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets

In [None]:
# Dataset selectors. Data_Days and Data_periods are interpolated into the
# feature CSV file name (as "<Data_Days>D_<Data_periods>W"), and LDays into
# the label CSV file name, in the data-loading cell.
# NOTE(review): presumably window length in days, number of windows, and the
# labelling horizon in days — confirm against the data dictionary.
Data_periods = 10
Data_Days = 90
LDays = 548

In [None]:
# Descriptive strings for this run; they are written to the results
# spreadsheet by the last cell of the notebook.
model_type = "Centralized_FL"
windows = str(Data_periods) + 'W'
label_days = str(Data_Days) + '-Central'
prediction_days = str(LDays)

In [None]:
# Read the feature table and the FL2 label table selected by the
# configuration constants.
features_csv = f'CHF_Data_1/CHF_Unlinked_{Data_Days}D_{Data_periods}W.csv'
labels_csv = f'CHF_Data_1/CHF_Labels_FL2_{LDays}.csv'
CHF_data = pd.read_csv(features_csv)
CHF_Labels_2 = pd.read_csv(labels_csv)

In [None]:
# Two-column label table: the 'Value' column of the raw label file becomes
# 'ClientId', 'Label' is carried over unchanged.
CHF_Labels = CHF_Labels_2[['Value', 'Label']].rename(columns={'Value': 'ClientId'})

In [None]:
# Make ClientId a string in both tables so the later merge on ClientId
# compares like with like.
for _frame in (CHF_data, CHF_Labels):
    _frame['ClientId'] = _frame['ClientId'].astype(str)

In [None]:
# Number of distinct Agency values in the feature table (shown as the cell output).
CHF_data['Agency'].nunique()

In [None]:
# One row per ClientId; drop_duplicates keeps the first occurrence by default.
CHF_unique = CHF_data.drop_duplicates(subset='ClientId')

In [None]:
# For every Agency, gather its ClientIds into a Python list.
# Result: a frame with columns 'Agency' and 'ClientId' (one list per row).
Agency_Ids = CHF_unique.groupby('Agency')['ClientId'].apply(list).reset_index()

In [None]:
# Index the per-agency ID lists by Agency.
# NOTE: not idempotent — re-running this cell alone fails because 'Agency'
# is no longer a column after the first run.
Agency_Ids = Agency_Ids.set_index('Agency')

In [None]:
# Map each Agency to the list of its client IDs (as strings).
# Agency_Ids already stores one Python list per agency, so the lists are
# copied directly. Round-tripping them through their string representation
# and re-parsing with strip/split leaves quotes and spaces inside the IDs and
# breaks on IDs that contain commas or brackets.
agency_dict = {
    agency: [str(client_id) for client_id in client_ids]
    for agency, client_ids in Agency_Ids['ClientId'].items()
}

In [None]:
# Working copy of the feature table without the Agency column.
pivoted_data = CHF_data.drop(columns=['Agency'])

In [None]:
# Give both frames a fresh 0..n-1 index (the old index is discarded).
CHF_unique = CHF_unique.reset_index(drop=True)
pivoted_data = pivoted_data.reset_index(drop=True)

In [None]:
# Agency column of the de-duplicated (one row per ClientId) table.
Agency = CHF_unique['Agency']

In [None]:
# Re-attach the Agency of every row. pivoted_data holds exactly the rows of
# CHF_data in the same order (only re-indexed), so the values are assigned by
# position. Assigning the Series taken from the de-duplicated table aligns by
# index instead, which gives wrong or missing agencies as soon as a ClientId
# occurs more than once in CHF_data.
pivoted_data['Agency'] = CHF_data['Agency'].to_numpy()

In [None]:
# Client IDs recorded for agency 4. `list1` is not referenced again in the
# visible part of the notebook.
list1 = agency_dict[4]

In [None]:
# Rows of the feature table whose ClientId is listed for agency 4.
# `filtered_df` is not referenced again in the visible part of the notebook.
# NOTE(review): the match only works if the IDs stored in agency_dict are the
# bare ClientId strings (no surrounding quotes or spaces) — confirm.
filtered_df = pivoted_data[pivoted_data['ClientId'].isin(agency_dict[4])]

In [None]:
# Join features and labels on ClientId (pd.merge defaults to an inner join,
# so clients missing from either table are dropped).
data = pd.merge(pivoted_data, CHF_Labels, on='ClientId')
data['Label'] = data['Label'].map({'Trn': 1, 'Epi': 2, 'Chr': 3})  # 1-based class codes; any other label value becomes NaN

In [None]:
# X is another name for `data` (same object, not a copy), so columns added to
# `data` later are visible through X as well.
X = data
y = data['Label'].values
y_indices = y - 1  # Convert labels to 0, 1, and 2

In [None]:
for agency in agency_dict:
    # Remove leading/trailing whitespace from every client ID.
    # The IDs stay strings; nothing is converted to an integer here.
    agency_dict[agency] = [client_id.strip() for client_id in agency_dict[agency]]

In [None]:
# Combined "<Agency>_<zero-based label>" key, used to stratify the train/test
# split on agency and class together.
data['stratify_col'] = data['Agency'].astype(str) + "_" + y_indices.astype(str)

In [None]:
# # Perform the train-test split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, 
#     y_indices, 
#     test_size=0.2, 
#     random_state=42, 
#     stratify=data['stratify_col']
# )

In [None]:
# Rebind `y` (previously the label array) to an empty frame; the next cell
# fills it with the Label and ClientId columns.
y = pd.DataFrame()

In [None]:
# Strata (values of 'stratify_col') to keep out of the stratified split, for
# example strata with too few samples to stratify on. Empty: every row is split.
excluded_strata = []

# A single mask drives both the kept and the excluded subset, so the two are
# always complementary.
is_excluded = X['stratify_col'].isin(excluded_strata)
filtered_data = X[~is_excluded]

# Targets: label plus ClientId (ClientId is needed later to split the test
# set per agency).
y = filtered_data[['Label', 'ClientId']].copy()

X1 = filtered_data
X_train, X_test, y_train, y_test = train_test_split(
    X1, y, test_size=0.2, random_state=42, stratify=filtered_data['stratify_col']
)

# Rows from excluded strata are added to the training set only.
excluded_data = X[is_excluded]
X_train = pd.concat([X_train, excluded_data])
# Concatenate the same two columns y_train already has; concatenating a bare
# Series would add an extra, unnamed column to y_train.
y_train = pd.concat([y_train, excluded_data[['Label', 'ClientId']]])

In [None]:
# Second label file: keep ClientId and expose its 'ListNumber' column as 'Label'.
CHF_Labels_3A1 = pd.read_csv(f'CHF_Data_1/CHF_Labels_{LDays}.csv')
CHF_Labels_3A = CHF_Labels_3A1[['ClientId', 'ListNumber']].rename(columns={'ListNumber': 'Label'})

In [None]:
# FL2 label file again: 'Value' becomes 'ClientId', 'Label' is kept as is.
CHF_Labels_3B1 = pd.read_csv(f'CHF_Data_1/CHF_Labels_FL2_{LDays}.csv')
CHF_Labels_3B = CHF_Labels_3B1[['Value', 'Label']].rename(columns={'Value': 'ClientId'})

In [None]:
# String ClientIds, so they can be matched against the string prefix
# extracted from CHF_Labels_3B in the following cells.
CHF_Labels_3A['ClientId'] = CHF_Labels_3A['ClientId'].astype(str)

In [None]:
# Step 1: take the part of ClientId before the first underscore as 'IntegerA'
# (kept as a string).
# NOTE(review): assumes every ClientId in CHF_Labels_3B is a str; a
# non-string value would raise AttributeError in the lambda — confirm.
CHF_Labels_3B['IntegerA'] = CHF_Labels_3B['ClientId'].apply(lambda x: x.split('_')[0])

In [None]:
# Look up, for every row of CHF_Labels_3B, the label that CHF_Labels_3A holds
# for its 'IntegerA' prefix. The right-hand side is de-duplicated on ClientId
# first so that the left merge returns exactly one row per row of
# CHF_Labels_3B, in the same order.
merged_df = pd.merge(
    CHF_Labels_3B,
    CHF_Labels_3A.drop_duplicates(subset='ClientId').rename(columns={'Label': 'Label_3A'}),
    left_on='IntegerA', right_on='ClientId', how='left'
)

# Step 4: overwrite Label in CHF_Labels_3B with the looked-up label.
# Rows without a match receive NaN. The assignment is positional so it does
# not depend on the two frames sharing index labels.
CHF_Labels_3B['Label'] = merged_df['Label_3A'].to_numpy()

# Step 5: drop the temporary column
CHF_Labels_3B.drop(['IntegerA'], axis=1, inplace=True)

In [None]:
# CHF_Labels_4 is a second name for CHF_Labels_3B (same object, not a copy):
# changes made through either name are visible through both.
CHF_Labels_4 = CHF_Labels_3B

In [None]:
# NOTE: not idempotent — values that are not one of the three keys (including
# already-mapped integers on a second run of this cell) become NaN.
CHF_Labels_4['Label'] = CHF_Labels_4['Label'].map({'Trn': 0, 'Epi': 1, 'Chr': 2})  # 0-based class codes

In [None]:
# Replace y_test's index with 0..n-1 in place (row order is unchanged).
y_test.reset_index(drop=True,inplace = True)

In [None]:
# Parse the integer that follows the underscore in ClientId into 'Integer2'
# (presumably the agency number — the values used below match the agency keys).
y_test['Integer2'] = y_test['ClientId'].apply(lambda x: int(x.split('_')[1]))

# One y_test subset per distinct 'Integer2' value, keyed by that value
dfs = {
    value: y_test[y_test['Integer2'] == value]
    for value in y_test['Integer2'].unique()
}

# Bind the subsets of the eight agencies used in this experiment to their own names
(y_test_4, y_test_13, y_test_55, y_test_188,
 y_test_213, y_test_225, y_test_330, y_test_333) = (
    dfs[agency_no] for agency_no in (4, 13, 55, 188, 213, 225, 330, 333)
)

In [None]:
# Parse the integer that follows the underscore in ClientId into 'Integer2'
# (presumably the agency number — the values used below match the agency keys).
X_test['Integer2'] = X_test['ClientId'].apply(lambda x: int(x.split('_')[1]))

# One X_test subset per distinct 'Integer2' value, keyed by that value
dfs = {
    value: X_test[X_test['Integer2'] == value]
    for value in X_test['Integer2'].unique()
}

# Bind the subsets of the eight agencies used in this experiment to their own names
(X_test_4, X_test_13, X_test_55, X_test_188,
 X_test_213, X_test_225, X_test_330, X_test_333) = (
    dfs[agency_no] for agency_no in (4, 13, 55, 188, 213, 225, 330, 333)
)

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Agencies for which X_test_<key> / y_test_<key> frames were created above
dfs_keys = [4, 13, 55, 188, 213, 225, 330, 333]

# Gather the per-agency frames into dictionaries keyed by agency
X_test_dfs = {key: globals()[f'X_test_{key}'] for key in dfs_keys}
y_test_dfs = {key: globals()[f'y_test_{key}'] for key in dfs_keys}

# Scaler for the feature-scaling step, which is currently disabled
scaler = StandardScaler()

for key in dfs_keys:
    # Sort both frames by ClientId so features and labels stay in the same
    # order, then drop the helper column(s):
    #   X keeps ClientId (testing() drops it itself) and loses only Integer2;
    #   y loses ClientId and Integer2 and is left with the Label column.
    X_test_dfs[key] = X_test_dfs[key].sort_values(by='ClientId').drop(columns=['Integer2'])
    y_test_dfs[key] = y_test_dfs[key].sort_values(by='ClientId').drop(columns=['ClientId', 'Integer2'])

    # Publish the cleaned frames under their original global names
    globals()[f'X_test_{key}'] = X_test_dfs[key]
    globals()[f'y_test_{key}'] = y_test_dfs[key]


In [None]:
# Re-index X_test to 0..n-1 so that it lines up with y_test (re-indexed in an
# earlier cell), copy the labels across by index, and drop rows that ended up
# without a label.
X_test = X_test.reset_index(drop=True)
X_test['Label'] = y_test['Label']
X_test.dropna(subset=['Label'], inplace=True)

In [None]:
# class_counts = X['stratify_col'].value_counts()
# print(class_counts)

In [None]:
# # Convert to PyTorch tensors
# X_train = torch.tensor(X_train, dtype=torch.float32)
# X_test = torch.tensor(X_test, dtype=torch.float32)
# y_train = torch.tensor(y_train, dtype=torch.long)
# y_test = torch.tensor(y_test, dtype=torch.long)

In [None]:
class MLP(nn.Module):
    """Four-layer fully connected classifier with three output logits.

    Architecture: input -> 512 -> 128 -> 16 -> 3, with ReLU and
    Dropout(0.25) after each hidden layer. The output is raw logits
    (no softmax).

    Args:
        input_dim: Number of input features. When omitted, it is derived from
            the global ``X_train`` as ``X_train.shape[1] - 4``, i.e. every
            column except the four non-feature columns that the training and
            testing code drop ('Label', 'ClientId', 'Agency', 'stratify_col').
    """

    def __init__(self, input_dim=None):
        super(MLP, self).__init__()
        if input_dim is None:
            # Backward-compatible default: infer the width from the notebook's
            # training frame.
            input_dim = X_train.shape[1] - 4
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(128, 16),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(16, 3)
        )

    def forward(self, x):
        """Return the logits for a batch ``x`` whose last dimension is ``input_dim``."""
        return self.layers(x)

#model = MLP()

In [None]:
import torch.nn.functional as F

In [None]:
class CustomDataset(Dataset):
    """Pairs pre-processed features with pre-processed labels.

    Each item is a dict with the keys "data" and "label" holding the
    feature entry and the label entry at the requested index.
    """

    def __init__(self, X, y):
        self.X = X  # indexable collection of features
        self.y = y  # indexable collection of labels

    def __len__(self):
        # The dataset size is defined by the feature collection.
        return len(self.X)

    def __getitem__(self,idx):
        features_at_idx = self.X[idx]
        label_at_idx = self.y[idx]
        return {"data": features_at_idx, "label": label_at_idx}



In [None]:
# Per-class weights (classes 0, 1, 2) passed to the cross-entropy loss.
class_weights_tensor = torch.tensor([1.0, 12.0, 18.0], dtype=torch.float32)

In [None]:
class ClientUpdate(object):
  """Local training of a model on the rows that belong to one client group.

  The rows of ``dataset`` whose 'ClientId' is listed in ``idxs`` are turned
  into a shuffled DataLoader; ``train`` then runs SGD over it.
  """

  def __init__(self, dataset, batchSize, learning_rate, epochs, idxs):
    """
    Params:
      - dataset:        DataFrame with the feature columns plus 'Label'
                        (1-based class codes), 'ClientId', 'Agency' and
                        'stratify_col'. It is not modified.
      - batchSize:      Batch size of the training DataLoader
      - learning_rate:  SGD learning rate
      - epochs:         Number of passes over the selected rows per call to train()
      - idxs:           Client IDs to select; surrounding single quotes are ignored
    Raises:
      - ValueError if no row of ``dataset`` matches ``idxs``
    """
    # IDs may arrive wrapped in single quotes; strip them so they compare
    # equal to the ClientId column.
    wanted_ids = {str(s).strip("'") for s in idxs}

    # Compare on a string view of ClientId instead of overwriting the column
    # in the caller's DataFrame.
    row_mask = dataset['ClientId'].astype(str).isin(wanted_ids)
    client_rows = dataset[row_mask]
    if len(client_rows) == 0:
      raise ValueError("ClientUpdate: no rows in dataset match the given client IDs")

    X = client_rows.drop(['Label', 'ClientId', 'Agency','stratify_col'], axis=1)

    X_tensor = torch.tensor(X.values.astype(float), dtype=torch.float32)
    # Labels are 1-based in the frame; shift to 0-based. Built from the 1-D
    # column so the tensor stays 1-D even when only one row is selected.
    y_tensor = torch.tensor(client_rows['Label'].values, dtype=torch.long) - 1

    num_classes = 3  # the model has three output logits

    # One-hot float targets; CrossEntropyLoss treats float targets as class
    # probabilities.
    y_one_hot = F.one_hot(y_tensor, num_classes=num_classes).float()

    self.train_loader = DataLoader(CustomDataset(X_tensor, y_one_hot), batch_size=batchSize, shuffle=True)
    self.learning_rate = learning_rate
    self.epochs = epochs

  def train(self, model):
    """Train ``model`` in place for ``self.epochs`` epochs.

    Returns:
      - the model's state_dict after training
      - the mean over epochs of the per-sample average training loss
    """
    # Run everything on the device the model lives on. Moving only the
    # batches to CUDA while model and loss weights stay on the CPU raises a
    # device-mismatch error.
    device = next(model.parameters()).device

    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor.to(device))
    optimizer = torch.optim.SGD(model.parameters(), lr=self.learning_rate, momentum=0.5)

    e_loss = []
    for epoch in range(1, self.epochs+1):

      train_loss = 0.0

      model.train()
      for batch in self.train_loader:
        data = batch["data"].to(device)
        labels = batch["label"].to(device)

        # clear the gradients
        optimizer.zero_grad()
        # make a forward pass
        output = model(data)
        # calculate the loss
        loss = criterion(output, labels)
        # do a backwards pass
        loss.backward()
        # perform a single optimization step
        optimizer.step()
        # accumulate the batch loss weighted by batch size
        train_loss += loss.item()*data.size(0)

      # average loss per sample for this epoch
      train_loss = train_loss/len(self.train_loader.dataset)
      e_loss.append(train_loss)

    total_loss = sum(e_loss)/len(e_loss)

    return model.state_dict(), total_loss

In [None]:
def training(model, rounds, batch_size, lr, ds, data_dict, C, K, E, plt_title, plt_color, client_ids=None):
  """
  Server-side loop of a FedAvg-style training: in every round each selected
  client trains a copy of the current model, and the server replaces the
  model's weights with the plain (unweighted) mean of the client weights.
  After the last round the per-round average loss is plotted and saved.

  Params:
    - model:           PyTorch model to train (updated in place)
    - rounds:          Number of communication rounds
    - batch_size:      Batch size for client update training
    - lr:              Learning rate used for client update training
    - ds:              DataFrame used for training (passed to ClientUpdate)
    - data_dict:       Mapping from client key to the list of ClientIds of that client
    - C:               Fraction of clients per round (currently not used: every
                       client in ``client_ids`` takes part in every round)
    - K:               Total number of clients (currently not used)
    - E:               Number of training passes each client makes over its local dataset per round
    - plt_title:       Title of the loss plot; also the base name of the saved '.jpg' file
    - plt_color:       Matplotlib tableau colour name without the 'tab:' prefix (e.g. 'red')
    - client_ids:      Keys of ``data_dict`` that take part in every round.
                       Defaults to [4, 13, 55, 188, 213, 225, 330, 333].
  Returns:
    - model:           Trained model on the server
  """
  if client_ids is None:
    client_ids = [4, 13, 55, 188, 213, 225, 330, 333]

  # training loss per round
  train_loss = []

  # measure time
  start = time.time()

  for curr_round in range(1, rounds+1):
    w, local_loss = [], []

    for k in client_ids:
      local_update = ClientUpdate(dataset=ds, batchSize=batch_size, learning_rate=lr, epochs=E, idxs=data_dict[k])
      # every client starts from a copy of the current global model
      weights, loss = local_update.train(model=copy.deepcopy(model))

      w.append(copy.deepcopy(weights))
      local_loss.append(copy.deepcopy(loss))

    # element-wise mean of the client weights (clients are weighted equally,
    # independent of how many samples each one holds)
    weights_avg = copy.deepcopy(w[0])
    for k in weights_avg.keys():
      for i in range(1, len(w)):
        weights_avg[k] += w[i][k]

      weights_avg[k] = torch.div(weights_avg[k], len(w))

    # move the averaged weights into the global model
    model.load_state_dict(weights_avg)

    # loss
    loss_avg = sum(local_loss) / len(local_loss)
    print('Round: {}... \tAverage Loss: {}'.format(curr_round, round(loss_avg, 3)))
    train_loss.append(loss_avg)

  end = time.time()
  fig, ax = plt.subplots()
  x_axis = np.arange(1, rounds+1)
  y_axis = np.array(train_loss)
  ax.plot(x_axis, y_axis, 'tab:'+plt_color)

  ax.set(xlabel='Number of Rounds', ylabel='Train Loss',
       title=plt_title)
  ax.grid()
  fig.savefig(plt_title+'.jpg', format='jpg')
  print("Training Done!")
  print("Total time taken to Train: {}".format(end-start))

  return model

In [None]:
def testing(model, dataset, bs, criterion, num_classes, classes):
    test_loss = 0.0
    true_positives = np.zeros(num_classes)
    false_positives = np.zeros(num_classes)
    false_negatives = np.zeros(num_classes)

    y = dataset['Label']
    X = dataset.drop(['Label', 'ClientId', 'Agency', 'stratify_col'], axis=1)

    X_tensor = torch.tensor(X.values.astype(np.float32), dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.long) - 1

    y_one_hot = F.one_hot(y_tensor, num_classes=num_classes).float()

    test_loader = DataLoader(CustomDataset(X_tensor, y_one_hot), batch_size=bs)
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            data = batch["data"]
            labels = batch["label"]
            if torch.cuda.is_available():
                data, labels = data.cuda(), labels.cuda()

            output = model(data)
            loss = criterion(output, labels.max(dim=1)[1])
            test_loss += loss.item() * data.size(0)

            _, pred = torch.max(output, 1)
            true_labels = labels.max(dim=1)[1]

            for i in range(num_classes):
                true_positives[i] += ((pred == i) & (true_labels == i)).cpu().sum().item()
                false_positives[i] += ((pred == i) & (true_labels != i)).cpu().sum().item()
                false_negatives[i] += ((pred != i) & (true_labels == i)).cpu().sum().item()

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    macro_avg_precision = np.nanmean(precision)  # Use nanmean to handle division by zero
    macro_avg_recall = np.nanmean(recall)

    return macro_avg_precision, macro_avg_recall, precision, recall

In [None]:
# Hyper-parameters passed to training() in the next cell.
# number of training rounds
rounds = 75
# client fraction
# NOTE(review): training() receives C and K, but its client selection does
# not appear to depend on them — confirm before tuning these two values.
C = 1
# number of clients
K = 8
# number of training passes on local dataset for each round
E = 15
# batch size
batch_size = 500
# learning Rate
lr=0.0195
# load model (input width is derived from the global X_train inside MLP)
MLP_NN = MLP()



In [None]:
num_classes = 3
classes_test = ['Class 0', 'Class 1', 'Class 2']  # display names, in class-index order
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
Agency_no = 333  # agency whose test subset (X_test_333) is evaluated below

runs = 5

# Metrics of every run; averaged after the loop
macro_precisions, macro_recalls = [], []
class_precisions, class_recalls = [], []

# Train and test the model `runs` times
for _ in range(runs):
    # Start every run from a freshly initialised model. Re-using one model
    # object would make run k continue training from run k-1, so the runs
    # would not be independent repetitions.
    run_model = MLP()
    MLP_NN_iid_trained = training(run_model, rounds, batch_size, lr, X_train, agency_dict, C, K, E, "CHF", 'red')

    macro_avg_precision, macro_avg_recall, class_precision, class_recall = testing(
        model=MLP_NN_iid_trained,
        dataset=X_test_333,
        bs=batch_size,
        criterion=criterion,
        num_classes=num_classes,
        classes=classes_test
    )

    macro_precisions.append(macro_avg_precision)
    macro_recalls.append(macro_avg_recall)
    class_precisions.append(class_precision)
    class_recalls.append(class_recall)

# Average over all runs. testing() reports NaN for a class with an empty
# denominator; nanmean skips those entries instead of turning the whole
# average into NaN.
average_macro_avg_precision = float(np.nanmean(macro_precisions))
average_macro_avg_recall = float(np.nanmean(macro_recalls))
average_class_precision = np.nanmean(np.vstack(class_precisions), axis=0)
average_class_recall = np.nanmean(np.vstack(class_recalls), axis=0)

print(f"Average Macro Average Precision: {average_macro_avg_precision * 100:.2f}%")
print(f"Average Macro Average Recall: {average_macro_avg_recall * 100:.2f}%")
for i, class_name in enumerate(classes_test):
    print(f"Average Precision for {class_name}: {average_class_precision[i] * 100:.2f}%")
    print(f"Average Recall for {class_name}: {average_class_recall[i] * 100:.2f}%")


In [None]:
# One results row for this experiment. The precision/recall columns hold the
# averages over all runs computed in the previous cell (not the values of the
# last run only).
data = {
    "Model Type": [model_type],
    "Runs": [runs],
    "Agency": [Agency_no],
    "Windows": [windows],
    "Label Days": [label_days],
    "Prediction Days": [prediction_days],
    "Average Precision": [average_macro_avg_precision],
    "Average Recall": [average_macro_avg_recall],
    "Per class average precision": [average_class_precision],
    "Per class average recall": [average_class_recall]
}

new_data_df = pd.DataFrame(data)

# File name
excel_filename = "NRes/Model_Results_FL.xlsx"

# Make sure the output directory exists before reading from / writing to it
os.makedirs(os.path.dirname(excel_filename), exist_ok=True)

try:
    # If the workbook exists, append the new row to its current content
    existing_data_df = pd.read_excel(excel_filename)
    combined_df = pd.concat([existing_data_df, new_data_df], ignore_index=True)
except FileNotFoundError:
    # First run: the workbook will contain only the new row
    combined_df = new_data_df

# Save the combined data back to the Excel file
combined_df.to_excel(excel_filename, index=False)
