In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Run configuration: these values are interpolated into the CSV file names read by later cells.
Data_periods = 10  # becomes the `{Data_periods}W` part of the feature file name
Data_Days = 90  # becomes the `{Data_Days}D` part of the feature file name
LDays = 548  # becomes the numeric suffix of the label file names

Agency No.: 4, 12, 55, 188, 213, 225, 330, 333

In [None]:
# Descriptive fields; they are written verbatim into the results CSV in the last cell.
model_type = "Centralized_FL"
windows = f'{Data_periods}W'
label_days = f'{Data_Days}-Central'
prediction_days = f'{LDays}'
# Agency whose rows are kept when CHF_data is filtered.
Agency_no = 330

In [None]:
# Load data
# Feature table; later cells read its `Agency` and `ClientId` columns.
CHF_data = pd.read_csv(f'CHF_Data_1/CHF_Unlinked_{Data_Days}D_{Data_periods}W.csv')
# Label table; later cells read its `ClientId` and `ListNumber` columns.
CHF_Labels_2 = pd.read_csv(f'CHF_Data_1/CHF_Labels_Local_{LDays}.csv')

In [None]:
# Restrict the feature table to the rows of the selected agency
CHF_data = CHF_data.loc[CHF_data['Agency'].eq(Agency_no)]

In [None]:
# Keep only the label rows whose ClientId is still present in the filtered CHF_data
CHF_Labels_2 = CHF_Labels_2.loc[
    lambda labels: labels['ClientId'].isin(CHF_data['ClientId'])
]

In [None]:
# Two-column label frame: the client id plus its `ListNumber` value exposed as `Label`
CHF_Labels = pd.DataFrame({
    'ClientId': CHF_Labels_2['ClientId'],
    'Label': CHF_Labels_2['ListNumber'],
})

In [None]:
# The Agency column is constant after filtering, so it is removed from the feature frame
pivoted_data = CHF_data.drop(columns='Agency')

In [None]:
# Join features with labels on ClientId, then encode the label strings as integer class ids
data = pivoted_data.merge(CHF_Labels, on='ClientId').assign(
    Label=lambda merged: merged['Label'].map({'Trn': 0, 'Epi': 1, 'Chr': 2})
)

In [None]:
# Features keep ClientId for now; the target frame carries ClientId next to the label
X = data.drop(columns=['Label'])  # .values
y_indices = data[['Label', 'ClientId']].copy()

# Hold out 20% as the test set, stratified on the label
X_train2, X_test, y_train2, y_test = train_test_split(
    X, y_indices, stratify=y_indices['Label'], test_size=0.2, random_state=42
)

In [None]:
# Carve a stratified 15% validation set out of the remaining training data
X_train, X_val, y_train, y_val = train_test_split(
    X_train2, y_train2, stratify=y_train2['Label'], test_size=0.15, random_state=42
)

In [None]:
# Second label table (file name without the "Local" part); later cells read its
# `ClientId` and `ListNumber` columns.
CHF_Labels_3A1 = pd.read_csv(f'CHF_Data_1/CHF_Labels_{LDays}.csv')
        

In [None]:
# Two-column view of the second label table, with `ListNumber` exposed as `Label`
CHF_Labels_3A = pd.DataFrame({
    'ClientId': CHF_Labels_3A1['ClientId'],
    'Label': CHF_Labels_3A1['ListNumber'],
})

In [None]:
# Third label table; later cells read its `Value` and `Label` columns.
CHF_Labels_3B1 = pd.read_csv(f'CHF_Data_1/CHF_Labels_FL2_{LDays}.csv')

In [None]:
# Two-column view of the third label table, with its `Value` column exposed as `ClientId`
CHF_Labels_3B = pd.DataFrame({
    'ClientId': CHF_Labels_3B1['Value'],
    'Label': CHF_Labels_3B1['Label'],
})

In [None]:
# Step 1: IntegerA is the part of each CHF_Labels_3B ClientId that precedes the first underscore
CHF_Labels_3B['IntegerA'] = CHF_Labels_3B['ClientId'].map(
    lambda client_id: client_id.partition('_')[0]
)

In [None]:
# Cast to str so the ids can be matched against the string IntegerA keys in the merge below.
CHF_Labels_3A['ClientId'] = CHF_Labels_3A['ClientId'].astype(str)

In [None]:
# Steps 2-4: overwrite Label in CHF_Labels_3B with the CHF_Labels_3A label whose ClientId
# equals the row's IntegerA. Rows without a match receive NaN.
#
# A keyed lookup is used instead of merge-then-assign: a left merge grows by one row for every
# repeated ClientId on the right-hand side, and assigning the merged column back by index would
# then attach labels to the wrong rows. With the lookup, the first label per ClientId wins.
label_lookup_3A = (
    CHF_Labels_3A
    .drop_duplicates(subset='ClientId', keep='first')
    .set_index('ClientId')['Label']
)
CHF_Labels_3B['Label'] = CHF_Labels_3B['IntegerA'].map(label_lookup_3A)

# Step 5: Drop the temporary columns
CHF_Labels_3B.drop(['IntegerA'], axis=1, inplace=True)

In [None]:
# CHF_Labels_4 is a second name for the same frame as CHF_Labels_3B (an alias, not a copy),
# so edits made through either name are visible through both.
CHF_Labels_4 = CHF_Labels_3B

In [None]:
# Encode the label strings as integer class ids (same mapping as applied to `data['Label']`);
# values outside the mapping become NaN.
CHF_Labels_4['Label'] = CHF_Labels_4['Label'].map({'Trn': 0, 'Epi': 1, 'Chr': 2})

In [None]:
# Give y_test a fresh 0..n-1 index; the label replacement further down assigns a column from a
# newly merged frame by index.
y_test.reset_index(drop=True,inplace = True)

In [None]:
# Cast both ClientId columns to str so the merge keys have the same type.
y_test['ClientId'] = y_test['ClientId'].astype(str)  # Convert to str
CHF_Labels_4['ClientId'] = CHF_Labels_4['ClientId'].astype(str)  # Ensure this is also str


In [None]:
# Replace the local test labels with the labels held in CHF_Labels_4, matched on ClientId.
#
# The lookup is de-duplicated first: a repeated ClientId on the right-hand side would add rows
# to the left merge, and the index-aligned assignment below would then pair labels with the
# wrong clients. The first label per ClientId wins.
label_lookup = CHF_Labels_4[['ClientId', 'Label']].drop_duplicates(subset='ClientId', keep='first')
merged_df = y_test.merge(label_lookup, on='ClientId', how='left', suffixes=('', '_new'))

# With unique right-hand keys the left merge yields exactly one row per y_test row, in y_test
# order, so re-using y_test's index guarantees row-for-row alignment of the assignment.
merged_df.index = y_test.index

# Rows whose original label is missing are excluded here; they end up NaN and are dropped below.
merged_df = merged_df.dropna(subset=['Label'])

# Replace the 'Label' column in y_test with the 'Label' from CHF_Labels_4
y_test['Label'] = merged_df['Label_new']

# Drop test clients that have no replacement label
y_test.dropna(subset=['Label'], inplace=True)


In [None]:
# Cast X_test's ClientId to str so it can be joined against y_test's string ClientId.
X_test['ClientId'] = X_test['ClientId'].astype(str)  # Convert to string
# y_test['ClientId'] was already cast to str in an earlier cell; this repeats that cast.
y_test['ClientId'] = y_test['ClientId'].astype(str)  # Ensure this is also str

In [None]:
# Keep only the X_test rows whose ClientId is still present in y_test (rows without a
# replacement label were dropped from y_test earlier).
X_test = X_test.merge(y_test['ClientId'], on='ClientId', how='inner')
# Keep the first row for each ClientId.
X_test = X_test.drop_duplicates(subset='ClientId', keep='first')
# Apply the same restriction and de-duplication to y_test, against the reduced X_test.
y_test = y_test.merge(X_test['ClientId'], on='ClientId', how='inner')
y_test = y_test.drop_duplicates(subset='ClientId', keep='first')

In [None]:
# Both frames are ordered by the same key. ClientId was cast to str earlier, so the order is
# lexicographic rather than numeric.
# NOTE(review): presumably this is what keeps features and labels paired row-for-row once
# ClientId is dropped - confirm.

# Sort X_test by ClientId
X_test = X_test.sort_values(by='ClientId')

# Sort y_test by ClientId
y_test = y_test.sort_values(by='ClientId')

In [None]:
# Strip the identifier column and hand the feature matrices over as NumPy arrays
X_test = X_test.drop(columns='ClientId').to_numpy()
X_train = X_train.drop(columns='ClientId').to_numpy()
X_val = X_val.drop(columns='ClientId').to_numpy()

In [None]:
# Normalize features
# The mean/variance are estimated on the training split only and then applied unchanged to the
# validation and test splits. Calling fit_transform on every split would scale each split with
# its own statistics, so the model would be evaluated on differently scaled inputs than it was
# trained on, and the statistics of held-out data would influence preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [None]:
# Strip the identifier column; what remains of each target frame is the single Label column
y_test = y_test.drop(columns='ClientId').to_numpy()
y_train = y_train.drop(columns='ClientId').to_numpy()
y_val = y_val.drop(columns='ClientId').to_numpy()

In [None]:
# Flatten the (n, 1) label arrays to shape (n,).
# reshape(-1) is used rather than np.squeeze because squeeze also removes the sample axis when a
# split holds exactly one row, producing a 0-d array that no longer lines up with the features.
y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)
y_val = y_val.reshape(-1)

In [None]:
# Per-class loss weights, in class-id order (0, 1, 2); passed to the loss function later on
class_weights_tensor = torch.tensor([0.75, 7.0, 6.0], dtype=torch.float32)

In [None]:
# Feature matrices become float32 tensors
X_train_tensor = torch.tensor(X_train).to(torch.float32)
X_val_tensor = torch.tensor(X_val).to(torch.float32)
X_test_tensor = torch.tensor(X_test).to(torch.float32)

# Class labels become int64 tensors
y_train_tensor = torch.tensor(y_train).to(torch.int64)
y_val_tensor = torch.tensor(y_val).to(torch.int64)
y_test_tensor = torch.tensor(y_test).to(torch.int64)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Pair each split's feature tensor with its label tensor
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Batch the datasets: train and validation are reshuffled on every pass, test keeps its order
batch_size = 2048
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
def calculate_recall(outputs, labels, num_classes):
    """Compute the recall of each class from raw model outputs.

    Args:
        outputs: 2-D tensor of per-class scores, one row per sample; the predicted class of a
            row is the index of its largest score.
        labels: tensor of true class ids with as many elements as ``outputs`` has rows. It is
            reshaped to match the predictions, so a trailing singleton axis is accepted.
        num_classes: number of classes; recall is reported for ids ``0 .. num_classes - 1``.

    Returns:
        A list of ``num_classes`` Python floats. Entry ``i`` is the fraction of samples whose
        true class is ``i`` that were predicted as ``i``; it is ``0.0`` when no sample has true
        class ``i``.
    """
    preds = outputs.argmax(dim=1)
    # Reshape once so that both the comparison and the per-class masks line up with `preds`.
    targets = labels.view_as(preds)
    hits = preds.eq(targets)

    recall_per_class = []
    for class_id in range(num_classes):
        class_hits = hits[targets == class_id]
        if class_hits.numel() > 0:
            recall_per_class.append(class_hits.float().mean().item())
        else:
            # Always a float, so the returned list has a single element type.
            recall_per_class.append(0.0)

    return recall_per_class


In [None]:
import copy
def early_stopping_check(epoch_val_loss, best_val_loss, best_model_weights, model, patience_counter, patience=10):
    """Update early-stopping state after one epoch.

    A strictly lower validation loss counts as an improvement: the best loss is replaced, a deep
    copy of ``model.state_dict()`` is taken and the patience counter restarts at zero. Otherwise
    the counter grows by one and the previous best loss and weights are passed through.

    Returns:
        ``(stop_training, best_val_loss, best_model_weights, patience_counter)`` where
        ``stop_training`` is true once the counter has reached ``patience``.
    """
    improved = epoch_val_loss < best_val_loss
    if not improved:
        stalled_epochs = patience_counter + 1
        return stalled_epochs >= patience, best_val_loss, best_model_weights, stalled_epochs

    snapshot = copy.deepcopy(model.state_dict())
    # The counter is back at zero; the stop flag is still derived from it, as for any other value.
    return 0 >= patience, epoch_val_loss, snapshot, 0


In [None]:

import copy
import matplotlib.pyplot as plt
import torch.optim as optim
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Define the MLP model
class MLP(nn.Module):
    """Feed-forward classifier: input -> 256 -> 128 -> 8 -> num_classes.

    Every hidden linear layer is followed by LeakyReLU (negative slope 0.01) and dropout with
    probability 0.5; the final linear layer returns unnormalised class scores.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        widths = (input_size, 256, 128, 8)
        stack = []
        # One Linear/LeakyReLU/Dropout triple per consecutive pair of widths
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([
                nn.Linear(fan_in, fan_out),
                nn.LeakyReLU(negative_slope=0.01),
                nn.Dropout(0.50),
            ])
        stack.append(nn.Linear(widths[-1], num_classes))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        return self.layers(x)

# Assuming the data loaders and class_weights_tensor are set up correctly
# Hyperparameters
input_size = X_train.shape[1]  # Number of features
num_classes = 3  # Number of target classes; the label mapping and the class weights both have three entries
# NOTE(review): the DataLoaders were already created in an earlier cell with batch_size = 2048,
# so this reassignment does not change the batch size used for training or evaluation.
batch_size = 250
learning_rate = 0.0001
num_epochs = 200  # Number of epochs for training
num_runs = 10  # Number of runs

# Function to train and evaluate the model
def train_and_evaluate_model(model, train_loader, test_loader, criterion, optimizer, num_classes=None):
    """Train ``model`` for ``num_epochs`` epochs, then score it on ``test_loader``.

    Args:
        model: network mapping a batch of inputs to a 2-D tensor of per-class scores.
        train_loader: iterable of ``(inputs, labels)`` batches used for optimisation.
        test_loader: iterable of ``(inputs, labels)`` batches used for evaluation.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimiser already bound to ``model``'s parameters.
        num_classes: number of classes to report on. When omitted it is taken from the width
            of the model output seen during evaluation.

    Returns:
        ``(macro_precision, macro_recall, precision, recall)``. The last two are arrays of
        length ``num_classes`` indexed by class id; a class that was never predicted has
        precision 0 and a class with no test samples has recall 0. The macro values are the
        means of those arrays.

    Raises:
        ValueError: if ``num_classes`` is omitted and ``test_loader`` yields no batches.

    The number of epochs is read from the module-level ``num_epochs``.
    """
    for _epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    model.eval()
    all_preds = []
    all_labels = []
    output_width = None
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            output_width = outputs.shape[1]
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if num_classes is None:
        if output_width is None:
            raise ValueError("test_loader yielded no batches; cannot evaluate the model")
        num_classes = output_width

    # Pin the label set: without it the matrix only covers classes that actually occur in the
    # labels or predictions, so the per-class arrays could come back shorter than num_classes
    # and be indexed by position rather than by class id.
    cm = confusion_matrix(all_labels, all_preds, labels=np.arange(num_classes))
    hits = cm.diagonal()
    predicted_totals = cm.sum(axis=0)
    actual_totals = cm.sum(axis=1)
    # Zero-filled `out` plus `where` turns every 0/0 into 0 instead of NaN.
    precision = np.divide(hits, predicted_totals, out=np.zeros(num_classes, dtype=float), where=predicted_totals != 0)
    recall = np.divide(hits, actual_totals, out=np.zeros(num_classes, dtype=float), where=actual_totals != 0)
    return np.nanmean(precision), np.nanmean(recall), precision, recall

# Accumulators for the metrics of the individual runs
sum_macro_avg_precision = 0
sum_macro_avg_recall = 0
sum_class_precision = np.zeros(num_classes)
sum_class_recall = np.zeros(num_classes)

# Each run gets its own fixed seed so that weight initialisation, dropout and batch shuffling
# (which all draw from torch's global generator) can be reproduced, while still differing between runs.
BASE_SEED = 42

# Main loop to train and evaluate the model multiple times
for run_idx in range(num_runs):
    torch.manual_seed(BASE_SEED + run_idx)
    model = MLP(input_size, num_classes)
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    macro_avg_precision, macro_avg_recall, class_precision, class_recall = train_and_evaluate_model(model, train_loader, test_loader, criterion, optimizer)
    sum_macro_avg_precision += macro_avg_precision
    sum_macro_avg_recall += macro_avg_recall
    sum_class_precision += class_precision
    sum_class_recall += class_recall

# Calculate averages
average_macro_avg_precision = sum_macro_avg_precision / num_runs
average_macro_avg_recall = sum_macro_avg_recall / num_runs
average_class_precision = sum_class_precision / num_runs
average_class_recall = sum_class_recall / num_runs

print(f"Average Macro Average Precision: {average_macro_avg_precision * 100:.2f}%")
print(f"Average Macro Average Recall: {average_macro_avg_recall * 100:.2f}%")
for i in range(num_classes):
    # NaN is reported as 0.00% for precision and recall alike.
    precision_str = f"{average_class_precision[i] * 100:.2f}%" if not np.isnan(average_class_precision[i]) else "0.00%"
    recall_str = f"{average_class_recall[i] * 100:.2f}%" if not np.isnan(average_class_recall[i]) else "0.00%"
    print(f"Average Precision for Class {i}: {precision_str}")
    print(f"Average Recall for Class {i}: {recall_str}")

In [None]:
# One-row summary of the run configuration and the averaged metrics.
# It gets its own name so that it does not overwrite `data`, the merged feature/label DataFrame
# that the split cell reads.
results_row = {
    "Model Type": [model_type],
    "Agency": [Agency_no],
    "Windows": [windows],
    "Label Days": [label_days],
    "Prediction Days": [prediction_days],
    "Macro Average Precision": [f"{average_macro_avg_precision * 100:.2f}%"],
    "Macro Average Recall": [f"{average_macro_avg_recall * 100:.2f}%"],
}

# Adding per-class precision and recall with handling for NaN values
for i in range(num_classes):
    precision_str = f"{average_class_precision[i] * 100:.2f}%" if not np.isnan(average_class_precision[i]) else "0.00%"
    recall_str = f"{average_class_recall[i] * 100:.2f}%" if not np.isnan(average_class_recall[i]) else "0.00%"
    results_row[f"Precision Class {i}"] = [precision_str]
    results_row[f"Recall Class {i}"] = [recall_str]

# Creating a DataFrame
df = pd.DataFrame(results_row)

# Display the DataFrame to verify
print(df)

# Save the DataFrame to a CSV file, creating the output directory first so that a fresh
# checkout does not fail on the write.
filename = "NRes/Model_Results_Local_3.csv"
Path(filename).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filename, index=False)