In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import datetime

In [12]:
def log_to_file(message, filename="output.txt"):
    with open(filename, "a") as file:
        file.write(message + "\n")

In [13]:
# Generator model definition
class Generator(nn.Module):
    print("Inside Generator")
    def __init__(self, latent_dim, input_dim, num_classes):
        super(Generator, self).__init__()
        self.label_embedding = nn.Embedding(num_classes, latent_dim)
        
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(True),
            nn.BatchNorm1d(128, momentum=0.8),
            nn.Linear(128, 256),
            nn.ReLU(True),
            nn.BatchNorm1d(256, momentum=0.8),
            nn.Linear(256, input_dim),
            nn.Tanh()
        )

    def forward(self, noise, labels):
        gen_input = torch.mul(self.label_embedding(labels), noise)
        data = self.model(gen_input)
        return data

Inside Generator


In [14]:
import torch.nn.functional as F

In [15]:
class Discriminator(nn.Module):
    print("Inside Discriminator")
    log_to_file("Inside Discriminator")

    def __init__(self, input_dim, num_classes):
        super(Discriminator, self).__init__()
        
        self.model = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.25),
            nn.Conv1d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.25),
            nn.BatchNorm1d(32),  # Use BatchNorm instead of SyncBatchNorm for CPU
            nn.Conv1d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.25),
            nn.BatchNorm1d(64),  # Use BatchNorm instead of SyncBatchNorm for CPU
            nn.Conv1d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.25),
            nn.Flatten()
        )

        final_conv_size = self._get_conv_output(input_dim)
        
        self.validity_output = nn.Linear(final_conv_size, 1)
        self.label_output = nn.Linear(final_conv_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def _get_conv_output(self, shape):
        with torch.no_grad():
            dummy_input = torch.zeros(1, 1, shape)
            output = self.model(dummy_input)
            return int(np.prod(output.size()))

    def forward(self, data):
        features = self.model(data)
        validity = torch.sigmoid(self.validity_output(features))
        label = self.softmax(self.label_output(features))
        return validity, label

Inside Discriminator


In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [17]:
class ACGAN():
    def __init__(self, input_dim, latent_dim, num_classes, rank):
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.num_classes = num_classes

        # Initialize lists to store synthetic data and labels
        self.synthetic_data = []
        self.synthetic_labels = []

        self.generator = Generator(latent_dim, input_dim, num_classes)
        self.discriminator = Discriminator(input_dim, num_classes)

        # No GPU, so just use the models as they are
        self.generator = self.generator.to('cpu')
        self.discriminator = self.discriminator.to('cpu')

        self.adversarial_loss = nn.BCELoss()
        self.classifier_loss = nn.CrossEntropyLoss()

        self.optimizer_G = optim.Adam(self.generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
        self.optimizer_D = optim.Adam(self.discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    def train(self, X_train, y_train, epochs, batch_size, sample_interval, validation_data, rank, minority_samples=2000):
        print("Inside Train")
        log_to_file("Starting Train...")
        torch.autograd.set_detect_anomaly(True)

        X_train = X_train[:, np.newaxis, :]  # Add channel dimension
        X_val, y_val = validation_data
        X_val = X_val[:, np.newaxis, :]
        print("Inside Train")
        train_dataset = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).long())
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        val_dataset = TensorDataset(torch.tensor(X_val).float(), torch.tensor(y_val).long())
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        print("Inside Train")
        for epoch in range(epochs):
            train_g_loss, train_d_loss = 0, 0
            train_correct, train_total = 0, 0
            self.generator.train()
            self.discriminator.train()

            for i, (data, labels) in enumerate(train_loader):
                data = data.to('cpu')
                labels = labels.to('cpu')
                batch_size = data.size(0)
                valid = torch.ones(batch_size, 1, device='cpu')
                fake = torch.zeros(batch_size, 1, device='cpu')

                real_data = data

                # Initialize g_loss and gen_data
                g_loss = torch.tensor(0.0, device='cpu')
                gen_data = None

                # Train Generator for Minority Class
                if torch.sum(labels == 1) > 0:
                    self.optimizer_G.zero_grad()

                    z = torch.randn(minority_samples, self.latent_dim, device='cpu')
                    gen_labels = torch.ones(minority_samples, device='cpu').long()

                    gen_data = self.generator(z, gen_labels)
                    gen_data = gen_data.unsqueeze(1)  # Shape: [minority_samples, 1, num_features]

                    self.synthetic_data.append(gen_data.detach().cpu().numpy())
                    self.synthetic_labels.append(gen_labels.detach().cpu().numpy())

                    # Adjust the valid tensor size to match the generated data size
                    valid_fake = torch.ones(gen_data.size(0), 1, device='cpu')

                    validity, pred_label = self.discriminator(gen_data)
                    g_loss = self.adversarial_loss(validity, valid_fake) + \
                             self.classifier_loss(pred_label, gen_labels)
                    g_loss.backward()
                    self.optimizer_G.step()

                # Train Discriminator
                self.optimizer_D.zero_grad()

                validity_real, label_real = self.discriminator(real_data)
                valid = torch.ones(validity_real.size(0), 1, device='cpu')  # Adjust the size of 'valid' to match 'validity_real'
                d_real_loss = self.adversarial_loss(validity_real, valid) + self.classifier_loss(label_real, labels)

                d_loss = d_real_loss  # Start with real data loss

                # If gen_data was generated, include it in the discriminator loss
                if gen_data is not None:
                    validity_fake, label_fake = self.discriminator(gen_data.detach())
                    fake_fake = torch.zeros(validity_fake.size(0), 1, device='cpu')  # Adjust the size of 'fake_fake' to match 'validity_fake'
                    d_fake_loss = self.adversarial_loss(validity_fake, fake_fake) + \
                                 self.classifier_loss(label_fake, gen_labels)
                    d_loss = (d_real_loss + d_fake_loss) / 2

                d_loss.backward()
                self.optimizer_D.step()

                train_d_loss += d_loss.item()
                train_g_loss += g_loss.item()

                _, predicted_labels = torch.max(label_real, 1)
                train_total += labels.size(0)
                train_correct += (predicted_labels == labels).sum().item()

            train_d_loss /= len(train_loader)
            train_g_loss /= len(train_loader)
            train_accuracy = 100. * train_correct / train_total


            val_correct, val_total, val_d_loss = 0, 0, 0
            self.generator.eval()
            self.discriminator.eval()

            with torch.no_grad():
                for val_data, val_labels in val_loader:
                    val_data = val_data.to('cpu')
                    val_labels = val_labels.to('cpu')

                    validity_real, label_real = self.discriminator(val_data)
                    valid = torch.ones(validity_real.size(0), 1, device='cpu')  # Adjust the size of 'valid' to match 'validity_real'
                    d_real_loss = self.adversarial_loss(validity_real, valid) + \
                                 self.classifier_loss(label_real, val_labels)
                    val_d_loss += d_real_loss.item()

                    _, predicted_labels = torch.max(label_real, 1)
                    val_total += val_labels.size(0)
                    val_correct += (predicted_labels == val_labels).sum().item()

            val_d_loss /= len(val_loader)
            val_accuracy = 100. * val_correct / val_total

            print(f"Rank {rank} | Epoch {epoch+1}/{epochs} | "
                  f"Train Loss: D={train_d_loss:.4f}, G={train_g_loss:.4f} | "
                  f"Train Accuracy: {train_accuracy:.2f}% | "
                  f"Val Loss: D={val_d_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%")
            
            # log_to_file(f"Rank {rank} | Epoch {epoch+1}/{epochs} | "
            #         f"Train Loss: D={train_d_loss:.4f}, G={train_g_loss:.4f} | "
            #         f"Train Accuracy: {train_accuracy:.2f}% | "
            #         f"Val Loss: D={val_d_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%")


    def test(self, X_test, y_test, batch_size, rank):
        print("Inside Test")
        log_to_file("Inside Test...")
        X_test = X_test[:, np.newaxis, :]  # Add channel dimension
        test_dataset = TensorDataset(torch.tensor(X_test).float(), torch.tensor(y_test).long())
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        test_correct, test_total = 0, 0
        test_d_loss = 0

        self.generator.eval()
        self.discriminator.eval()

        all_labels = []
        all_predictions = []

        with torch.no_grad():
            for test_data, test_labels in test_loader:
                test_data = test_data.to('cpu')
                test_labels = test_labels.to('cpu')

                validity_real, label_real = self.discriminator(test_data)
                valid = torch.ones(validity_real.size(0), 1, device='cpu')  # Adjust the size of 'valid' to match 'validity_real'
                d_real_loss = self.adversarial_loss(validity_real, valid) + self.classifier_loss(label_real, test_labels)

                test_d_loss += d_real_loss.item()

                _, predicted_labels = torch.max(label_real, 1)
                test_total += test_labels.size(0)
                test_correct += (predicted_labels == test_labels).sum().item()

                all_labels.append(test_labels.numpy())
                all_predictions.append(predicted_labels.numpy())

        test_d_loss /= len(test_loader)
        test_accuracy = 100. * test_correct / test_total

        # Flatten the lists of labels and predictions
        all_labels = np.concatenate(all_labels)
        all_predictions = np.concatenate(all_predictions)

        # Calculate Precision, Recall, and F1-Score for the minority class (Bankrupt, class 1)
        precision = precision_score(all_labels, all_predictions, pos_label=1)
        recall = recall_score(all_labels, all_predictions, pos_label=1)
        f1 = f1_score(all_labels, all_predictions, pos_label=1)

        # log_to_file(f"Rank {rank} | Epoch {epoch+1}/{epochs} | "
        #             f"Train Loss: D={train_d_loss:.4f}, G={train_g_loss:.4f} | "
        #             f"Train Accuracy: {train_accuracy:.2f}% | "
        #             f"Val Loss: D={val_d_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%")



In [18]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit

In [19]:
def main_worker(rank, world_size, data_path):
    print("Inside Main Worker")
    log_to_file("Inside Main worker...")
    # Load and preprocess the dataset
    df = pd.read_excel(data_path)

    # Remove the first two non-numeric columns
    df = df.iloc[:, 2:]  # Assuming the first two columns are non-numeric and need to be removed

    # Ensure all remaining columns are numeric, coerce non-numeric data to NaN
    df = df.apply(pd.to_numeric, errors='coerce')

    # Handle missing values by filling NaN with the mean of each column
    df.fillna(df.mean(), inplace=True)

    X = df.drop(columns=['BankRupt']).values
    y = df['BankRupt'].values

    # Normalize data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    print("Scaling Complete")

    # Separate the minority and majority classes
    X_minority = X[y == 1]
    X_majority = X[y == 0]
    y_minority = y[y == 1]
    y_majority = y[y == 0]

    # Split the minority class first into 60:20:20
    X_train_min, X_temp_min, y_train_min, y_temp_min = train_test_split(X_minority, y_minority, test_size=0.4, random_state=42)
    X_val_min, X_test_min, y_val_min, y_test_min = train_test_split(X_temp_min, y_temp_min, test_size=0.5, random_state=42)
    
    # Now split the majority class
    X_train_maj, X_temp_maj, y_train_maj, y_temp_maj = train_test_split(X_majority, y_majority, test_size=0.4, random_state=42)
    X_val_maj, X_test_maj, y_val_maj, y_test_maj = train_test_split(X_temp_maj, y_temp_maj, test_size=0.5, random_state=42)
    print("Splitting Complete")
    # Combine the splits back together
    X_train = np.concatenate([X_train_min, X_train_maj])
    X_val = np.concatenate([X_val_min, X_val_maj])
    X_test = np.concatenate([X_test_min, X_test_maj])
    y_train = np.concatenate([y_train_min, y_train_maj])
    y_val = np.concatenate([y_val_min, y_val_maj])
    y_test = np.concatenate([y_test_min, y_test_maj])

    # Parameters
    input_dim = X.shape[1]  # Number of features in your dataset
    latent_dim = 100
    num_classes = 2  # Binary classification (0: Not Bankrupt, 1: Bankrupt)

    # Initialize ACGAN
    acgan = ACGAN(input_dim=input_dim, latent_dim=latent_dim, num_classes=num_classes, rank=rank)
    print("ACGAN initialized")
    # Train the model with control over the number of synthetic samples for the minority class
    acgan.train(X_train, y_train, epochs=1000, batch_size=16, sample_interval=200, validation_data=(X_val, y_val), rank=rank, minority_samples=10)
    print("Training Complete")


    # Combine synthetic data with original data
    synthetic_data = np.concatenate(acgan.synthetic_data, axis=0).squeeze()  # Remove extra channel dimension
    synthetic_labels = np.concatenate(acgan.synthetic_labels, axis=0)
    
    # Original data as DataFrames
    original_data = np.concatenate([X, y.reshape(-1, 1)], axis=1)
    original_df = pd.DataFrame(original_data, columns=df.columns)

    # Synthetic data as DataFrame
    synthetic_df = pd.DataFrame(np.concatenate([synthetic_data, synthetic_labels.reshape(-1, 1)], axis=1), columns=df.columns)

    # Combine original and synthetic data
    combined_df = pd.concat([original_df, synthetic_df], ignore_index=True)

    # Save combined dataset to an Excel file
    combined_df.to_csv("CombinedDataset_with_SyntheticData.csv", index=False)
    print("Final dataset with synthetic data saved as CombinedDataset_with_SyntheticData.csv")




    # Test the model
    acgan.test(X_test, y_test, batch_size=64, rank=rank)
    print("Testing Complete")

In [21]:
if __name__ == "__main__":
    main_worker(rank=0, world_size=1, data_path='CompletedDataset.xlsx')

Inside Main Worker
Scaling Complete
Splitting Complete
ACGAN initialized
Inside Train
Inside Train
Inside Train
Rank 0 | Epoch 1/1000 | Train Loss: D=0.6082, G=0.4820 | Train Accuracy: 97.40% | Val Loss: D=0.4437 | Val Accuracy: 97.77%
Rank 0 | Epoch 2/1000 | Train Loss: D=0.3659, G=0.9990 | Train Accuracy: 97.54% | Val Loss: D=0.8979 | Val Accuracy: 96.23%
Rank 0 | Epoch 3/1000 | Train Loss: D=0.3407, G=1.3505 | Train Accuracy: 97.94% | Val Loss: D=0.3461 | Val Accuracy: 97.73%
Rank 0 | Epoch 4/1000 | Train Loss: D=0.3351, G=1.6022 | Train Accuracy: 98.04% | Val Loss: D=0.3679 | Val Accuracy: 97.46%
Rank 0 | Epoch 5/1000 | Train Loss: D=0.3361, G=1.7717 | Train Accuracy: 98.03% | Val Loss: D=0.3600 | Val Accuracy: 97.64%
Rank 0 | Epoch 6/1000 | Train Loss: D=0.3397, G=1.7423 | Train Accuracy: 98.02% | Val Loss: D=0.3451 | Val Accuracy: 97.83%
Rank 0 | Epoch 7/1000 | Train Loss: D=0.3474, G=1.6998 | Train Accuracy: 97.87% | Val Loss: D=0.4187 | Val Accuracy: 97.97%
Rank 0 | Epoch 8/100

  _warn_prf(average, modifier, msg_start, len(result))


NameError: name 'epoch' is not defined