In [None]:
!nvidia-smi

# seed

In [None]:
import random
import numpy as np
import torch


def set_seed(seed):
    """
    Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    then puts cuDNN into deterministic mode with autotuning switched off.

    Args:
        seed (int): Seed value shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # manual_seed_all covers every visible GPU (the current one included),
        # so no separate torch.cuda.manual_seed call is needed.
        torch.cuda.manual_seed_all(seed)
    # Trade speed for repeatability: fixed algorithms, no autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Fixed seed shared by all random number generators; applied once, right after
# set_seed is defined.
seed = 11032006
set_seed(seed)


In [None]:
import torch

# Report whether PyTorch can see a CUDA device (True) or not (False).
print(torch.cuda.is_available())

# define dataset for model

In [None]:
import csv
import numpy as np
from torch.utils.data import Dataset
import time
import os


class GoDataset(Dataset):
    """
    Dataset of Go games read from a CSV file, one game per row.

    Every sample is a randomly positioned window of ``length`` consecutive
    moves, each encoded as a one-hot ``19 x 19 x 2`` board, together with the
    move that immediately follows the window as the label.
    """

    def __init__(self, path_of_data, length):
        """
        Load every game of the CSV file into memory.

        Args:
            path_of_data (str): Path to the CSV file containing Go game data.
            length (int): Number of consecutive moves in each sample window.
        """
        super().__init__()
        self.path = path_of_data
        self.preprocessed_path = "data/preprocessed data"
        self.length = length
        # Board coordinates are the letters 'a'..'s' mapped to 0..18.
        self.char2idx = {c: i for i, c in enumerate("abcdefghijklmnopqrs")}
        # Nothing in this class reads the preprocessed directory, so a missing
        # directory must not prevent the dataset from being created.
        if os.path.isdir(self.preprocessed_path):
            self.dir_len = len(os.listdir(self.preprocessed_path))
        else:
            self.dir_len = 0

        # Load data from CSV file: one list of string cells per game.
        with open(self.path, newline="") as csvfile:
            reader = csv.reader(csvfile, delimiter=",")
            self.data = list(reader)  # dtype: list[list[str]]

    def _encode_move(self, move):
        """
        Split one move cell into ``(plane, x, y)``.

        Character 0 selects the plane ("B" -> 0, anything else -> 1) and
        characters 2 and 3 are the board coordinates (letters 'a'..'s').
        """
        plane = 0 if move[0] == "B" else 1
        return plane, self.char2idx[move[2]], self.char2idx[move[3]]

    def __read_from_file(self, row):
        """
        Build one training sample from a game row.

        Args:
            row (list[str]): Cells of one CSV row. Sampling starts at column 2,
                so columns 0 and 1 are never read as moves (presumably game
                metadata -- confirm against the CSV).

        Returns:
            tuple[torch.Tensor, torch.Tensor]: ``boards`` of shape
            ``(length, 19, 19, 2)`` and ``label`` of shape ``(1,)`` holding
            ``plane * 361 + x * 19 + y`` of the move after the window.

        Raises:
            ValueError: If the row is too short to hold a window plus a label.
        """
        first_move_column = 2
        # Exclusive upper bound: the label cell at start + length must exist.
        upper_bound = len(row) - self.length
        if upper_bound <= first_move_column:
            raise ValueError(
                f"row has {len(row)} cells, but at least "
                f"{first_move_column + self.length + 1} are required for a "
                f"window of {self.length} moves plus a label"
            )
        random_start = np.random.randint(first_move_column, upper_bound)

        # One board per move, with a single 1 at the played point.
        boards = torch.zeros((self.length, 19, 19, 2), dtype=torch.float32)
        for offset in range(self.length):
            plane, x, y = self._encode_move(row[random_start + offset])
            boards[offset, x, y, plane] = 1

        plane, x, y = self._encode_move(row[random_start + self.length])
        label = torch.tensor([plane * 361 + x * 19 + y], dtype=torch.float32)

        return boards, label

    def __len__(self):
        """
        Returns the number of samples in the dataset.
        Returns:
            int: Number of games (CSV rows) loaded.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Get a randomly positioned sample from the game at the given index.
        Args:
            idx (int): Index of the game (CSV row).
        Returns:
            tuple[torch.Tensor, torch.Tensor]: ``(boards, label)`` as produced
            by ``__read_from_file``; the window position is drawn anew on
            every call.
        """
        row = self.data[idx]
        return self.__read_from_file(row)

# conformer

In [None]:
import torch
import torch.nn as nn


class ConvModule(nn.Module):
    """
    Convolution sub-module of a Conformer block.

    Pipeline: LayerNorm -> pointwise conv (expands to ``2 * num_channels``)
    -> GLU -> depthwise conv -> GroupNorm or BatchNorm1d -> SiLU
    -> pointwise conv (back to ``input_dim``) -> Dropout.

    Args:
        input_dim (int): input dimension.
        num_channels (int): number of channels seen by the depthwise convolution.
        depthwise_kernel_size (int): kernel size of the depthwise convolution; must be odd.
        dropout (float, optional): dropout probability. (Default: 0.0)
        bias (bool, optional): whether the convolution layers carry a bias term. (Default: ``False``)
        use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``)

    Raises:
        ValueError: if ``depthwise_kernel_size`` is even.
    """

    def __init__(
        self,
        input_dim: int,
        num_channels: int,
        depthwise_kernel_size: int,
        dropout: float = 0.0,
        bias: bool = False,
        use_group_norm: bool = False,
    ) -> None:
        super().__init__()
        # An odd kernel leaves an even remainder to split on both sides.
        same_padding, leftover = divmod(depthwise_kernel_size - 1, 2)
        if leftover != 0:
            raise ValueError(
                "depthwise_kernel_size must be odd to achieve 'SAME' padding."
            )

        # Normalisation applied to the input before any convolution
        self.layer_norm = nn.LayerNorm(input_dim)

        # Doubles the channel count so that GLU can halve it again.
        pointwise_expand = nn.Conv1d(
            input_dim, 2 * num_channels, 1, stride=1, padding=0, bias=bias
        )
        gate = nn.GLU(dim=1)
        # groups == channels: every channel is filtered on its own.
        depthwise = nn.Conv1d(
            num_channels,
            num_channels,
            depthwise_kernel_size,
            stride=1,
            padding=same_padding,
            groups=num_channels,
            bias=bias,
        )
        if use_group_norm:
            normalization = nn.GroupNorm(num_groups=1, num_channels=num_channels)
        else:
            normalization = nn.BatchNorm1d(num_channels)
        activation = nn.SiLU()
        pointwise_project = nn.Conv1d(
            num_channels, input_dim, kernel_size=1, stride=1, padding=0, bias=bias
        )

        self.sequential = nn.Sequential(
            pointwise_expand,
            gate,
            depthwise,
            normalization,
            activation,
            pointwise_project,
            nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """
        Run the convolution module.

        Args:
            input (torch.Tensor): tensor of shape `(B, T, D)`
                (batch, sequence length, input dimension).

        Returns:
            torch.Tensor: tensor of shape `(B, T, D)`.
        """
        normalized = self.layer_norm(input)
        # Conv1d wants channels before time: `(B, D, T)`
        channels_first = normalized.transpose(1, 2)
        convolved = self.sequential(channels_first)
        # Restore the `(B, T, D)` layout
        return convolved.transpose(1, 2)


class FeedForwardModule(nn.Module):
    """
    Position-wise feed-forward network:
    LayerNorm -> Linear -> SiLU -> Dropout -> Linear -> Dropout.

    Args:
        input_dim (int): Size of the input and output features.
        hidden_dim (int): Size of the hidden layer between the two Linear layers.
        dropout (float, optional): Dropout probability. (Default: 0.1)
    """

    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super().__init__()
        stages = [
            nn.LayerNorm(input_dim),
            nn.Linear(input_dim, hidden_dim),  # expand to the hidden size
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, input_dim),  # project back to the input size
            nn.Dropout(dropout),
        ]
        self.module = nn.Sequential(*stages)

    def forward(self, x):
        """
        Apply the feed-forward network to the last dimension of ``x``.

        Args:
            x (torch.Tensor): Tensor whose last dimension is ``input_dim``,
                e.g. `(B, T, D)`.

        Returns:
            torch.Tensor: Tensor with the same shape as ``x``.
        """
        transformed = self.module(x)
        return transformed


class ConformerBlock(nn.Module):
    """
    One Conformer layer: half-step feed-forward, self-attention, convolution
    and a second half-step feed-forward, each wrapped in a residual connection.

    Args:
        input_dim (int): input dimension.
        ffn_dim (int): hidden layer dimension of the feedforward networks.
        num_attention_heads (int): number of attention heads.
        depthwise_conv_kernel_size (int): kernel size of the depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.1)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): run the convolution module before
            the attention module instead of after it. (Default: ``False``)
    """

    def __init__(
        self,
        input_dim,
        ffn_dim,
        num_attention_heads,
        depthwise_conv_kernel_size,
        dropout=0.1,
        use_group_norm=False,
        convolution_first=False,
    ):
        super().__init__()
        self.ffn1 = FeedForwardModule(input_dim, ffn_dim, dropout)
        self.ffn2 = FeedForwardModule(input_dim, ffn_dim, dropout)
        self.conv = ConvModule(
            input_dim=input_dim,
            num_channels=input_dim,
            depthwise_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            use_group_norm=use_group_norm,
        )
        self.self_attn = nn.MultiheadAttention(
            embed_dim=input_dim, num_heads=num_attention_heads, dropout=dropout
        )
        self.self_attn_dropout = nn.Dropout(dropout)
        # NOTE(review): this single LayerNorm is used both before attention and
        # as the final normalisation, so the two share parameters -- confirm
        # that this is intended.
        self.layer_norm = nn.LayerNorm(input_dim)
        self.convolution_first = convolution_first

    def __apply_conv(self, x):
        """
        Run the convolution module with a residual connection.

        Args:
            x (torch.Tensor): Input tensor with shape `(T, B, D)`.

        Returns:
            torch.Tensor: ``x`` plus the convolution output, shape `(T, B, D)`.
        """
        # The convolution module works batch-first, so swap the first two
        # axes on the way in and on the way out.
        batch_first = x.transpose(0, 1)
        convolved = self.conv(batch_first).transpose(0, 1)
        return convolved + x

    def forward(self, x):
        """
        Run the block.

        Args:
            x (torch.Tensor): Input tensor with shape `(T, B, D)`.

        Returns:
            torch.Tensor: Output tensor with the same shape as the input tensor.
        """
        # First feed-forward module, added at half weight
        x = 0.5 * self.ffn1(x) + x

        if self.convolution_first:
            x = self.__apply_conv(x)

        # Pre-norm self-attention with a residual connection
        attn_input = self.layer_norm(x)
        attn_output, _ = self.self_attn(attn_input, attn_input, attn_input)
        x = self.self_attn_dropout(attn_output) + x

        if not self.convolution_first:
            x = self.__apply_conv(x)

        # Second feed-forward module, added at half weight, then the final norm
        x = 0.5 * self.ffn2(x) + x
        return self.layer_norm(x)


class Conformer(nn.Module):
    """
    Stack of ``ConformerBlock`` layers applied to a sequence of feature maps.

    Args:
        input_dim (int): input dimension (size of each flattened time step).
        num_heads (int): number of attention heads in each Conformer layer.
        ffn_dim (int): hidden layer dimension of feedforward networks.
        num_layers (int): number of Conformer layers to instantiate.
        depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.1)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)
    """

    def __init__(
        self,
        input_dim,
        num_heads,
        ffn_dim,
        num_layers,
        depthwise_conv_kernel_size,
        dropout=0.1,
        use_group_norm=False,
        convolution_first=False,
    ):
        super().__init__()

        # Instantiate Conformer blocks
        self.conformer_blocks = nn.ModuleList(
            [
                ConformerBlock(
                    input_dim,
                    ffn_dim,
                    num_heads,
                    depthwise_conv_kernel_size,
                    dropout,
                    use_group_norm,
                    convolution_first,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, x: torch.Tensor):
        """
        Flatten every time step and run it through all Conformer blocks.

        Args:
            x (torch.Tensor): input with shape `(B, T, ...)`. All dimensions
                after the first two are flattened into one feature dimension,
                which must equal ``input_dim``; both `(B, T, H, W, C)` boards
                and already flattened `(B, T, input_dim)` tensors are accepted.

        Returns:
            torch.Tensor: output with shape `(B, T, input_dim)`.

        Raises:
            ValueError: if ``x`` has fewer than three dimensions.
        """
        if x.dim() < 3:
            raise ValueError(
                f"expected an input with at least 3 dimensions (B, T, ...), got {x.dim()}"
            )
        # flatten() falls back to a copy when the memory layout rules out a
        # view, so non-contiguous inputs are handled as well.
        x = x.flatten(start_dim=2)

        x = x.transpose(0, 1)  # Transpose to shape `(T, B, input_dim)`

        # Pass input through Conformer blocks
        for layer in self.conformer_blocks:
            x = layer(x)

        x = x.transpose(0, 1)  # Transpose back to shape `(B, T, input_dim)`

        return x


# generator

In [None]:
import torch
import torch.nn as nn


class Generator(nn.Module):
    """
    Conformer encoder followed by a linear head that maps the last time step
    to a single value.

    Args:
        input_dim (int): Input dimension.
        num_heads (int): Number of attention heads in each Conformer layer.
        ffn_dim (int): Hidden layer dimension of feedforward networks in Conformer layers.
        num_layers (int): Number of Conformer layers.
        depthwise_conv_kernel_size (int): Kernel size of depthwise convolution in Conformer layers.
        dropout (float, optional): Dropout probability. (Default: 0.1)
        use_group_norm (bool, optional): Use GroupNorm instead of BatchNorm1d in Conformer layers. (Default: False)
        convolution_first (bool, optional): Apply convolution module ahead of attention module. (Default: False)
    """

    def __init__(
        self,
        input_dim,
        num_heads,
        ffn_dim,
        num_layers,
        depthwise_conv_kernel_size,
        dropout=0.1,
        use_group_norm=False,
        convolution_first=False,
    ):
        super().__init__()

        # Sequence encoder
        self.conformer = Conformer(
            input_dim=input_dim,
            num_heads=num_heads,
            ffn_dim=ffn_dim,
            num_layers=num_layers,
            depthwise_conv_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            use_group_norm=use_group_norm,
            convolution_first=convolution_first,
        )

        # Regression head: one scalar per sample
        self.output_layer = nn.Sequential(
            nn.Linear(input_dim, 1),
        )

    def forward(self, x: torch.Tensor):
        """
        Encode the sequence and predict from its final time step.

        Args:
            x (torch.Tensor): Input tensor passed unchanged to the Conformer.

        Returns:
            torch.Tensor: Output tensor with shape `(B, 1)`.
        """
        # Only the encoding of the last time step feeds the head.
        last_step = self.conformer(x)[:, -1, :]
        return self.output_layer(last_step)


# discriminator

In [None]:
import torch
import torch.nn as nn


class Discriminator(nn.Module):
    """
    Scores a candidate value ``y`` for an input sequence ``x``.

    The module owns its own ``Generator`` instance; that generator's scalar
    output is concatenated with ``y`` and mapped through ``Linear(2, 1)`` and
    ``Tanh`` to a single score per sample.

    Args:
        input_dim (int): Input dimension of the internal Generator.
        num_heads (int): Number of attention heads in each Conformer layer.
        ffn_dim (int): Hidden layer dimension of feedforward networks in Conformer.
        num_layers (int): Number of Conformer layers.
        depthwise_conv_kernel_size (int): Kernel size of depthwise convolution in Conformer.
        dropout (float, optional): Dropout probability. (Default: 0.1)
        use_group_norm (bool, optional): Use GroupNorm instead of BatchNorm1d in Conformer layers. (Default: False)
        convolution_first (bool, optional): Apply convolution module ahead of attention module. (Default: False)
    """

    def __init__(
        self,
        input_dim,
        num_heads,
        ffn_dim,
        num_layers,
        depthwise_conv_kernel_size,
        dropout=0.1,
        use_group_norm=False,
        convolution_first=False,
    ):
        super().__init__()

        # Internal sequence model producing one scalar per sample
        self.generator = Generator(
            input_dim=input_dim,
            num_heads=num_heads,
            ffn_dim=ffn_dim,
            num_layers=num_layers,
            depthwise_conv_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            use_group_norm=use_group_norm,
            convolution_first=convolution_first,
        )

        # TODO: try different activation functions
        # Scoring head: [internal prediction, candidate] -> one value after Tanh
        self.linear = nn.Sequential(
            nn.Linear(2, 1),
            nn.Tanh(),
        )

    def forward(self, x, y):
        """
        Score the candidate ``y`` given the sequence ``x``.

        Args:
            x (torch.Tensor): Input sequence, passed unchanged to the internal Generator.
            y (torch.Tensor): Candidate values with shape `(B, 1)`.

        Returns:
            torch.Tensor: Scores with shape `(B, 1)`.
        """
        own_prediction = self.generator(x)
        # Two features per sample: the internal prediction and the candidate.
        features = torch.cat([own_prediction, y], dim=1)
        return self.linear(features)

# trainer

In [None]:
from tqdm import tqdm
from torch.utils.data import DataLoader
import gc
import datetime
from IPython.display import clear_output


class Trainer:
    """
    Trains a ``Generator`` against a ``Discriminator``.

    Each epoch runs adversarial steps (discriminator loss
    ``-mean(real) + mean(fake)`` plus a gradient penalty, generator loss
    ``-mean(fake)``), then one plain MSE regression pass for the generator,
    then validation of both models.

    Args:
        config (dict): Settings. Keys read by this class: ``input_dim``,
            ``num_heads``, ``ffn_dim``, ``num_layers``,
            ``depthwise_conv_kernel_size``, ``dropout``, ``use_group_norm``,
            ``convolution_first``, ``device``, ``lr``, ``clip_value``,
            ``epochs``, ``early_stop``, ``gen_path``, ``dis_path`` and
            ``selected``.
        train_loader (DataLoader): Batches of ``(x, y)`` used for training.
        val_loader (DataLoader): Batches of ``(x, y)`` used for validation.
        load_model (bool, optional): Load both models from ``G_path`` and
            ``D_path`` instead of creating new ones. (Default: ``False``)
        from_epoch (int, optional): Offset added to the epoch index when
            deciding whether to save and when naming checkpoints. (Default: 0)
        G_path (str, optional): File holding a saved generator.
        D_path (str, optional): File holding a saved discriminator.
    """

    def __init__(
        self,
        config: dict,
        train_loader: DataLoader,
        val_loader: DataLoader,
        load_model: bool = False,
        from_epoch: int = 0,
        G_path: str = None,
        D_path: str = None,
    ):
        self.config = config
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.from_epoch = from_epoch

        if load_model:
            # NOTE(security): these files hold whole pickled modules (see the
            # torch.save calls in evaluate_G); unpickling runs arbitrary code,
            # so only load files you trust.
            # NOTE(review): recent PyTorch releases default torch.load to
            # weights_only=True, which rejects whole-module files -- confirm
            # against the installed version.
            # map_location lets a model saved on a GPU load on a CPU-only host.
            self.gen = torch.load(G_path, map_location=config["device"])
            self.dis = torch.load(D_path, map_location=config["device"])
        else:
            self.gen = Generator(
                input_dim=config["input_dim"],
                num_heads=config["num_heads"],
                ffn_dim=config["ffn_dim"],
                num_layers=config["num_layers"],
                depthwise_conv_kernel_size=config["depthwise_conv_kernel_size"],
                dropout=config["dropout"],
                use_group_norm=config["use_group_norm"],
                convolution_first=config["convolution_first"],
            )
            self.dis = Discriminator(
                input_dim=config["input_dim"],
                num_heads=config["num_heads"],
                ffn_dim=config["ffn_dim"],
                num_layers=config["num_layers"],
                depthwise_conv_kernel_size=config["depthwise_conv_kernel_size"],
                dropout=config["dropout"],
                use_group_norm=config["use_group_norm"],
                convolution_first=config["convolution_first"],
            )
        self.gen.to(self.config["device"])
        self.dis.to(self.config["device"])

        # The generator has two optimizers over the same parameters: one for
        # the adversarial loss and one for the plain MSE loss.
        self.G_optimizer = torch.optim.Adam(self.gen.parameters(), lr=config["lr"])
        self.G_normal_optimizer = torch.optim.Adam(
            self.gen.parameters(), lr=config["lr"]
        )
        self.D_optimizer = torch.optim.Adam(self.dis.parameters(), lr=config["lr"])
        self.G_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.G_optimizer)
        self.G_normal_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.G_normal_optimizer
        )
        self.D_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.D_optimizer)

        self.criterion = nn.MSELoss()

        self.clip_value = config["clip_value"]

        # Early-stopping state, updated by evaluate_G
        self.early_count = 0
        self.best_val_loss = float("inf")

    def _checkpoint_path(self, directory, epoch_number):
        """Build ``<directory>/<selected>_epoch<N>.pth`` for a checkpoint file."""
        return (
            directory
            + "/"
            + str(self.config["selected"])
            + "_"
            + f'epoch{epoch_number}'
            + ".pth"
        )

    def normal_evaluate_G(self, G_val_losses: list):
        """
        Append the generator's MSE loss of every validation batch to ``G_val_losses``.

        Args:
            G_val_losses (list): Receives one float per validation batch.
        """
        self.gen.eval()
        # Validation needs no autograd graph.
        with torch.no_grad():
            for i, (x, y) in enumerate(tqdm(self.val_loader)):
                x = x.to(self.config["device"])
                y = y.to(self.config["device"])
                output = self.gen(x)
                loss = self.criterion(output, y)
                G_val_losses.append(loss.item())

    def evaluate_G(self, G_val_losses: list, G_accs: list, epoch):
        """
        Validate the generator, step its schedulers, update the early-stopping
        state and save both models when ``epoch + from_epoch`` is a multiple of 10.

        Args:
            G_val_losses (list): Receives the average validation loss
                (MSE minus the mean discriminator score of the prediction).
            G_accs (list): Receives the fraction of validation samples whose
                prediction, rounded to the nearest integer, equals the label.
            epoch (int): Index of the epoch that just finished.
        """
        print(f"Evaluating generator:")

        # Set the generator and discriminator in evaluation mode
        self.gen.eval()
        self.dis.eval()
        total_loss = 0
        total_correct = 0

        # Iterate through the validation loader
        for i, (x, y) in enumerate(tqdm(self.val_loader)):
            x = x.to(self.config["device"])
            y = y.to(self.config["device"])

            with torch.no_grad():
                output = self.gen(x)
                # Score of the generator's prediction
                fake_pred = self.dis(x, output)
                loss = self.criterion(output, y) + -torch.mean(fake_pred)

            # Both tensors carry one value per sample, so an argmax over
            # dim 1 would always be 0 and report 100% accuracy. Compare the
            # rounded prediction with the label instead.
            correct = torch.sum(torch.round(output) == y)
            total_correct += correct.item()
            total_loss += loss.item()

        self.G_scheduler.step(total_loss)
        self.G_normal_scheduler.step(total_loss)

        # Calculate and store the average generator validation loss
        average_loss = total_loss / len(self.val_loader)
        G_val_losses.append(average_loss)

        # Calculate and store the validation accuracy
        accuracy = total_correct / len(self.val_loader.dataset)
        G_accs.append(accuracy)
        print(f"G Validation accuracy: {accuracy}")

        # Early stopping counts epochs without a new best validation loss.
        if average_loss < self.best_val_loss:
            self.best_val_loss = average_loss
            self.early_count = 0
        else:
            self.early_count += 1

        epoch_number = epoch + self.from_epoch
        if epoch_number % 10 == 0:
            torch.save(
                self.gen, self._checkpoint_path(self.config["gen_path"], epoch_number)
            )
            torch.save(
                self.dis, self._checkpoint_path(self.config["dis_path"], epoch_number)
            )

    def evaluate_D(self, D_val_losses: list, D_accs: list):
        """
        Validate the discriminator and step its scheduler.

        A fake sample counts as correct when its score is below 0.5, a real
        sample when its score is above 0.5.

        Args:
            D_val_losses (list): Receives the average of
                ``-mean(real) + mean(fake)`` over the validation batches.
            D_accs (list): Receives the validation accuracy.

        Returns:
            float: The validation accuracy.
        """
        print(f"Evaluating discriminator:")

        # Set the generator and discriminator in evaluation mode
        self.gen.eval()
        self.dis.eval()

        total_loss = 0
        total_correct = 0
        total_fake_loss = 0
        total_real_loss = 0

        # Iterate through the validation loader
        for i, (x, y) in enumerate(tqdm(self.val_loader)):
            x = x.to(self.config["device"])
            y = y.to(self.config["device"])

            with torch.no_grad():
                output = self.gen(x)
                fake_pred = self.dis(x, output)
                real_pred = self.dis(x, y)

            # Keep Python numbers so that no device tensor outlives its batch.
            fake_mean = torch.mean(fake_pred).item()
            real_mean = torch.mean(real_pred).item()
            total_correct += (
                torch.sum(fake_pred < 0.5) + torch.sum(real_pred > 0.5)
            ).item()

            total_loss += -real_mean + fake_mean
            total_fake_loss += fake_mean
            total_real_loss += 1 - real_mean

        self.D_scheduler.step(total_loss)

        # Calculate and store the average discriminator validation loss
        average_loss = total_loss / len(self.val_loader)
        D_val_losses.append(average_loss)
        print(f"Discriminator loss: {average_loss}")
        print(f"Fake loss: {total_fake_loss / len(self.val_loader)}")
        print(f"Real loss: {total_real_loss / len(self.val_loader)}")

        # Every sample is judged twice (once fake, once real), hence the / 2.
        accuracy = total_correct / len(self.val_loader.dataset) / 2
        D_accs.append(accuracy)
        print(f"D Validation accuracy: {accuracy}")

        return accuracy

    def normal_train_G(self, G_losses: list):
        """
        Train the generator for one pass over the training data with MSE loss only.

        Args:
            G_losses (list): Receives the average training loss of this pass.
        """
        self.gen.train()
        total_loss = 0
        for i, (x, y) in enumerate(tqdm(self.train_loader)):
            self.G_normal_optimizer.zero_grad()
            x = x.to(self.config["device"])
            y = y.to(self.config["device"])
            output = self.gen(x)
            loss = self.criterion(output, y)
            loss.backward()
            self.G_normal_optimizer.step()
            total_loss += loss.item()

        G_losses.append(total_loss / len(self.train_loader))
        print(f"Normal Generator loss: {total_loss / len(self.train_loader)}")

    def train_G(self, x, y):
        """
        Run one adversarial generator step on a batch.

        Args:
            x (torch.Tensor): Input batch.
            y (torch.Tensor): Labels; not used by this step, kept so that the
                signature matches ``train_D``.

        Returns:
            float: The generator loss ``-mean(dis(x, gen(x)))``.
        """
        # The evaluate_* methods leave both models in eval mode; switch back
        # so that dropout and batch statistics behave as in training.
        self.gen.train()
        self.dis.train()

        x = x.to(self.config["device"])

        fake_output = self.gen(x)
        fake_pred = self.dis(x, fake_output)

        # The generator is rewarded for a high discriminator score.
        loss = -torch.mean(fake_pred)

        # Backpropagation and optimization step
        self.G_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.gen.parameters(), max_norm=self.clip_value)
        self.G_optimizer.step()

        return loss.item()

    def cal_gradient_penalty(self, real, fake, condition, lambda_gp=10):
        """
        Compute the gradient penalty on random interpolations of real and fake samples.

        Args:
            real (torch.Tensor): Real samples, first dimension is the batch.
            fake (torch.Tensor): Generated samples with the same shape as ``real``.
            condition: First argument passed to the discriminator.
            lambda_gp (float, optional): Weight of the penalty. (Default: 10)

        Returns:
            torch.Tensor: ``lambda_gp * mean((||grad||_2 - 1) ** 2)``.
        """
        batch_size = real.size(0)
        # One mixing coefficient per sample
        alpha = torch.rand((batch_size, 1), dtype=torch.float32, device=real.device)

        # The interpolated points become new leaf tensors: gradients are taken
        # with respect to them and do not flow back into real or fake.
        interpolates = (alpha * real + (1 - alpha) * fake).detach().requires_grad_(True)

        disc_interpolates = self.dis(condition, interpolates)

        # create_graph keeps the gradient differentiable so that the penalty
        # itself can be back-propagated into the discriminator.
        gradients = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=torch.ones_like(disc_interpolates),
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
        )[0]

        # Per-sample L2 norm of the gradient
        gradients = gradients.reshape(batch_size, -1)
        gradient_norm = gradients.norm(2, dim=1)

        gradient_penalty = ((gradient_norm - 1) ** 2).mean()

        return lambda_gp * gradient_penalty

    def train_D(self, x, y):
        """
        Run one discriminator step on a batch.

        Args:
            x (torch.Tensor): Input batch.
            y (torch.Tensor): Real labels for ``x``.

        Returns:
            float: ``-mean(real) + mean(fake) + gradient penalty``.
        """
        # See train_G: evaluation leaves both models in eval mode.
        self.gen.train()
        self.dis.train()

        x = x.to(self.config["device"])
        y = y.to(self.config["device"])

        # Only the discriminator is updated here, so the fake sample needs no
        # graph through the generator.
        with torch.no_grad():
            fake_output = self.gen(x)

        fake_pred = self.dis(x, fake_output)
        real_pred = self.dis(x, y)

        gradient_penalty = self.cal_gradient_penalty(y, fake_output, x)

        # Calculate the total loss: -real + fake + gradient penalty
        loss = -torch.mean(real_pred) + torch.mean(fake_pred) + gradient_penalty

        # Backpropagation and optimization step
        self.D_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.dis.parameters(), max_norm=self.clip_value)
        self.D_optimizer.step()

        return loss.item()

    def train(self):
        """
        Run the full training loop.

        The discriminator is trained only while its latest validation
        accuracy is below 0.8. Training stops early once ``early_count``
        reaches ``config["early_stop"]``.

        Returns:
            dict: Histories under the keys ``G_losses``, ``G_val_losses``,
            ``G_accs``, ``D_losses``, ``D_val_losses`` and ``D_accs``.
        """
        G_losses = []
        G_val_losses = []
        G_accs = []
        D_losses = []
        D_val_losses = []
        D_accs = []

        # Initial accuracy decides whether the discriminator trains in epoch 1.
        D_acc = self.evaluate_D(D_val_losses, D_accs)

        for epoch in range(self.config["epochs"]):
            D_total_loss = 0
            G_total_loss = 0
            print(f'Epoch {epoch+1}/{self.config["epochs"]}')
            for i, (x, y) in enumerate(tqdm(self.train_loader)):
                if D_acc < 0.8:
                    D_loss = self.train_D(x, y)
                    D_total_loss += D_loss

                G_loss = self.train_G(x, y)

                G_total_loss += G_loss

            # Appends this epoch's MSE loss to G_losses
            self.normal_train_G(G_losses)

            print(f"Discriminator loss: {D_total_loss / len(self.train_loader)}")
            D_losses.append(D_total_loss / len(self.train_loader))
            print(f"GAN Generator loss: {G_total_loss / len(self.train_loader)}")
            # The stored generator loss is adversarial loss plus MSE loss.
            G_losses[-1] = G_total_loss / len(self.train_loader) + G_losses[-1]

            D_acc = self.evaluate_D(D_val_losses, D_accs)
            self.evaluate_G(G_val_losses, G_accs, epoch)

            gc.collect()
            torch.cuda.empty_cache()

            # Keep the notebook output short
            if epoch % 5 == 0:
                clear_output(wait=True)

            if self.early_count >= self.config["early_stop"]:
                break

        return {
            "G_losses": G_losses,
            "G_val_losses": G_val_losses,
            "G_accs": G_accs,
            "D_losses": D_losses,
            "D_val_losses": D_val_losses,
            "D_accs": D_accs,
        }

# parameter finder

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import gc

# Search space for the random hyper-parameter search: every key maps to the
# list of candidate values for that setting. Single-element lists are settings
# that stay fixed.
domain = {
    # Size of one flattened time step: 2 planes of 19 x 19 points
    "input_dim": [2 * 19 * 19],
    "num_heads": [1, 2],
    "ffn_dim": [64, 128, 256, 512],
    "num_layers": [2, 4, 8],
    # Must be odd (ConvModule rejects even kernel sizes)
    "depthwise_conv_kernel_size": [3, 5, 7],
    "dropout": [0, 0.1, 0.2, 0.3, 0.4],
    "use_group_norm": [True, False],
    "convolution_first": [True, False],
    "lr": [0.0001, 0.001, 0.01],
    # NOTE(review): Trainer.evaluate_G appends "/<selected>_epoch<N>.pth" to
    # these two values, i.e. it uses them as directories, and reads
    # config["selected"], which is not defined here -- confirm.
    "gen_path": ["data/models/gen.pth"],
    "dis_path": ["data/models/dis.pth"],
    "device": [torch.device("cuda" if torch.cuda.is_available() else "cpu")],
    "batch_size": [128],
    "clip_value": [1],
    # Number of consecutive moves per sample (the GoDataset window length)
    "data_len": [4, 8, 16, 32],
    "epochs": [10],
    "early_stop": [5],
}

# Size of the search space: the product of the number of candidates per key.
count = 1
for key, value in domain.items():
    count *= len(value)
print(f"Total combinations: {count}")


class ParmFinder:
    """Random-search helper for generator hyper-parameters.

    Every trial draws one value per key from ``domain``, builds fresh
    train/validation loaders from the Go dataset, runs ``max_epoch`` rounds of
    ``Trainer.normal_train_G`` / ``Trainer.normal_evaluate_G`` and records the
    final train/validation loss ratio. The history of all trials is rewritten
    to ``G_history_path`` after each trial.

    Args:
        domain: Mapping of parameter name -> sequence of candidate values.
        max_iter: Number of random trials performed by :meth:`find`.
        max_epoch: Number of train/evaluate rounds per trial.
        G_history_path: CSV file receiving the trial history.
        data_path: CSV file handed to ``GoDataset`` for every trial.
    """

    def __init__(
        self,
        domain: dict,
        max_iter: int = 500,
        max_epoch: int = 5,
        G_history_path: str = "data/G_history.csv",
        data_path: str = "data/train/dan_train.csv",
    ) -> None:
        # Best trial seen so far (smallest final train/val loss ratio).
        self.best_ratio = float("inf")
        self.best_params = None
        self.best_train_loader = None
        self.best_val_loader = None

        # Per-trial history; the three lists are kept index-aligned.
        self.G_parms = []
        self.G_train_losses = []
        self.G_ratios = []
        self.D_parms = []  # not used by this class; kept for compatibility

        self.domain = domain
        self.max_iter = max_iter
        self.max_epoch = max_epoch
        self.G_history_path = G_history_path
        self.data_path = data_path

        # Loaders of the most recent trial; defined up front so that find()
        # is well-behaved even when no trial runs.
        self.train_loader = None
        self.val_loader = None

    def __sample_params(self) -> dict:
        """Draw one candidate per key from the domain.

        Lists and tuples are indexed directly so the sampled values keep their
        native Python types (``int``, ``bool``, ``str``, ...) instead of the
        numpy scalar types ``np.random.choice`` would return. Any other
        container still goes through ``np.random.choice``.
        """
        params = {}
        for key, candidates in self.domain.items():
            if isinstance(candidates, (list, tuple)):
                params[key] = candidates[np.random.randint(len(candidates))]
            else:
                params[key] = np.random.choice(candidates)
        return params

    def __random_sample(self):
        """Sample a configuration and rebuild the loaders for its ``data_len``.

        Returns:
            The sampled parameter dict. ``self.train_loader`` and
            ``self.val_loader`` are replaced as a side effect.
        """
        params = self.__sample_params()

        # The dataset depends on the sampled sequence length, so it has to be
        # rebuilt (and re-split 80/20) for every trial.
        goDataset = GoDataset(self.data_path, params["data_len"])
        train_len = int(0.8 * len(goDataset))
        val_len = len(goDataset) - train_len
        train_dataset, val_dataset = torch.utils.data.random_split(
            goDataset, [train_len, val_len]
        )
        batch_size = int(params["batch_size"])
        self.train_loader = DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True
        )
        self.val_loader = DataLoader(
            val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True
        )

        return params

    def __save_G(self):
        """Write all recorded trials (parameters, train loss, loss ratio) to CSV."""
        header = list(self.domain.keys()) + ["train_loss", "loss_ratio"]
        df = pd.DataFrame(self.G_parms, columns=header)
        df["train_loss"] = self.G_train_losses
        df["loss_ratio"] = self.G_ratios
        # NOTE(review): rows are written with the largest ratio first, while
        # best_ratio tracks the smallest one -- confirm the intended ordering.
        df = df.sort_values(by="loss_ratio", ascending=False)
        df.to_csv(self.G_history_path, index=False)

    def __evaluate_G(self, trainer: "Trainer"):
        """Train/evaluate the generator for ``max_epoch`` rounds and record the trial.

        The train loss and train/validation loss ratio of the last round are
        appended to the history; the trial becomes the new best when its ratio
        is strictly smaller than the best one seen so far.
        """
        train_loss = 0
        loss_ratio = 0
        for epoch in range(self.max_epoch):
            # Fresh lists every round; presumably the trainer appends its
            # losses to the list it is given -- verify against Trainer.
            G_losses = []
            G_val_losses = []
            trainer.normal_train_G(G_losses)
            trainer.normal_evaluate_G(G_val_losses)

            train_loss = np.mean(G_losses)
            val_loss = np.mean(G_val_losses)
            loss_ratio = train_loss / val_loss

            print(
                f"Epoch {epoch+1}/{self.max_epoch}: Train Loss: {train_loss}, Val Loss: {val_loss}, Loss Ratio: {loss_ratio}"
            )

        if loss_ratio < self.best_ratio:
            self.best_ratio = loss_ratio
            self.best_params = trainer.config
            # Remember the loaders that belong to the best configuration so
            # find() can return data matching best_params["data_len"].
            self.best_train_loader = self.train_loader
            self.best_val_loader = self.val_loader

        self.G_parms.append(trainer.config)
        self.G_train_losses.append(train_loss)
        self.G_ratios.append(loss_ratio)
        self.__save_G()

    def find(self):
        """Run ``max_iter`` random trials.

        Returns:
            ``(best_params, train_loader, val_loader)``. The loaders are the
            ones built for the best configuration; if no trial qualified as
            best, the loaders of the last trial (or ``None`` when no trial ran)
            are returned.
        """
        for _ in range(self.max_iter):
            params = self.__random_sample()
            trainer = Trainer(params, self.train_loader, self.val_loader)
            self.__evaluate_G(trainer)

            # Drop the trainer first, then collect garbage, and only then ask
            # the CUDA allocator to release cached blocks; otherwise the old
            # model is still alive while the next one is being built.
            del trainer
            gc.collect()
            torch.cuda.empty_cache()

        if self.best_params is None:
            return self.best_params, self.train_loader, self.val_loader
        return self.best_params, self.best_train_loader, self.best_val_loader

In [None]:
# parmFinder = ParmFinder(domain)
# parms, train_loader, val_loader = parmFinder.find()
# print(f"Best params: {parms}")

# main

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# selected = [20, 53, 54]

# df = pd.read_csv("train data.csv")

# statistics = []

# for i in selected:
#     parms = df.iloc[i].to_dict()
#     parms['selected'] = i
#     print(f"Selected params: {parms}")
#     goDataset = GoDataset("data/train/dan_train.csv", parms["data_len"])
#     train_len = int(0.8 * len(goDataset))
#     val_len = len(goDataset) - train_len
#     train_dataset, val_dataset = torch.utils.data.random_split(
#         goDataset, [train_len, val_len]
#     )
#     train_loader = DataLoader(
#         train_dataset,
#         batch_size=int(parms["batch_size"]),
#         shuffle=True,
#         pin_memory=True,
#     )
#     val_loader = DataLoader(
#         val_dataset, batch_size=int(parms["batch_size"]), shuffle=False, pin_memory=True
#     )
#     trainer = Trainer(parms, train_loader, val_loader)
#     statistic = trainer.train()
#     statistics.append(statistic)


#     torch.cuda.empty_cache()
#     gc.collect()

# Hand-picked configuration for a single training run; the whole dict is
# passed to Trainer below.
config = {
    "input_dim": 19 * 19 * 2,
    "num_heads": 2,
    "ffn_dim": 128,
    "num_layers": 1,
    "depthwise_conv_kernel_size": 3,
    "dropout": 0.1,
    "use_group_norm": False,
    "convolution_first": False,
    "lr": 0.0001,
    # presumably the locations of the generator / discriminator checkpoints;
    # verify against Trainer
    "gen_path": "data/models/gen",
    "dis_path": "data/models/dis",
    # Use the GPU when one is available, otherwise fall back to the CPU.
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # "device": torch.device("cpu"),
    "batch_size": 2048,
    "clip_value": 1,
    # Passed to GoDataset as its `length` argument.
    "data_len": 1,
    "epochs": 10,
    # The training loop breaks once the trainer's early-stop counter reaches
    # this value.
    "early_stop": 200,
    # NOTE(review): how Trainer uses this key is not visible here -- confirm.
    "selected": 0
}

# Build the dataset and split it 80% / 20% into training and validation parts.
goDataset = GoDataset("data/train/dan_train.csv", config["data_len"])
train_len = int(0.8 * len(goDataset))
val_len = len(goDataset) - train_len
train_dataset, val_dataset = torch.utils.data.random_split(
    goDataset, [train_len, val_len]
)
# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=int(config["batch_size"]),
    shuffle=True,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset, batch_size=int(config["batch_size"]), shuffle=False, pin_memory=True
)
# `statistic` is consumed by the plotting cell as a mapping of
# metric name -> series of values.
trainer = Trainer(config, train_loader, val_loader)
statistic = trainer.train()


In [None]:

import os

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("plot", exist_ok=True)

# Save one PNG per recorded metric series under plot/<metric name>.png.
for key, value in statistic.items():
    # Metric series may hold torch tensors (possibly on the GPU), which
    # matplotlib cannot draw directly; move those to CPU-side numpy values.
    series = [
        item.detach().cpu().numpy() if torch.is_tensor(item) else item
        for item in value
    ]

    # One explicit figure per metric, closed after saving so figures do not
    # accumulate in memory.
    fig, ax = plt.subplots()
    ax.plot(series, label=key)
    ax.legend()
    fig.savefig(f"plot/{key}.png")
    plt.close(fig)

In [None]:
# format: statistics = [
# {
#     "G_losses": G_losses,
#     "G_val_losses": G_val_losses,
#     "G_accs": G_accs,
#     "D_losses": D_losses,
#     "D_val_losses": D_val_losses,
#     "D_accs": D_accs,
# } for i in selected]

# plot G_val_losses and val_loss_ratios
# for i, statistic in enumerate(statistics):
#     G_accs = [acc.to("cpu") for acc in statistic["G_accs"]]
#     plt.plot(statistic["G_val_losses"], label=f"val_loss_{selected[i]}")
#     plt.plot(np.array(statistic["G_losses"])/np.array(statistic["G_val_losses"]), label=f"val_loss_ratio_{selected[i]}")
#     plt.plot(G_accs, label=f"val_acc_{selected[i]}")
# plt.xlabel("Epoch")
# plt.ylabel("Value")
# plt.legend()
# plt.savefig("result.png")
# # plt.show()

In [None]:
%reset -f