
# Deep Neural Network Experiments with PyTorch

This notebook demonstrates three different neural network architectures using PyTorch. We will generate a synthetic classification dataset and train three models of increasing complexity:

1. **Simple neural network** – a single hidden layer.
2. **Moderate network** – multiple hidden layers and dropout for regularization.
3. **Advanced network** – deeper architecture with more neurons and batch normalization.


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Load the same dataset used in the provided Machine_Learning notebook
# This file should exist relative to this notebook.
DATA_PATH = 'data/matches_ml_prematch_clean.csv'

# Read the CSV into a DataFrame
# If the file is missing, this will raise an exception. Ensure the dataset is present.
df = pd.read_csv(DATA_PATH)
print(f'Loaded dataset: {df.shape}')

# Define the target and features. The target column is named 'label'.
# Drop any identifier columns such as 'match_id' if present.
y = df['label'].astype(int)
X = df.drop(columns=['label'])
if 'match_id' in X.columns:
    X = X.drop(columns=['match_id'])

# Identify categorical columns (dtype == object or category) and encode them using one‑hot encoding.
cat_cols = X.select_dtypes(include=['object', 'category']).columns
if len(cat_cols) > 0:
    X = pd.get_dummies(X, columns=cat_cols)

# Standardize numerical features for better neural network training
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Create Dataset and DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

print(f'Training samples: {len(train_dataset)}, Test samples: {len(test_dataset)}')

# Store input dimension for model summary
INPUT_DIM = X_train_tensor.shape[1]

Loaded dataset: (42703, 15)
Training samples: 34162, Test samples: 8541


In [2]:

# Install torchinfo for model summaries (if not already installed)
try:
    from torchinfo import summary
except ImportError:
    import sys
    !{sys.executable} -m pip install torchinfo --quiet
    from torchinfo import summary


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


def train_model(model, train_loader, criterion, optimizer, num_epochs=20):
    model.train()
    model.to(device)  # make sure model is on GPU
    for epoch in range(num_epochs):
        total_loss = 0.0
        for inputs, labels in train_loader:
            # move batch to GPU
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {total_loss/len(train_loader):.4f}")
    return model


def evaluate_model(model, test_loader, criterion):
    model.eval()
    model.to(device)
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    print(f"Test Loss: {total_loss/len(test_loader):.4f} | Accuracy: {accuracy:.4f}")
    return total_loss / len(test_loader), accuracy


Using device: cuda



## 1. Simple Neural Network

Our baseline model consists of a single hidden layer with 64 neurons. It uses ReLU activation and is trained with cross‑entropy loss.


In [10]:
class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Instantiate model
simple_model = SimpleNN(input_dim=INPUT_DIM, hidden_dim=64, output_dim=2).to(device)


# Show model summary
print(summary(simple_model, input_size=(1, INPUT_DIM)))
# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(simple_model.parameters(), lr=0.001)



# Train the model
print("Training Simple Neural Network...")
simple_model = train_model(simple_model, train_loader, criterion, optimizer, num_epochs=20)

# Evaluate the model
simple_test_loss, simple_test_acc = evaluate_model(simple_model, test_loader, criterion)

print(f"Simple NN Test Loss: {simple_test_loss:.4f}, Test Accuracy: {simple_test_acc:.4f}")


Layer (type:depth-idx)                   Output Shape              Param #
SimpleNN                                 [1, 2]                    --
├─Linear: 1-1                            [1, 64]                   1,728
├─ReLU: 1-2                              [1, 64]                   --
├─Linear: 1-3                            [1, 2]                    130
Total params: 1,858
Trainable params: 1,858
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.01
Estimated Total Size (MB): 0.01
Training Simple Neural Network...
Epoch [1/20] - Loss: 0.2163
Epoch [2/20] - Loss: 0.1766
Epoch [3/20] - Loss: 0.1725
Epoch [4/20] - Loss: 0.1707
Epoch [5/20] - Loss: 0.1693
Epoch [6/20] - Loss: 0.1682
Epoch [7/20] - Loss: 0.1670
Epoch [8/20] - Loss: 0.1666
Epoch [9/20] - Loss: 0.1663
Epoch [10/20] - Loss: 0.1656
Epoch [11/20] - Loss: 0.1652
Epoch [12/20] - Loss: 0.1648
Epoch [13/20] - Loss: 0.1642
Epoch [14/20] - 


## 2. Moderate Neural Network

This model increases complexity by stacking three hidden layers (128→64→32 neurons) and applying dropout for regularization. Such architectures can capture more complex relationships in the data while mitigating overfitting.


In [13]:

class ModerateNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ModerateNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_dim)
        )

    def forward(self, x):
        return self.net(x)

# Instantiate model
moderate_model = ModerateNN(input_dim=INPUT_DIM, output_dim=2).to(device)

# Show model summary
print(summary(moderate_model, input_size=(1, INPUT_DIM)))
# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(moderate_model.parameters(), lr=0.0005)

# Train the model
print("Training Moderate Neural Network...")
moderate_model = train_model(moderate_model, train_loader, criterion, optimizer, num_epochs=25)

# Evaluate the model
moderate_test_loss, moderate_test_acc = evaluate_model(moderate_model, test_loader, criterion)

print(f"Moderate NN Test Loss: {moderate_test_loss:.4f}, Test Accuracy: {moderate_test_acc:.4f}")


Layer (type:depth-idx)                   Output Shape              Param #
ModerateNN                               [1, 2]                    --
├─Sequential: 1-1                        [1, 2]                    --
│    └─Linear: 2-1                       [1, 128]                  3,456
│    └─ReLU: 2-2                         [1, 128]                  --
│    └─Dropout: 2-3                      [1, 128]                  --
│    └─Linear: 2-4                       [1, 64]                   8,256
│    └─ReLU: 2-5                         [1, 64]                   --
│    └─Dropout: 2-6                      [1, 64]                   --
│    └─Linear: 2-7                       [1, 32]                   2,080
│    └─ReLU: 2-8                         [1, 32]                   --
│    └─Linear: 2-9                       [1, 2]                    66
Total params: 13,858
Trainable params: 13,858
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.01
Input size (MB): 0.00
Forward/backwa


## 3. Advanced Deep Neural Network

The final model pushes complexity further with five hidden layers (256→128→64→64→32 neurons), batch normalization, and dropout. Deeper networks can model highly non‑linear decision boundaries and are more expressive, though they may require careful tuning.


In [14]:

class AdvancedNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(AdvancedNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_dim)
        )

    def forward(self, x):
        return self.net(x)

# Instantiate model
advanced_model = AdvancedNN(input_dim=INPUT_DIM, output_dim=2).to(device)

# Show model summary
print(summary(advanced_model, input_size=(1, INPUT_DIM)))
# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(advanced_model.parameters(), lr=0.0005)

# Train the model
print("Training Advanced Neural Network...")
advanced_model = train_model(advanced_model, train_loader, criterion, optimizer, num_epochs=30)

# Evaluate the model
advanced_test_loss, advanced_test_acc = evaluate_model(advanced_model, test_loader, criterion)

print(f"Advanced NN Test Loss: {advanced_test_loss:.4f}, Test Accuracy: {advanced_test_acc:.4f}")


Layer (type:depth-idx)                   Output Shape              Param #
AdvancedNN                               [1, 2]                    --
├─Sequential: 1-1                        [1, 2]                    --
│    └─Linear: 2-1                       [1, 256]                  6,912
│    └─BatchNorm1d: 2-2                  [1, 256]                  512
│    └─ReLU: 2-3                         [1, 256]                  --
│    └─Dropout: 2-4                      [1, 256]                  --
│    └─Linear: 2-5                       [1, 128]                  32,896
│    └─BatchNorm1d: 2-6                  [1, 128]                  256
│    └─ReLU: 2-7                         [1, 128]                  --
│    └─Dropout: 2-8                      [1, 128]                  --
│    └─Linear: 2-9                       [1, 64]                   8,256
│    └─BatchNorm1d: 2-10                 [1, 64]                   128
│    └─ReLU: 2-11                        [1, 64]                   --
│ 


## Results Summary

After training the three architectures, we compare their test performance:

| Model            | Test Loss | Test Accuracy |
|------------------|----------:|--------------:|
| **Simple NN**    | {{simple_loss}} | {{simple_acc}} |
| **Moderate NN**  | {{moderate_loss}} | {{moderate_acc}} |
| **Advanced NN**  | {{advanced_test_loss}} | {{advanced_acc}} |

Even though the advanced model is deeper and uses normalization and dropout, its performance may not always dramatically surpass simpler networks on small or synthetic datasets. When selecting a model, consider the complexity of your problem, available computational resources, and the risk of overfitting. 


In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# -------- Residual MLP Blocks --------

class ResidualBlock(nn.Module):
    """
    PreNorm Residual block for tabular MLPs:
      x -> LN/BN -> Linear(H,H) -> GELU -> Dropout -> Linear(H,H) -> Dropout -> + x
    """
    def __init__(self, hidden_dim: int, dropout: float = 0.2, norm: str = "layernorm"):
        super().__init__()
        if norm.lower() == "batchnorm":
            self.norm1 = nn.BatchNorm1d(hidden_dim)
            self.norm2 = nn.BatchNorm1d(hidden_dim)
        else:
            self.norm1 = nn.LayerNorm(hidden_dim)
            self.norm2 = nn.LayerNorm(hidden_dim)

        self.fc1 = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # Pre-norm residual (works well for deep nets)
        if isinstance(self.norm1, nn.BatchNorm1d):
            y = self.norm1(x)  # BN expects (N, C)
        else:
            y = self.norm1(x)
        y = self.fc1(y)
        y = self.act(y)
        y = self.drop(y)
        if isinstance(self.norm2, nn.BatchNorm1d):
            y = self.norm2(y)
        else:
            y = self.norm2(y)
        y = self.fc2(y)
        y = self.drop(y)
        return x + y


class ExtremelyDeepResidualMLP(nn.Module):
    """
    Extremely deep residual MLP for tabular data.
    Uses: Stem -> [ResidualBlock x N] -> Head
    Target: up to ~20M parameters (controlled via hidden_dim & num_blocks).
    """
    def __init__(
        self,
        input_dim: int,
        output_dim: int = 2,
        hidden_dim: int = 1024,     # 1024 is a good sweet spot for ~20M
        num_blocks: int = 9,        # 9 blocks @1024 ≈ ~18.9M params total
        dropout: float = 0.2,
        norm: str = "layernorm"     # "layernorm" or "batchnorm"
    ):
        super().__init__()
        self.stem = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=True),
            nn.GELU(),
            nn.Dropout(dropout)
        )

        blocks = [ResidualBlock(hidden_dim, dropout=dropout, norm=norm) for _ in range(num_blocks)]
        self.blocks = nn.Sequential(*blocks)

        self.head = nn.Sequential(
            nn.LayerNorm(hidden_dim) if norm.lower() != "batchnorm" else nn.Identity(),
            nn.Linear(hidden_dim, output_dim, bias=True)
        )

        # Good practice for stability with deep/wide MLPs
        self.apply(self._init_weights)

    @staticmethod
    def _init_weights(m):
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)  # small init helps deep nets
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.stem(x)
        x = self.blocks(x)
        x = self.head(x)
        return x


# -------- Helper: parameter count --------
def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# -------- Instantiate model (GPU ready) --------
# Assumes you already defined:
#   INPUT_DIM = X_train.shape[1]
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

deep_res_model = ExtremelyDeepResidualMLP(
    input_dim=INPUT_DIM,
    output_dim=2,         # change if needed
    hidden_dim=1024,      # tune this with num_blocks to control param count
    num_blocks=9,         # 9 @ 1024 ≈ ~18.9M params
    dropout=0.2,
    norm="layernorm"      # try "batchnorm" as well; LN is often simpler for tabular
).to(device)

print("Trainable parameters:", f"{count_parameters(deep_res_model):,}")

# Optional: detailed summary (requires torchinfo)
try:
    from torchinfo import summary
    summary(deep_res_model, input_size=(1, INPUT_DIM))
except Exception as e:
    print("torchinfo summary unavailable:", e)

# Example: training setup (reuse your loaders & training/eval funcs)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(deep_res_model.parameters(), lr=1e-3, weight_decay=1e-4)

print("Training Extremely Deep Residual MLP...")
deep_res_model = train_model(deep_res_model, train_loader, criterion, optimizer, num_epochs=20)

print("Evaluating...")
deep_test_loss, deep_test_acc = evaluate_model(deep_res_model, test_loader, criterion)
print(f"Deep Residual MLP — Test Loss: {deep_test_loss:.4f}, Accuracy: {deep_test_acc:.4f}")


Trainable parameters: 18,961,410
Training Extremely Deep Residual MLP...
Epoch [1/20] - Loss: 0.2339
Epoch [2/20] - Loss: 0.2013
Epoch [3/20] - Loss: 0.2066
Epoch [4/20] - Loss: 0.1926
Epoch [5/20] - Loss: 0.1834
Epoch [6/20] - Loss: 0.1770
Epoch [7/20] - Loss: 0.1759
Epoch [8/20] - Loss: 0.1746
Epoch [9/20] - Loss: 0.1734
Epoch [10/20] - Loss: 0.1819
Epoch [11/20] - Loss: 0.1924
Epoch [12/20] - Loss: 0.1826
Epoch [13/20] - Loss: 0.1744
Epoch [14/20] - Loss: 0.1711
Epoch [15/20] - Loss: 0.1693
Epoch [16/20] - Loss: 0.1691
Epoch [17/20] - Loss: 0.1688
Epoch [18/20] - Loss: 0.1683
Epoch [19/20] - Loss: 0.1674
Epoch [20/20] - Loss: 0.1665
Evaluating...
Test Loss: 0.1739 | Accuracy: 0.9173
Deep Residual MLP — Test Loss: 0.1739, Accuracy: 0.9173
