<a href="https://colab.research.google.com/github/Narendra1817/Narendra1718/blob/main/Untitled7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch numpy scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.cluster import KMeans
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

class GroupAttention(nn.Module):
    def __init__(self, d_model, n_heads, max_groups=32):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.max_groups = max_groups

        # Projection matrices
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def group_softmax(self, scores, group_counts):
        max_scores = scores.max(dim=-1, keepdim=True)[0]
        exp_scores = torch.exp(scores - max_scores)
        weighted_exp = exp_scores * group_counts.unsqueeze(-2)
        sum_exp = weighted_exp.sum(dim=-1, keepdim=True)
        return exp_scores / sum_exp

    def forward(self, x):
        batch_size, seq_len, _ = x.shape

        # Project to Q, K, V
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        # Reshape for multi-head attention
        Q = Q.view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        # Compute attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))

        # Initialize attention weights
        attn_weights = torch.zeros_like(scores)

        for head_idx in range(self.n_heads):
            # Get keys for this head
            k_head = K[:, head_idx]

            # Flatten across batch dimension for clustering
            k_flattened = k_head.reshape(-1, self.head_dim).detach().cpu().numpy()

            # Skip clustering if not enough samples
            if k_flattened.shape[0] < 2:
                attn_weights[:, head_idx] = F.softmax(scores[:, head_idx], dim=-1)
                continue

            # Determine number of groups
            n_groups = min(self.max_groups, k_flattened.shape[0])

            # Perform clustering
            kmeans = KMeans(n_clusters=n_groups, n_init=10)
            try:
                group_ids = kmeans.fit_predict(k_flattened)
                group_ids = torch.tensor(group_ids, device=device).reshape(batch_size, seq_len)

                # Compute group counts
                unique, counts = torch.unique(group_ids, return_counts=True)
                group_counts = torch.zeros(n_groups, device=device)
                group_counts[unique] = counts.float()

                # Compute group attention
                head_scores = scores[:, head_idx]
                attn_weights[:, head_idx] = self.group_softmax(head_scores, group_counts)
            except:
                # Fallback to standard softmax if clustering fails
                attn_weights[:, head_idx] = F.softmax(scores[:, head_idx], dim=-1)

        # Compute weighted sum of values
        output = torch.matmul(attn_weights, V)

        # Reshape and project back
        output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        output = self.W_o(output)

        return output

# Rest of the implementation remains similar but simplified
class RITAEncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, max_groups):
        super().__init__()
        self.attention = GroupAttention(d_model, n_heads, max_groups)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        attn_out = self.attention(x)
        x = self.norm1(x + attn_out)
        ffn_out = self.ffn(x)
        x = self.norm2(x + ffn_out)
        return x

class TimeAwareConv(nn.Module):
    def __init__(self, in_channels, d_model, kernel_size=5):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, d_model, kernel_size, padding=kernel_size//2)

    def forward(self, x):
        x = self.conv(x)
        x = x.transpose(1, 2)
        return x

class RITA(nn.Module):
    def __init__(self, in_channels, d_model=64, n_heads=2, n_layers=2, max_groups=32):
        super().__init__()
        self.conv = TimeAwareConv(in_channels, d_model)
        self.layers = nn.ModuleList([
            RITAEncoderLayer(d_model, n_heads, max_groups)
            for _ in range(n_layers)
        ])
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))

    def forward(self, x):
        x = self.conv(x)
        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        for layer in self.layers:
            x = layer(x)
        return x[:, 0]  # Return CLS token representation

# Synthetic dataset with proper dimensions
class SyntheticDataset(Dataset):
    def __init__(self, num_samples=1000, seq_len=200, num_channels=3, num_classes=5):
        self.data = torch.randn(num_samples, num_channels, seq_len)
        self.labels = torch.randint(0, num_classes, (num_samples,))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create dataset and dataloader with matching dimensions
dataset = SyntheticDataset(seq_len=200, num_channels=3)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize model with matching dimensions
model = RITA(in_channels=3, d_model=64, n_heads=2, n_layers=2, max_groups=32).to(device)
classifier = nn.Linear(64, 5).to(device)

# Training setup
optimizer = torch.optim.AdamW(list(model.parameters()) + list(classifier.parameters()), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(5):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        features = model(batch_x)
        logits = classifier(features)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(logits, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

    print(f'Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}, Accuracy = {100*correct/total:.2f}%')

Using device: cuda
Epoch 1: Loss = 1.6359, Accuracy = 19.60%
Epoch 2: Loss = 1.6125, Accuracy = 22.50%
Epoch 3: Loss = 1.6116, Accuracy = 20.40%
Epoch 4: Loss = 1.6122, Accuracy = 18.90%
Epoch 5: Loss = 1.6122, Accuracy = 19.40%
