In [1]:
# Mount Google Drive at /content/drive; the pickled datasets read by later
# cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [44]:
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [45]:
# Read the pickled training set and pull out its inputs ('X') and targets ('y').
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('/content/drive/MyDrive/train_1500.pkl', 'rb') as train_file:
    train_data = pickle.load(train_file)
train_features, train_labels = train_data['X'], train_data['y']

In [46]:
from sklearn.preprocessing import StandardScaler

# Learn the standardisation statistics from the training features, then apply
# them to those same features.
scaler = StandardScaler()
scaler.fit(train_features)
train_features_scaled = scaler.transform(train_features)

In [47]:
# Convert to tensors: float32 features with a singleton axis inserted at dim 1
# (the channel axis expected by Conv1d), and int64 class labels.
train_features_tensor = torch.unsqueeze(
    torch.tensor(train_features_scaled, dtype=torch.float32),
    dim=1,
)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)

print(train_features_tensor.shape)
print(train_labels_tensor.shape)

torch.Size([28102, 1, 1500])
torch.Size([28102])


In [88]:
class CNN_LSTM_Model(nn.Module):
    """1-D CNN feature extractor followed by an LSTM and a linear classifier.

    Five Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(2) -> Dropout stages feed a
    single-layer LSTM; the LSTM's final hidden state is projected to
    ``num_classes`` logits.

    Args:
        num_classes: Number of logits produced per sample.
    """

    def __init__(self, num_classes=4):
        super().__init__()

        # Stage 1: 1 -> 64 channels, kernel 7 (padding keeps the length unchanged)
        self.conv1 = nn.Conv1d(1, 64, kernel_size=7, padding=3)
        self.bn1 = nn.BatchNorm1d(64)
        self.pool1 = nn.MaxPool1d(2)

        # Stage 2: 64 -> 128 channels, kernel 5
        self.conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(2)

        # Stage 3: 128 -> 164 channels, kernel 3
        self.conv3 = nn.Conv1d(128, 164, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(164)
        self.pool3 = nn.MaxPool1d(2)

        # Stage 4: 164 -> 200 channels, kernel 3
        self.conv4 = nn.Conv1d(164, 200, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm1d(200)
        self.pool4 = nn.MaxPool1d(2)

        # Stage 5: 200 -> 200 channels, kernel 3
        self.conv5 = nn.Conv1d(200, 200, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm1d(200)
        self.pool5 = nn.MaxPool1d(2)

        # Sequence model over the pooled feature maps
        self.lstm = nn.LSTM(input_size=200, hidden_size=200, batch_first=True)

        # Classification head
        self.fc = nn.Linear(200, num_classes)

        # One dropout module shared by every convolutional stage
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return class logits for a batch shaped (batch, 1, length)."""
        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
            (self.conv4, self.bn4, self.pool4),
            (self.conv5, self.bn5, self.pool5),
        )
        for conv, norm, pool in stages:
            x = self.dropout(pool(torch.relu(norm(conv(x)))))

        # (batch, channels, steps) -> (batch, steps, channels) for the batch_first LSTM
        sequence = x.permute(0, 2, 1)
        _, (last_hidden, _) = self.lstm(sequence)

        # last_hidden[-1] is the final hidden state of the (only) LSTM layer
        return self.fc(last_hidden[-1])

In [89]:
from torch.utils.data import DataLoader, random_split
from torch.utils.data import Dataset as TorchDataset


class Dataset(TorchDataset):
    """Map-style dataset pairing each feature row with its label.

    The class keeps the name ``Dataset`` because other cells construct and
    subclass it under that name; the PyTorch base class is imported under an
    alias so the two are not confused.

    Args:
        features: Indexable collection of model inputs.
        labels: Indexable collection of targets, one per entry of ``features``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        # A length mismatch would otherwise surface later as an IndexError or
        # as silently ignored rows, because __len__ only looks at the labels.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Number of (feature, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with 'input_ids' and 'labels'."""
        return {
            'input_ids': self.features[idx],
            'labels': self.labels[idx]
        }

In [92]:
# Training configuration.
batch_size = 32
# Use the GPU when the runtime has one; otherwise fall back to the CPU instead
# of failing on the first `.to(device)` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_epochs = 30
weight_decay = 1e-6
learning_rate = 1e-4

In [93]:
# 80/20 train/validation split.
dataset = Dataset(train_features_tensor, train_labels_tensor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the same samples land in the validation set on every run;
# without it the reported validation metrics are not comparable across runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Shuffle only the training batches; validation order is irrelevant to the metrics.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [94]:
from sklearn.metrics import f1_score
from torch.nn.utils import clip_grad_norm_
from torch.optim.lr_scheduler import OneCycleLR

# Ceiling for the gradient norm; the training loop skips clipping when this is <= 0.
grad_clip = 1.0

# Network, objective and optimiser.
model = CNN_LSTM_Model()
model = model.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-6, weight_decay=weight_decay
)

In [95]:
# Train for `num_epochs`; after every epoch, score the model on the validation split.
for epoch in range(num_epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    train_loss_sum = 0
    train_preds, train_targets = [], []

    for batch in train_loader:
        inputs, labels = batch["input_ids"].to(device), batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        loss = loss_function(logits, labels)
        loss.backward()

        # Optional gradient-norm clipping before the parameter update.
        if grad_clip > 0:
            clip_grad_norm_(model.parameters(), max_norm=grad_clip)
        optimizer.step()

        train_loss_sum += loss.item()
        train_preds.extend(logits.argmax(dim=-1).cpu().numpy())
        train_targets.extend(labels.cpu().numpy())

    # Mean per-batch loss and micro-averaged F1 over the whole epoch.
    train_loss = train_loss_sum / len(train_loader)
    train_f1 = f1_score(train_targets, train_preds, average='micro')

    # ---- evaluation pass over the validation batches (no gradients) ----
    model.eval()
    val_loss_sum = 0
    val_preds, val_targets = [], []

    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)

            logits = model(inputs)
            val_loss_sum += loss_function(logits, labels).item()
            val_preds.extend(logits.argmax(dim=-1).cpu().numpy())
            val_targets.extend(labels.cpu().numpy())

    val_loss = val_loss_sum / len(val_loader)
    val_f1 = f1_score(val_targets, val_preds, average='micro')

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train F1: {train_f1:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}")

Epoch 1/30, Train Loss: 0.9319, Train F1: 0.5921, Val Loss: 0.8877, Val F1: 0.6038
Epoch 2/30, Train Loss: 0.7927, Train F1: 0.6484, Val Loss: 0.7751, Val F1: 0.6590
Epoch 3/30, Train Loss: 0.7418, Train F1: 0.6756, Val Loss: 0.7541, Val F1: 0.6848
Epoch 4/30, Train Loss: 0.6947, Train F1: 0.7062, Val Loss: 0.6876, Val F1: 0.7109
Epoch 5/30, Train Loss: 0.6512, Train F1: 0.7283, Val Loss: 0.6563, Val F1: 0.7244
Epoch 6/30, Train Loss: 0.6193, Train F1: 0.7422, Val Loss: 0.6182, Val F1: 0.7479
Epoch 7/30, Train Loss: 0.5957, Train F1: 0.7590, Val Loss: 0.6292, Val F1: 0.7433
Epoch 8/30, Train Loss: 0.5795, Train F1: 0.7655, Val Loss: 0.5961, Val F1: 0.7600
Epoch 9/30, Train Loss: 0.5661, Train F1: 0.7709, Val Loss: 0.5865, Val F1: 0.7632
Epoch 10/30, Train Loss: 0.5512, Train F1: 0.7787, Val Loss: 0.5686, Val F1: 0.7737
Epoch 11/30, Train Loss: 0.5384, Train F1: 0.7849, Val Loss: 0.5869, Val F1: 0.7629
Epoch 12/30, Train Loss: 0.5289, Train F1: 0.7887, Val Loss: 0.5710, Val F1: 0.7682
E

In [96]:
# Read the pickled test set: inputs under 'X', per-row ids under 'idx'.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('/content/drive/MyDrive/test_1500.pkl', 'rb') as test_file:
    test_data = pickle.load(test_file)
test_features, test_idx = test_data['X'], test_data['idx']

In [102]:
from sklearn.preprocessing import StandardScaler

# Standardise the test features with the scaler that was fitted on the
# training data. Fitting a fresh scaler on the test set would normalise the
# inputs with different statistics than the ones the model was trained on.
test_features_scaled = scaler.transform(test_features)

In [103]:
# Same layout as the training tensors: float32 features with a singleton axis
# at dim 1, plus the int64 id of every row.
test_features_tensor = torch.unsqueeze(
    torch.tensor(test_features_scaled, dtype=torch.float32),
    dim=1,
)
test_idx_tensor = torch.tensor(test_idx, dtype=torch.long)

In [110]:
# Sanity check: both tensors must agree on the number of rows.
print(test_features_tensor.shape)
print(test_idx_tensor.shape)

torch.Size([18634, 1, 1500])
torch.Size([18634])


In [115]:
class TestDataset(Dataset):
    """Map-style dataset pairing each test feature row with its id.

    Args:
        features: Indexable collection of model inputs.
        indix: Indexable collection of ids, one per entry of ``features``.

    Raises:
        ValueError: If ``features`` and ``indix`` differ in length.
    """

    def __init__(self, features, indix):
        # __len__ only looks at the ids, so catch a mismatch up front.
        if len(features) != len(indix):
            raise ValueError(
                f"features and indix must have the same length, "
                f"got {len(features)} and {len(indix)}"
            )
        self.features = features
        self.indix = indix

    def __len__(self):
        """Number of test rows."""
        return len(self.indix)

    def __getitem__(self, idx):
        """Return the row at ``idx`` as a dict with 'input_ids' and 'indices'."""
        return {
            'input_ids': self.features[idx],
            'indices': self.indix[idx]
        }

# Bound to its own name so the raw `test_data` dict loaded from the pickle is
# not overwritten (rebinding it made the earlier load/scale cells fail on re-run).
test_dataset = TestDataset(test_features_tensor, test_idx_tensor)
# Keep the original row order so predictions line up with the ids.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [122]:
from collections import defaultdict

model.eval()

# Maps each test id to the list of class-probability vectors predicted for
# the rows that carry that id.
results = defaultdict(list)

with torch.no_grad():
    for batch in test_loader:
        inputs = batch['input_ids'].to(device)

        # Softmax over the class axis, then a single device-to-host copy for
        # the whole batch instead of one copy per sample.
        batch_probs = torch.softmax(model(inputs), dim=1).cpu().numpy()
        batch_ids = batch['indices'].tolist()

        for sample_id, sample_probs in zip(batch_ids, batch_probs):
            results[sample_id].append(sample_probs)

# Average the probability vectors that share an id and keep the most likely class.
final_labels = {}
for idx, probs in results.items():
    avg_prob = np.mean(probs, axis=0)
    final_labels[idx] = np.argmax(avg_prob)

In [129]:
# Gather the predicted labels (in dict insertion order) into one array and report its shape.
final_labels_array = np.array([label for label in final_labels.values()])
print(final_labels_array.shape)

(3411,)


In [131]:
import pandas as pd

# Write every prediction next to the test id it was computed for. Numbering the
# rows 0..N-1 only matches the real ids when they happen to be contiguous,
# zero-based and first seen in ascending order; using the dict keys is correct
# in every case and gives the same file when that assumption does hold.
# NOTE(review): confirm the expected submission format uses the original ids.
ordered_ids = sorted(final_labels)
indices = np.array(ordered_ids)
final_labels_df = pd.DataFrame({
    'id': indices,
    'y': np.array([final_labels[sample_id] for sample_id in ordered_ids])
})
final_labels_df.to_csv('/content/drive/MyDrive/pred.csv', index=False)