In [1]:
import torch
import torch.nn as nn
# Select the compute device first, then report whether CUDA was picked up.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("CUDA available: ", device.type == "cuda")

CUDA available:  True


In [2]:
# Notebook-wide environment setup: warning filter, working directory, and plotting theme.
import importlib
import warnings
# Silence every warning category for the rest of the session. Because this runs
# before the imports below, warnings raised while importing them are hidden too.
# NOTE(review): a blanket "ignore" also hides deprecation warnings (e.g. from torch);
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

import os
# Show the kernel's working directory; the relative data path used later
# ('./DataSet/...') is resolved against it.
current_directory = os.getcwd()
print(current_directory)

import time
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from jupyterthemes import jtplot
# Apply the "monokai" jupyterthemes plotting style (with ticks and grid enabled).
# Presumably this restyles all later matplotlib figures; confirm against jupyterthemes docs.
jtplot.style(theme="monokai", context="notebook", ticks=True, grid=True)
import seaborn as sns

%load_ext autoreload
%autoreload 2

/home/lucy/Documents/MachineLearning/xLSTM


In [30]:
# Load the cleaned tweets (first CSV column becomes the index) and keep only
# the first 500 rows so that experiments stay small.
cleaned_tweets_df = pd.read_csv('./DataSet/cleaned_tweets.csv', index_col=0).iloc[:500]

In [31]:
import os
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import GPT2Tokenizer
from tqdm import tqdm
import torch.cuda.amp as amp
import matplotlib.pyplot as plt

# Define your dataset class
class TweetDataset(Dataset):
    """Map-style dataset that yields ``(input_ids, label)`` pairs for labelled texts.

    Samples whose text is missing (``None`` or a float NaN, which is what pandas
    produces for empty CSV cells) are dropped *together with their label*, so
    texts and labels always stay aligned.

    Args:
        texts: Sequence of raw texts; each kept entry is converted with ``str()``.
        labels: Sequence of integer class labels, one per entry of ``texts``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_length: Every sample is padded/truncated to exactly this many tokens.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )

        # Filter texts and labels in lock-step: dropping a text without dropping
        # its label would shift every following label onto the wrong sample.
        kept = [
            (str(text), label)
            for text, label in zip(texts, labels)
            if not self._is_missing(text)
        ]
        self.texts = [text for text, _ in kept]
        self.labels = [label for _, label in kept]
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _is_missing(text):
        """Return True for ``None`` and float NaN (NaN is the only value unequal to itself)."""
        return text is None or (isinstance(text, float) and text != text)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return ``(input_ids, label)`` as tensors."""
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop only the leading batch axis added by return_tensors='pt'; a bare
        # squeeze() would also collapse the sequence axis when max_length == 1.
        input_ids = encoded['input_ids'].squeeze(0)
        return input_ids, torch.tensor(self.labels[idx], dtype=torch.long)


# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    # No padding token is set on the loaded tokenizer: reuse EOS so that
    # padding='max_length' in the dataset has a token to pad with.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Texts and labels for the whole (train + validation) dataset.
train_texts = cleaned_tweets_df['tweet'].tolist()
train_labels = cleaned_tweets_df['label'].tolist()


# Create the dataset
dataset = TweetDataset(train_texts, train_labels, tokenizer)

# Split the dataset into train (80%) and validation (20%) sets.
# The split is seeded so that re-running this cell yields the same partition;
# an unseeded split would move samples between train and validation on every run.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create the DataLoaders
batch_size = 10  # Adjust based on your GPU memory
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [32]:
%load_ext autoreload
%autoreload 2

import sys
import os
import shutil
sys.path.append("..")

# Clean the torch extensions cache
extension_cache = "/home/lucy/.cache/torch_extensions/"
if os.path.exists(extension_cache):
    shutil.rmtree(extension_cache)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Use the full tokenizer length: `tokenizer.vocab_size` only counts the base
# vocabulary and ignores tokens added afterwards, whereas the model's embedding
# table must cover every id the tokenizer can emit.
vocab_size = len(tokenizer)
vocab_size

50257

In [46]:
from omegaconf import OmegaConf
from dacite import from_dict
from dacite import Config as DaciteConfig
from xlstm.xlstm import xLSTMLMModel, xLSTMLMModelConfig

import gc

# Architecture of the xLSTM language model. `vocab_size` and `context_length`
# below are defaults only; both are overridden after parsing so that they
# cannot drift from the tokenizer and the dataset.
xlstm_cfg = """ 
vocab_size: 50257
mlstm_block:
  mlstm:
    conv1d_kernel_size: 2
    qkv_proj_blocksize: 2
    num_heads: 2
slstm_block:
  slstm:
    backend: vanilla
    num_heads: 2
    conv1d_kernel_size: 2
    bias_init: powerlaw_blockdependent
  feedforward:
    proj_factor: 1.3
    act_fn: gelu
context_length: 256
num_blocks: 2
embedding_dim: 128
slstm_at: [1]
"""
cfg = OmegaConf.create(xlstm_cfg)
# Derive the data-dependent sizes instead of duplicating them as literals:
# the embedding must cover every token id, and every sample is padded or
# truncated to `dataset.max_length` tokens.
cfg.vocab_size = len(tokenizer)
cfg.context_length = dataset.max_length
cfg = from_dict(data_class=xLSTMLMModelConfig, data=OmegaConf.to_container(cfg), config=DaciteConfig(strict=True))

# Best-effort release of GPU memory left over from earlier runs of this cell
# before allocating a fresh model. This only frees tensors that are no longer
# referenced; if CUDA still reports out-of-memory here, restart the kernel.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

model = xLSTMLMModel(cfg).to(device)

# Optional smoke test (uncomment to run). Expected: one forward pass yields
# logits shaped (batch, context_length, vocab_size).
# x = torch.randint(0, cfg.vocab_size, size=(4, cfg.context_length), device=device)
# with torch.no_grad():
#     y = model(x)
# assert y.shape[1:] == (cfg.context_length, cfg.vocab_size)


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 2.94 GiB of which 1.62 MiB is free. Process 35729 has 395.38 MiB memory in use. Including non-PyTorch memory, this process has 2.55 GiB memory in use. Of the allocated memory 2.25 GiB is allocated by PyTorch, and 231.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [47]:
# Training setup
NUM_EPOCHS = 2
# The optimizer steps once every `accumulation_steps` mini-batches, i.e. the
# effective batch size is accumulation_steps * (DataLoader batch size).
accumulation_steps = 4
# Mixed precision is only enabled on CUDA; on CPU autocast and the scaler are no-ops.
use_amp = device.type == "cuda"

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, betas=(0.9, 0.99))
# ignore_index=-1 is carried over from the original loss call: labels equal to -1 are skipped.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# Per-batch training and validation losses (un-normalised, one entry per mini-batch)
train_losses = []
val_losses = []


def sequence_classification_loss(logits, input_ids, labels):
    """Cross-entropy between each sequence's last-real-token logits and its label.

    NOTE(review): assumes `logits` is (batch, seq_len, vocab_size) and `labels`
    holds one class id per sequence — confirm against the model and the dataset.
    The language-model head is reused as the classifier, so class ids index the
    first rows of the vocabulary dimension; a dedicated classification head
    would be cheaper and cleaner.

    Args:
        logits: Model output for the batch.
        input_ids: Token ids fed to the model, used to locate the padding.
        labels: Class id per sequence.

    Returns:
        Scalar loss tensor averaged over the batch.
    """
    # Count the non-padding tokens per sequence to find the last real position.
    # Assumes right-padding and no pad/EOS token inside the text — TODO confirm.
    lengths = (input_ids != tokenizer.pad_token_id).sum(dim=1).clamp(min=1)
    rows = torch.arange(logits.size(0), device=logits.device)
    last_logits = logits[rows, lengths - 1]  # (batch, vocab_size)
    return criterion(last_logits.float(), labels.view(-1))


for epoch in range(NUM_EPOCHS):
    model.train()
    # Start every epoch from clean gradients so nothing leaks in from a
    # previous (possibly interrupted) run of this cell.
    optimizer.zero_grad(set_to_none=True)
    epoch_loss = 0.0
    num_train_batches = len(train_dataloader)

    with tqdm(train_dataloader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch + 1}")
        for i, (input_ids, labels) in enumerate(tepoch):
            # Move batch tensors to the same device as the model
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            with torch.autocast(device_type=device.type, enabled=use_amp):
                logits = model(input_ids)
                loss = sequence_classification_loss(logits, input_ids, labels)

            # Scale the loss down so the accumulated gradient is the mean over
            # the accumulation window rather than the sum.
            scaler.scale(loss / accumulation_steps).backward()

            # Step at the end of every window, and also on the final batch so
            # that a trailing partial window is not silently discarded.
            is_window_end = (i + 1) % accumulation_steps == 0
            is_last_batch = (i + 1) == num_train_batches
            if is_window_end or is_last_batch:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)

            # Report the true per-batch loss (not the down-scaled one).
            batch_loss = loss.item()
            epoch_loss += batch_loss
            train_losses.append(batch_loss)
            tepoch.set_postfix(loss=batch_loss)

    avg_epoch_loss = epoch_loss / max(num_train_batches, 1)
    print(f"Epoch {epoch + 1} Average Training Loss: {avg_epoch_loss:.4f}")

    # Validation loop: same forward call and same loss as training, without gradients.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for input_ids, labels in val_dataloader:
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            with torch.autocast(device_type=device.type, enabled=use_amp):
                loss = sequence_classification_loss(model(input_ids), input_ids, labels)
            val_loss += loss.item()
            val_losses.append(loss.item())

    avg_val_loss = val_loss / max(len(val_dataloader), 1)
    print(f"Epoch {epoch + 1} Average Validation Loss: {avg_val_loss:.4f}")

Epoch 1:   0%|          | 0/40 [00:00<?, ?batch/s]




OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 2.94 GiB of which 1.62 MiB is free. Process 35729 has 395.38 MiB memory in use. Including non-PyTorch memory, this process has 2.55 GiB memory in use. Of the allocated memory 2.26 GiB is allocated by PyTorch, and 228.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)