In [None]:
!pip install --upgrade h5py
!pip install --upgrade typing-extensions
!pip install --upgrade wheel

In [None]:
!pip install --upgrade transformers torch tqdm

In [None]:
import sys
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForCausalLM
from torch.optim import Adam
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

In [None]:
# Make the Kaggle input directory importable so the `train` module imported
# below can be resolved from it (presumably train.py lives there - confirm).
module_dir = '/kaggle/input/morya-92m'
if module_dir not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(module_dir)
print(sys.path)

from train import ChatData

In [None]:
# Load the pretrained GPT-2 tokenizer and language-model head. The original
# author noted that these two `from_pretrained` calls are where a connection
# issue shows up.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Use the current end-of-sequence token as the padding token. This runs before
# the eos token is overridden by the special-token registration below.
tokenizer.pad_token = tokenizer.eos_token

# Register the conversation markers as special tokens.
special_tokens_dict = dict(
    bos_token='<startofstring>',
    eos_token='<endofstring>',
    additional_special_tokens=['<bot> :'],
)
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

# Resize the embedding matrix to the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Build the conversation dataset from the JSON file using the tokenizer
# configured above, then print its first item as a quick sanity check.
dataset_path = '/kaggle/input/morya-92m/custom_conversation_dataset.json'
chatData = ChatData(dataset_path, tokenizer)
sample_data = chatData[0]
print(sample_data)

In [None]:
# Wrap the dataset in a shuffling loader that yields mini-batches of two items.
data_loader = DataLoader(
    dataset=chatData,
    batch_size=2,
    shuffle=True,
)

In [None]:
from torch.optim import AdamW

In [None]:
# AdamW over all model parameters, plus a step scheduler that multiplies the
# learning rate by 0.9 each time `scheduler.step()` is called.
optimizer = AdamW(params=model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=1,
    gamma=0.9,
)

In [None]:
from torch.cuda.amp import autocast, GradScaler

In [None]:
!pip install --upgrade torch torchvision torchaudio --user

In [None]:
# Install PyTorch with CUDA support
!pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 --user -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
import torch
# Print, one per line: the torch version, whether CUDA is usable, the CUDA
# version torch reports, and the cuDNN version.
for env_value in (
    torch.__version__,
    torch.cuda.is_available(),
    torch.version.cuda,
    torch.backends.cudnn.version(),
):
    print(env_value)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU, then
# move the model onto the chosen device.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Using GPU:', torch.cuda.get_device_name(0))  # Print GPU name
else:
    device = torch.device('cpu')
    print('CUDA is not available. Using CPU.')
model = model.to(device)

In [None]:
# Correct device setup: choose the device again and report which one is used.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print('Using GPU:', torch.cuda.get_device_name(0))  # Print GPU name
else:
    print('CUDA is not available. Using CPU.')

# This replaces whatever optimizer object existed before with a fresh Adam.
optimizer = Adam(model.parameters(), lr=5e-5)

# A scheduler holds a reference to the optimizer it was constructed with, so
# one built around the previous optimizer would not affect this new one.
# Rebuild it so that `scheduler` always controls the optimizer in use.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

In [None]:
import os

# Configure the CUDA caching allocator. Options use the `<option>:<value>`
# form (colon, not `=`), with multiple options separated by commas.
# NOTE(review): the allocator presumably reads this when it initialises, and
# the model has already been moved to the device by this point - confirm
# whether this cell needs to run before the first CUDA allocation.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

In [None]:
# Report GPU memory in MB. These are process-wide allocator counters, not
# per-object sizes: `memory_allocated` is memory currently occupied by tensors,
# `memory_reserved` is memory held by the caching allocator.
print(f"CUDA memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
print(f"CUDA memory reserved: {torch.cuda.memory_reserved() / 1024 ** 2:.2f} MB")

In [None]:
# Ask the CUDA caching allocator to release cached blocks it is not using.
# Memory held by live tensors is not freed by this call.
torch.cuda.empty_cache()

In [None]:
# Report GPU memory in MB again for comparison. These are process-wide
# allocator counters, not per-object sizes: `memory_allocated` is memory
# currently occupied by tensors, `memory_reserved` is memory held by the
# caching allocator.
print(f"CUDA memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
print(f"CUDA memory reserved: {torch.cuda.memory_reserved() / 1024 ** 2:.2f} MB")

In [None]:
# `memory_summary` returns a multi-line string. Print it so the table renders
# with real line breaks instead of as a quoted repr with escaped "\n".
print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
# Fine-tuning loop: mixed precision (on CUDA) with gradient accumulation.
#
# Each batch is expected to be a sequence whose first two items are the
# input ids and the attention mask (that is how the loop indexes it).

# Autocast and loss scaling are only enabled when training on a CUDA device.
use_amp = device.type == 'cuda'
scaler = GradScaler(enabled=use_amp)

accumulation_steps = 4  # Accumulate gradients over 4 steps
num_epochs = 6
num_batches = len(data_loader)

# Run exactly `num_epochs` epochs so the "Epoch i/num_epochs" label is accurate.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(enumerate(data_loader), total=num_batches, desc=f'Epoch {epoch+1}/{num_epochs}')

    # Start every epoch from clean gradients. Inside the epoch, gradients are
    # cleared only after an optimizer step so that they can accumulate.
    optimizer.zero_grad()

    for batch_idx, batch in progress_bar:
        input_ids, attention_mask = batch[0].to(device), batch[1].to(device)  # Move batch to the device

        # Language-modelling targets are the inputs themselves. Positions the
        # attention mask marks as padding get -100 so the loss ignores them.
        labels = input_ids.detach().clone()
        labels[attention_mask == 0] = -100

        # Use autocast for mixed precision training
        with autocast(enabled=use_amp):
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Divide by the accumulation length so the summed gradient matches the
        # average over the accumulated batches, then backpropagate the scaled loss.
        scaler.scale(loss / accumulation_steps).backward()

        # Step after every `accumulation_steps` batches, and also on the last
        # batch of the epoch so trailing gradients are not dropped.
        is_last_batch = (batch_idx + 1) == num_batches
        if (batch_idx + 1) % accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        current_loss = loss.item()
        total_loss += current_loss

        # Print progress
        if (batch_idx + 1) % 100 == 0:  # Adjust print frequency
            print(f'Epoch: {epoch+1}, Batch: {batch_idx+1}/{len(data_loader)}, Loss: {current_loss}')

        # No per-batch torch.cuda.empty_cache(): the per-batch tensors are
        # rebound on the next iteration, and flushing the allocator cache on
        # every step only slows training down.

    # Guard against an empty loader to avoid dividing by zero.
    avg_loss = total_loss / max(num_batches, 1)
    print(f'Epoch: {epoch+1}, Average Loss: {avg_loss}')

In [None]:
import os

# NOTE(review): it is not evident from this notebook that transformers reads
# this variable - confirm it has the intended effect on the saved format.
os.environ["TRANSFORMERS_USE_SAFETENSORS"] = "0"

# Write the fine-tuned model and the extended tokenizer to the same directory.
output_dir = '/kaggle/working'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)