In [None]:
!pip install fitz
!pip install pymupdf
!pip install transformers

In [None]:
import fitz  # PyMuPDF
import re
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import torch
from torch.cuda.amp import GradScaler, autocast

In [None]:
def extract_pdf_data(pdf_path):
    """Read a PDF and return its full text together with its case citation.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A dict with two keys:
            "Title": the first substring of the text matching the pattern
                ``[YYYY] SGHC N``, or the string "Title not found" when the
                pattern does not occur.
            "Text": the text of all pages concatenated in page order.
    """
    # The context manager closes the document even if text extraction raises,
    # so file handles are not leaked when many PDFs are processed in a row.
    with fitz.open(pdf_path) as doc:
        # A single join avoids rebuilding the string once per page.
        text = "".join(page.get_text() for page in doc)

    # Extract Title
    title_match = re.search(r"\[\d{4}\] SGHC \d+", text)
    title = title_match.group(0) if title_match else "Title not found"

    return {
        "Title": title,
        "Text": text
    }

def process_multiple_pdfs(pdf_folder, output_csv='extracted_data.csv'):
    """Extract title and text from every PDF in a folder into a DataFrame.

    Args:
        pdf_folder: Directory whose direct entries are scanned for PDF files.
        output_csv: Path the resulting table is written to as CSV (without the
            index). Pass ``None`` to skip writing. Defaults to
            'extracted_data.csv', the location used before this parameter
            existed.

    Returns:
        A pandas DataFrame with the columns "Title" and "Text", one row per
        PDF. The columns are present even when the folder contains no PDFs,
        so downstream column access does not fail on an empty folder.
    """
    all_data = [
        extract_pdf_data(os.path.join(pdf_folder, filename))
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the CSV) reproducible across runs and machines.
        for filename in sorted(os.listdir(pdf_folder))
        # Accept ".PDF" / ".Pdf" as well as ".pdf".
        if filename.lower().endswith(".pdf")
    ]

    # Explicit columns guarantee the schema even for an empty record list.
    df = pd.DataFrame(all_data, columns=["Title", "Text"])

    # Save the extracted data to a CSV file
    if output_csv is not None:
        df.to_csv(output_csv, index=False)
    return df

In [None]:
# Run the extraction over the input folder; the helper also writes the table to CSV
pdf_folder = "pdfs"
df = process_multiple_pdfs(pdf_folder)

# Pull the two columns out as plain Python lists for the dataset builder
titles, texts = (df[column].tolist() for column in ('Title', 'Text'))

In [None]:
# Define custom dataset
class TextDataset(Dataset):
    """Fixed-length, overlapping token chunks of each text, tagged with its title.

    Every text is tokenized in full and cut into windows of at most
    ``max_length`` tokens; consecutive windows share ``overlap_length`` tokens.
    Items are padded to ``max_length`` when they are returned.

    Args:
        titles: Iterable of titles, paired positionally with ``texts``.
        texts: Iterable of strings to tokenize.
        tokenizer: Callable returning a mapping with an 'input_ids' entry for
            a single string (e.g. a Hugging Face tokenizer).
        max_length: Number of tokens in every returned item.
        overlap_length: Number of tokens shared by consecutive chunks; must be
            smaller than ``max_length``.

    Raises:
        ValueError: If ``max_length`` is not positive or ``overlap_length`` is
            not in ``[0, max_length)`` (the window would never advance).
    """

    # Filler id for padded input positions. Those positions are masked out, so
    # the value itself is never attended to.
    PAD_TOKEN_ID = 0
    # Label value that the language-model loss skips.
    IGNORE_INDEX = -100

    def __init__(self, titles, texts, tokenizer, max_length, overlap_length):
        if max_length <= 0:
            raise ValueError("max_length must be positive")
        if not 0 <= overlap_length < max_length:
            raise ValueError("overlap_length must satisfy 0 <= overlap_length < max_length")

        self.examples = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.overlap_length = overlap_length

        stride = max_length - overlap_length

        # Preprocess each text to split into overlapping chunks and associate with titles
        for title, text in zip(titles, texts):
            # No truncation here: truncating would cut the document at the
            # tokenizer's maximum length before it is chunked, silently
            # dropping the rest of long documents. The tokenizer may log a
            # length warning, which is harmless because only windows of
            # max_length tokens ever reach the model.
            token_ids = tokenizer(text, truncation=False)['input_ids']
            # reshape(-1) yields a 1-D tensor for any token count, including
            # zero or one token.
            tokens = torch.as_tensor(token_ids, dtype=torch.long).reshape(-1)

            for start in range(0, len(tokens), stride):
                end = min(start + max_length, len(tokens))
                self.examples.append((title, tokens[start:end]))
                if end == len(tokens):
                    # The text is fully covered; a further window would hold
                    # only tokens already present in this one.
                    break

    def __len__(self):
        """Return the total number of chunks over all texts."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return chunk ``idx`` padded to ``max_length``.

        Returns:
            A dict with 'title', 'input_ids', 'attention_mask' (1 for real
            tokens, 0 for padding) and 'labels' (a copy of the input ids with
            padded positions set to ``IGNORE_INDEX``).
        """
        title, chunk = self.examples[idx]
        real_length = chunk.size(0)
        padding_length = self.max_length - real_length

        # The mask is derived from the chunk length, not from token values:
        # id 0 is a regular vocabulary entry in GPT-2, so comparing against 0
        # would hide genuine tokens from the model.
        attention_mask = torch.ones(self.max_length, dtype=torch.long)
        labels = chunk.clone()

        # Padding if necessary
        if padding_length > 0:
            chunk = torch.cat([
                chunk,
                torch.full((padding_length,), self.PAD_TOKEN_ID, dtype=torch.long),
            ])
            # Padded positions must not contribute to the training loss.
            labels = torch.cat([
                labels,
                torch.full((padding_length,), self.IGNORE_INDEX, dtype=torch.long),
            ])
            attention_mask[real_length:] = 0

        return {
            'title': title,
            'input_ids': chunk,
            'attention_mask': attention_mask,
            'labels': labels
        }


In [None]:
# Load the tokenizer, then build the chunked dataset from the extracted texts
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
max_length, overlap_length = 512, 50  # tokens per chunk, tokens shared by neighbouring chunks
dataset = TextDataset(
    titles=titles,
    texts=texts,
    tokenizer=tokenizer,
    max_length=max_length,
    overlap_length=overlap_length,
)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Ask PyTorch to release any cached CUDA memory before the model is loaded
torch.cuda.empty_cache()

In [None]:
# Load the tokenizer and the language-model weights for the same checkpoint
model_name = 'gpt2-medium'  # or 'gpt2' for the smaller model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [None]:
# Place the model's parameters on the selected device (GPU if available, else CPU)
model.to(device=device)

In [None]:
# Initialize the optimizer
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation. eps and weight_decay are passed explicitly to keep
# the values the transformers implementation used by default, so the update
# rule matches what this notebook trained with before.
# NOTE(review): the `AdamW` name in the `from transformers import ...` line is
# no longer used by this cell and can be dropped from that import.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# Gradient scaler used by the mixed-precision training loop
scaler = GradScaler()

In [None]:
# Hyperparameters for the mixed-precision training loop:
# epochs, batches accumulated per optimizer step, and samples per batch
num_epochs, accumulation_steps, batch_size = 3, 8, 1

# Shuffled loader over the chunked dataset
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Fine-tune with mixed precision and gradient accumulation: gradients from
# `accumulation_steps` consecutive batches are summed before each optimizer step.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}")
    model.train()
    epoch_loss = 0.0
    num_batches = len(dataloader)
    optimizer.zero_grad()

    for i, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            batch_loss = outputs.loss

        # Only the value sent to backward() is divided, so the accumulated
        # gradient is the mean over the window while the logged loss stays
        # on the true per-batch scale.
        scaler.scale(batch_loss / accumulation_steps).backward()

        # Also step on the final batch: otherwise the gradients of a trailing
        # partial window are thrown away by the zero_grad() that starts the
        # next epoch (or never applied after the last epoch).
        is_last_batch = (i + 1) == num_batches
        if (i + 1) % accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        epoch_loss += batch_loss.item()

        # Clear up memory after each batch
        del input_ids, attention_mask, labels, outputs
        torch.cuda.empty_cache()

    # Guard against an empty dataloader, which would otherwise divide by zero.
    average_loss = epoch_loss / num_batches if num_batches else float('nan')
    print(f"Epoch {epoch + 1} completed with average loss: {average_loss}")