### **Question Answering Project Using a Fine-Tuned LLM**

In [17]:
''' Import all import Libraries '''
import ast
import os
import warnings
from typing import Any

import pandas as pd
import pytorch_lightning
import torch
import torch.nn as nn
from databricks.sdk import DatabaseAPI
from pytorch_lightning.utilities.types import STEP_OUTPUT
from sympy.sets.sets import set_function
from tinycss2 import tokenizer
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [18]:
# Select the compute device: CUDA when torch reports it as available, else CPU.
# NOTE: only CUDA is probed here; no other accelerator backend is considered.
Device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Device: {Device}')

Device: cpu


In [19]:
# Dataset location. Set the QNA_DATA_DIR environment variable to point at the
# folder holding the CSV files; when it is unset, the original local
# directory is used so existing runs keep working.
Root_dir = os.environ.get(
    'QNA_DATA_DIR',
    '/Users/mahadiur/Desktop/Bongodev MLops Projects/Question Answering Using Fine-Tune LLM Model/Data',
)

# Paths of the test and train splits inside the data directory.
test_path = os.path.join(Root_dir, 'qna_test.csv')
train_path = os.path.join(Root_dir, 'qna_train.csv')

In [20]:
# Load both CSV splits with the pure-Python parser; rows the parser cannot
# read are skipped (on_bad_lines='skip'), so the frames may hold fewer rows
# than the files have lines.
train_dataset = pd.read_csv(
    train_path,
    encoding='Iso-8859-1',
    on_bad_lines='skip',
    engine='python',
)
test_data = pd.read_csv(
    test_path,
    encoding='ISO-8859-1',
    on_bad_lines='skip',
    engine='python',
)

# Preview the first rows of the training split.
train_dataset.head()

Unnamed: 0,context,question,answers
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


### **Download Hugging Face Model & Tokenizer**

In [21]:

# Checkpoint identifier shared by the tokenizer and the model.
Fine_Tune_Model_name = 'deepset/roberta-base-squad2'

# Load the tokenizer and the question-answering model from that checkpoint.
Tokenizer = AutoTokenizer.from_pretrained(Fine_Tune_Model_name)
Fine_Tune_Model = AutoModelForQuestionAnswering.from_pretrained(Fine_Tune_Model_name)

### **Explore Dataset**

In [22]:
# Column names of the training frame.
train_dataset.columns

Index(['context', 'question', 'answers'], dtype='object')

In [23]:
# Inspect the context passage of one training example (row position `idx`).
idx = 0
context = train_dataset['context'].iloc[idx]
print(context)

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


In [24]:
# Inspect the question belonging to the same example.
question = train_dataset['question'].iloc[idx]
print(question)

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


In [25]:
# The 'answers' cell is the string form of a dict literal. Parse it with
# ast.literal_eval, which only accepts Python literals, instead of eval,
# which would execute whatever code the CSV cell contains.
answers = ast.literal_eval(train_dataset.iloc[idx]['answers'])
print(answers['text'])
print(answers['answer_start'])

['Saint Bernadette Soubirous']
[515]


### **Data (QnA Part 1)**

In [26]:
# Dataset class
class QnADataset(Dataset):
    """Extractive question-answering dataset over (context, question, answers) rows.

    Each item is the tokenized (context, question) pair together with the token
    indices of the first and last token of the first listed answer.

    Args:
        dataset: Row container with ``'context'``, ``'question'`` and
            ``'answers'`` fields. Objects exposing ``.iloc`` (e.g. a pandas
            DataFrame) are indexed by position; anything else is indexed with
            ``dataset[idx]``. ``'answers'`` is either a mapping holding
            ``'text'`` and ``'answer_start'`` lists or the string form of such
            a mapping.
        tokenizer: Object whose ``encode_plus`` returns ``'input_ids'``,
            ``'attention_mask'`` and ``'offset_mapping'`` tensors. If the
            returned object also has a ``sequence_ids`` method, it is used to
            restrict the answer search to context tokens.
        max_length: Sequence length every example is padded/truncated to.
    """

    def __init__(self, dataset, tokenizer, max_length=512):
        super().__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    @staticmethod
    def _parse_answers(raw_answers):
        """Return the answers mapping, parsing it first when it is a string."""
        if isinstance(raw_answers, str):
            # literal_eval only accepts Python literals, unlike eval, which
            # would execute arbitrary code stored in the data file.
            return ast.literal_eval(raw_answers)
        return raw_answers

    @staticmethod
    def _locate_answer_span(offsets, sequence_ids, answer_start, answer_end):
        """Map the character span [answer_start, answer_end) to token indices.

        Returns ``(first_token, last_token)``, or ``(0, 0)`` when the span is
        not completely covered by the context tokens (e.g. it was truncated).
        """
        first_token = None
        last_token = None
        last_token_end = 0
        previous_start = 0
        for i, (start_char, end_char) in enumerate(offsets):
            if sequence_ids is not None:
                # Only tokens of the first sequence (the context) carry
                # offsets that are comparable with answer_start.
                if sequence_ids[i] != 0:
                    continue
            elif start_char < previous_start:
                # Without sequence ids: offsets moving backwards mean the
                # context is over (separator, question or padding follows).
                break
            if start_char == end_char:
                # Empty span: special or padding token.
                continue
            previous_start = start_char
            if start_char >= answer_end:
                break
            if first_token is None and end_char > answer_start:
                first_token = i
            last_token = i
            last_token_end = end_char

        if first_token is None or last_token is None or last_token_end < answer_end:
            return 0, 0
        return first_token, last_token

    def __getitem__(self, idx):
        """Build the model inputs and answer-span labels for example ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (1-D tensors of
            length ``max_length``) and ``'start_positions'`` /
            ``'end_positions'`` (long tensors of shape ``(1,)``). Both
            positions are 0 when the row lists no answer, when the answer is
            not fully inside the tokenized context, or when tokenization
            failed.
        """
        # dataset[idx] on a DataFrame is a column lookup, so rows of
        # DataFrame-like objects must be fetched by position instead.
        if hasattr(self.dataset, 'iloc'):
            row = self.dataset.iloc[idx]
        else:
            row = self.dataset[idx]

        context = row['context']
        question = row['question']
        answers = self._parse_answers(row['answers'])
        has_answer = len(answers['text']) > 0 and len(answers['answer_start']) > 0

        try:
            tokens = self.tokenizer.encode_plus(
                context,
                question,
                add_special_tokens=True,
                max_length=self.max_length,
                truncation=True,
                padding="max_length",
                return_tensors='pt',
                return_offsets_mapping=True,
            )
        except Exception as e:
            # Best effort: keep the batch shape intact with an all-zero
            # placeholder, but report the failure instead of hiding it.
            warnings.warn(
                f'Tokenization failed for example {idx}: {e!r}; '
                'using an all-zero placeholder.'
            )
            tokens = {
                'input_ids': torch.zeros(self.max_length, dtype=torch.long),
                'attention_mask': torch.zeros(self.max_length, dtype=torch.long),
                'offset_mapping': torch.zeros((self.max_length, 2), dtype=torch.long),
            }

        # Drop the batch dimension added by return_tensors='pt'.
        input_ids = tokens['input_ids'].reshape(-1)
        attention_mask = tokens['attention_mask'].reshape(-1)
        offsets = tokens['offset_mapping'].reshape(-1, 2).tolist()

        # The placeholder dict has no sequence_ids method.
        get_sequence_ids = getattr(tokens, 'sequence_ids', None)
        sequence_ids = get_sequence_ids(0) if callable(get_sequence_ids) else None

        start_position, end_position = 0, 0
        if has_answer:
            answer_start = int(answers['answer_start'][0])
            answer_end = answer_start + len(answers['text'][0])
            start_position, end_position = self._locate_answer_span(
                offsets, sequence_ids, answer_start, answer_end
            )

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'start_positions': torch.tensor([start_position], dtype=torch.long),
            'end_positions': torch.tensor([end_position], dtype=torch.long),
        }



In [27]:
# Data Module Class
class QnADataModule(pytorch_lightning.LightningDataModule):
    """Skeleton Lightning data module for the QnA data.

    Every hook below is still a placeholder that does nothing and returns
    ``None``.
    """

    def __init__(self):
        super().__init__()

    def setup(self, stage=None):
        """Placeholder: not implemented yet."""
        return None

    def train_dataloader(self):
        """Placeholder: not implemented yet, returns ``None``."""
        return None

    def test_dataloader(self):
        """Placeholder: not implemented yet, returns ``None``."""
        return None

    def collate_fn(self, batch):
        """Placeholder: not implemented yet, ``batch`` is ignored."""
        return None


In [28]:
# Instantiate the data module (its constructor takes no arguments).
Data_Module = QnADataModule()

### **Model (QnA Part 2)**

In [29]:
# Model
class QnAModel(pytorch_lightning.LightningModule):
    """Skeleton Lightning module for the QnA model.

    Every method below is still a placeholder that does nothing and returns
    ``None``.
    """

    def __init__(self):
        super().__init__()

    def forward(self):
        """Placeholder: not implemented yet (takes no inputs so far)."""
        return None

    def compute_loss(self, batch):
        """Placeholder: not implemented yet, ``batch`` is ignored."""
        return None

    def training_step(self, batch, batch_idx):
        """Placeholder: not implemented yet, returns ``None``."""
        return None

    def test_step(self, batch, batch_idx):
        """Placeholder: not implemented yet, returns ``None``."""
        return None

    def configure_optimizers(self):
        """Placeholder: not implemented yet, returns ``None``."""
        return None


In [30]:
# Instantiate the Lightning module (its constructor takes no arguments).
Model = QnAModel()

### **Training (QnA Part 3)**

In [31]:
# Lightning trainer capped at 10 epochs; every other setting is left at the
# library default.
Training = pytorch_lightning.Trainer(max_epochs=10)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
