In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-qna-dataset-final/finetune-dataset.csv
/kaggle/input/documents-dataset/documents_dataset.csv


In [2]:
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, TrainingArguments, Trainer
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch
if torch.cuda.is_available():
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("No GPU available, using the CPU instead.")



2024-05-04 20:32:18.871358: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-04 20:32:18.871462: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-04 20:32:19.002231: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using GPU: Tesla P100-PCIE-16GB


In [3]:

# Initialize the tokenizer with the Roberta model, specify the use of the fast tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('deepset/roberta-base-squad2')

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [4]:
class QADataset(Dataset):
    def __init__(self, encodings, answers):
        self.encodings = encodings
        self.answers = answers

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['start_positions'] = torch.tensor(self.answers['start_positions'][idx])
        item['end_positions'] = torch.tensor(self.answers['end_positions'][idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [5]:
def prepare_data(dataframe):
    contexts = dataframe['Context'].tolist()
    questions = dataframe['Question'].tolist()
    answers = dataframe['Answer'].tolist()

    encodings = tokenizer(contexts, questions, truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    start_positions = []
    end_positions = []

    for i, (context, answer) in enumerate(zip(contexts, answers)):
        start_position = context.find(answer)
        start_token = encodings.char_to_token(i, start_position) if start_position != -1 else 0
        end_token = encodings.char_to_token(i, start_position + len(answer) - 1) if start_position != -1 else 0

        start_positions.append(start_token if start_token is not None else 0)
        end_positions.append(end_token if end_token is not None else start_token)

    answer_dict = {'start_positions': torch.tensor(start_positions), 'end_positions': torch.tensor(end_positions)}
    return encodings, answer_dict

In [6]:
def train_and_save_model(filepath, save_directory):
    df = pd.read_csv(filepath)
    df = df.sample(frac=0.2)
    train_df, val_df = train_test_split(df, test_size=0.1)

    train_encodings, train_answers = prepare_data(train_df)
    val_encodings, val_answers = prepare_data(val_df)

    train_dataset = QADataset(train_encodings, train_answers)
    val_dataset = QADataset(val_encodings, val_answers)

    model = RobertaForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')
    training_args = TrainingArguments(
        output_dir=save_directory,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        save_strategy="epoch"  # Save the model at the end of each epoch
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()

    # Save the model and the tokenizer
    model_path = f"{save_directory}/model"
    tokenizer.save_pretrained(model_path)
    model.save_pretrained(model_path)
    print(f"Model saved to {model_path}")


In [7]:
train_and_save_model('/kaggle/input/documents-dataset/documents_dataset.csv', './trained_model')


model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Model saved to ./trained_model/model


In [8]:
# Set the path to your model directory
model_dir = "/kaggle/working/trained_model/model"

# Load the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained(model_dir)

# Load the model
model = RobertaForQuestionAnswering.from_pretrained(model_dir)


In [9]:
def answer_question(model, tokenizer, question, context):
    # Prepare the device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Encode the question and context so that they are ready to be fed to the model
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the correct device

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted start and end positions of the answer in the context
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Convert token indices to the actual answer string
    answer_tokens = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_tokens))

    return answer

# Example usage
context = "Delhi is capital of India"
question = "what is India capital?"
answer = answer_question(model, tokenizer, question, context)
print("Answer:", answer)

Answer: Delhi


In [10]:
%cd /kaggle/working

/kaggle/working


In [11]:
ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m[01;34mtrained_model[0m/


In [12]:
cd trained_model/

/kaggle/working/trained_model


In [13]:
ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m[01;34mmodel[0m/


In [14]:
from IPython.display import FileLinks
FileLinks(r'model/')

In [15]:
!zip -r file.zip model/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: model/ (stored 0%)
  adding: model/merges.txt (deflated 53%)
  adding: model/config.json (deflated 49%)
  adding: model/tokenizer_config.json (deflated 75%)
  adding: model/model.safetensors (deflated 8%)
  adding: model/special_tokens_map.json (deflated 85%)
  adding: model/vocab.json (deflated 59%)
  adding: model/tokenizer.json (deflated 72%)


In [16]:
ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


file.zip  [0m[01;34mmodel[0m/
