In [1]:
!pip install datasets evaluate rouge_score accelerate



In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, GPTQConfig, StoppingCriteria, StoppingCriteriaList, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import nltk
from random import randint
import json

In [3]:
!pip -q kaggle

ERROR: unknown command "kaggle"


In [4]:
from google.colab import files
files.upload()

!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [5]:
# Download the stanfordu/stanford-question-answering-dataset archive with the Kaggle CLI.
# NOTE(review): the unzip cell reads the archive from /content, so this presumably
# runs with /content as the working directory (Colab default) — confirm.
!kaggle datasets download -d stanfordu/stanford-question-answering-dataset

Dataset URL: https://www.kaggle.com/datasets/stanfordu/stanford-question-answering-dataset
License(s): CC-BY-SA-4.0
stanford-question-answering-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
import zipfile
# The context manager closes the archive handle even if extraction raises,
# which the previous explicit open/close sequence did not guarantee.
with zipfile.ZipFile('/content/stanford-question-answering-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [7]:
import os
import json

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DefaultDataCollator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

In [8]:
import warnings
# Suppress every Python warning for the rest of the session (no category or
# module filter is given, so nothing is exempt).
warnings.filterwarnings("ignore")

In [9]:
# Read the raw SQuAD v1.1 files. The encoding is given explicitly because the
# JSON is UTF-8 and contains non-ASCII text; relying on the locale default can
# fail or mis-decode on platforms where that default is not UTF-8.
with open('dev-v1.1.json', 'r', encoding='utf-8') as _f:
    dev = json.load(_f)

with open('train-v1.1.json', 'r', encoding='utf-8') as _f:
    train = json.load(_f)

In [10]:
def createDataset(_data):
    """Flatten a nested SQuAD-style dict into a `Dataset` with one row per answer.

    Walks `_data['data']` -> `paragraphs` -> `qas` -> `answers` and emits a row
    for every answer, so a question with several answers appears several times
    and a question with no answers is dropped.

    Args:
        _data: Parsed SQuAD JSON (a dict with a top-level 'data' list).

    Returns:
        `Dataset.from_dict` result with columns 'context', 'question' and
        'answers', where each 'answers' entry is a dict holding a single
        'text' and its 'answer_start'.
    """
    def _iter_rows():
        # Yield (context, question, answer) triples in document order.
        for article in _data['data']:
            for paragraph in article['paragraphs']:
                passage = paragraph['context']
                for qa in paragraph['qas']:
                    prompt = qa['question']
                    for ans in qa['answers']:
                        yield passage, prompt, {'text': ans['text'], 'answer_start': ans['answer_start']}

    columns = {'context': [], 'question': [], 'answers': []}
    for passage, prompt, span in _iter_rows():
        columns['context'].append(passage)
        columns['question'].append(prompt)
        columns['answers'].append(span)

    return Dataset.from_dict(columns)

In [11]:
# Flatten the training file and hold out 20% of its rows for validation; the
# fixed seed makes the split repeatable. TRAIN ends up with "train" and "test" splits.
TRAIN = createDataset(train).train_test_split(test_size=0.2, seed=4444)

# The dev file is kept whole as the final test set.
TEST = createDataset(dev)

In [12]:
# Display the split object to check the split names, columns and row counts.
TRAIN

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 70079
    })
    test: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 17520
    })
})

In [13]:
# Single source of truth for the checkpoint, so the tokenizer and the model are
# always loaded from the same id.
MODEL_CHECKPOINT = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# The question-answering head on top of the base checkpoint is newly initialised
# (see the load warning), so the model must be fine-tuned before use.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)
data_collator = DefaultDataCollator()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer token positions.

    Uses the module-level `tokenizer`. Questions are stripped of surrounding
    whitespace; only the context (second sequence) is truncated, and every
    example is padded to the fixed maximum length.

    Args:
        examples: Batch mapping with "question" and "context" (lists of str)
            and "answers" (list of dicts, each with one "text" string and its
            "answer_start" character offset into the context).

    Returns:
        The tokenizer output without "offset_mapping", extended with
        "start_positions" and "end_positions" (one int per example). Examples
        whose answer is not entirely inside the tokenized context are labelled
        (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=500,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to derive labels; they are not returned.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # The answer must lie entirely inside the context tokens. Comparing the
        # context start against start_char and the context end against end_char
        # also rejects answers that are cut by truncation; the previous test let
        # those through and produced an end position one past the context
        # (the trailing special token).
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token whose start offset is <= start_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token whose end offset is >= end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [15]:
# Tokenize both datasets and drop the raw text columns. Each call names the
# columns of the dataset it maps (via the "train" split for the split object)
# instead of borrowing them from the other dataset.
train_tokenized = TRAIN.map(preprocess_function, batched=True, remove_columns=TRAIN["train"].column_names)
test_tokenized = TEST.map(preprocess_function, batched=True, remove_columns=TEST.column_names)

Map:   0%|          | 0/70079 [00:00<?, ? examples/s]

Map:   0%|          | 0/17520 [00:00<?, ? examples/s]

Map:   0%|          | 0/34726 [00:00<?, ? examples/s]

In [16]:
# Hyper-parameters and checkpoint policy for the fine-tuning run.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Output location and reporting
    output_dir="my_model",
    overwrite_output_dir=True,
    push_to_hub=False,
    report_to="none",
    # Optimisation
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the lowest eval_loss when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [17]:
# Train on the "train" split and validate on the held-out "test" split of the
# tokenized training data; the tokenized dev set is not passed in here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized["train"],
    eval_dataset=train_tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [18]:
# Start fine-tuning; training_args requests evaluation and a checkpoint after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.4087,1.241505
2,1.1803,1.181343


TrainOutput(global_step=8760, training_loss=1.5161229660521904, metrics={'train_runtime': 7111.7384, 'train_samples_per_second': 19.708, 'train_steps_per_second': 1.232, 'total_flos': 1.7882887885044e+16, 'train_loss': 1.5161229660521904, 'epoch': 2.0})

In [19]:
# Write the trainer's model to working/final_model (relative to the working directory).
# NOTE(review): with load_best_model_at_end=True this is presumably the best
# checkpoint by eval_loss rather than the last step — confirm.
trainer.save_model('working/final_model')

In [20]:
# Evaluate on the tokenized dev set instead of the validation split given to the Trainer.
trainer.evaluate(eval_dataset=test_tokenized)

{'eval_loss': 1.2982655763626099,
 'eval_runtime': 554.8896,
 'eval_samples_per_second': 62.582,
 'eval_steps_per_second': 3.912,
 'epoch': 2.0}

In [21]:
from transformers import pipeline

# Build a question-answering pipeline from the directory passed to trainer.save_model.
question_answerer = pipeline("question-answering", model="working/final_model")

In [22]:
# Passage used as the shared context for the sanity-check questions below.
context = """India, a captivating mosaic of cultures, religions, and traditions, enthralls with its unparalleled diversity. Its ancient civilization, steeped in history and spanning millennia, has profoundly influenced global heritage. From the vibrant markets of Delhi to the idyllic shores of Goa, India's landscapes boast a breathtaking array of sights and experiences. Chennai, a city in the state of tamil nadu is expected to be the next tech hub in India following bangalore and mumbai. Yet, amidst the splendor, persistent challenges such as poverty and environmental degradation demand attention. With 28 states and 8 union territories, India stands as a testament to unity in diversity. In this vast expanse, Mumbai, the bustling metropolis pulsating with life, claims the title of the largest city by population. As India strides confidently into the future, it remains a land of contrasts, blending tradition with modernity, and embracing its complexities with resilience and grace."""

question_1 = 'Which city is the future tech hub?'
question_2 = 'Which city has the most population?'
question_3 = 'How many states and union territories are there in india?'

# Ask each question against the same passage and print the pipeline's raw answer dict.
for query in (question_1, question_2, question_3):
    print(question_answerer(question=query, context=context))

{'score': 0.33384546637535095, 'start': 360, 'end': 367, 'answer': 'Chennai'}
{'score': 0.6365388035774231, 'start': 705, 'end': 711, 'answer': 'Mumbai'}
{'score': 0.35783636569976807, 'start': 597, 'end': 612, 'answer': '28 states and 8'}
