In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datasets
!pip install transformers
!pip install accelerate

In [None]:
import datasets
import torch
import time, sys
from torch.utils.data import DataLoader

In [None]:
class chaiDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of per-example feature sequences.

    Args:
        encodings: Mapping from feature name to a sequence indexed by example.
            It must contain an ``"input_ids"`` entry, which defines the
            dataset length. Plain ``dict`` objects are accepted as well as
            mappings that additionally offer attribute access.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict holding one tensor per feature."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples (length of the ``input_ids`` entry)."""
        # Key lookup instead of attribute access so that plain dicts work too.
        return len(self.encodings["input_ids"])

In [None]:
def preprocess_data(tokenizer,data):
    """Tokenize context/question pairs and attach answer token positions.

    Args:
        tokenizer: Callable tokenizer. Its result must support
            ``char_to_token`` and ``update``; the tokenizer itself must
            expose ``model_max_length``.
        data: Table-like object with ``"context"``, ``"question"``,
            ``"answer_start"`` and ``"answer_text"`` columns of equal length.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        entries added (one value per example).
    """
    # Materialise every column as a list so that element i always belongs to
    # example i of the tokenized batch. Indexing a pandas Series with [i] is
    # label-based and breaks once the frame has been shuffled or split.
    contexts = list(data["context"])
    questions = list(data["question"])
    answer_starts = list(data["answer_start"])
    answer_texts = list(data["answer_text"])

    encodings = tokenizer(contexts, questions, truncation=True, padding=True)
    start_positions = []
    end_positions = []

    for i, (first_char, text) in enumerate(zip(answer_starts, answer_texts)):
        # Character offset of the last character of the answer (inclusive).
        last_char = first_char + len(text) - 1
        start_token = encodings.char_to_token(i, first_char)
        end_token = encodings.char_to_token(i, last_char)

        # A None result means the character maps to no token (e.g. the answer
        # passage has been truncated); fall back to model_max_length.
        # NOTE(review): confirm model_max_length is a usable index for the
        # tokenizer in use.
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

    return encodings

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer published under the "bigscience/bloomz-560m" identifier.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bigscience/bloomz-560m",
)

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# load_dataset() without a `split` argument returns a DatasetDict (split name ->
# Dataset). Passing that mapping to pd.DataFrame yields one column per split
# rather than one row per example, so convert each split and stack the results.
dataset = datasets.load_dataset("ai4bharat/IndicQA", "indicqa.hi")
df = pd.concat(
    [split.to_pandas() for split in dataset.values()],
    ignore_index=True,
)
# NOTE(review): later cells read flat "answer_start" / "answer_text" columns —
# confirm they exist in this frame or derive them before preprocessing.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical across "Restart & Run All" executions.
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

In [None]:
def preprocess_data(data):
    """Tokenize context/question pairs with the notebook-level ``tokenizer``.

    NOTE: this single-argument definition shares its name with the earlier
    two-argument ``preprocess_data(tokenizer, data)`` in this notebook and
    replaces it once this cell has run.

    Args:
        data: Table-like object with ``"context"``, ``"question"``,
            ``"answer_start"`` and ``"answer_text"`` columns of equal length.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        entries added (one value per example).
    """
    # Pull the columns out as plain lists: positional access on a list always
    # matches the batch order, whereas Series[i] looks up the index *label* i
    # and fails (or picks the wrong row) on a shuffled/split DataFrame.
    answer_starts = list(data["answer_start"])
    answer_texts = list(data["answer_text"])

    encodings = tokenizer(list(data["context"]), list(data["question"]), truncation=True, padding=True)

    # Value stored when a character offset maps to no token.
    # NOTE(review): confirm model_max_length is a usable index for the
    # tokenizer in use.
    fallback = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for row, (begin_char, answer) in enumerate(zip(answer_starts, answer_texts)):
        begin_token = encodings.char_to_token(row, begin_char)
        # Inclusive offset of the final answer character.
        finish_token = encodings.char_to_token(row, begin_char + len(answer) - 1)

        # if a position is None, the answer passage has been truncated
        start_positions.append(fallback if begin_token is None else begin_token)
        end_positions.append(fallback if finish_token is None else finish_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

    return encodings

In [None]:
# Encode both partitions with the same preprocessing routine.
train_encodings = preprocess_data(train_df)
test_encodings = preprocess_data(test_df)

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

In [None]:
# Wrap the training encodings and serve them in shuffled mini-batches of two.
train_dataset = chaiDataset(train_encodings)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=2)

In [None]:
# Evaluation data is served in its original order (no shuffling) so that the
# i-th prediction corresponds to the i-th row of the test encodings.
test_dataset = chaiDataset(test_encodings)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [None]:
# The training and inference cells pass start_positions/end_positions and read
# start_logits/end_logits, i.e. they need the extractive question-answering
# head rather than the causal language-modelling head.
from transformers import BloomForQuestionAnswering

model = BloomForQuestionAnswering.from_pretrained("bigscience/bloomz-560m")

# Freeze the pretrained backbone so only the task head is updated.
# `base_model` is used instead of a hard-coded attribute name (Bloom models
# have no `.roberta` attribute).
for param in model.base_model.parameters():
    param.requires_grad = False

# Move to the device selected earlier instead of assuming CUDA is present.
model.to(device)

In [None]:
# Number of full passes over the training loader.
NUM_EPOCHS = 40

# torch.optim.AdamW is used directly: the AdamW class exported by transformers
# is deprecated in favour of the PyTorch implementation.
# NOTE(review): eps and weight_decay are set explicitly with the intent of
# mirroring the deprecated class's defaults — confirm against its docs.
optim = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)
model.train()

for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # The first element of the model output is the loss when labels are given.
        loss = outputs[0]
        loss.backward()
        optim.step()
        epoch_loss += loss.item()
    # Average over batches; guard against an empty loader to avoid dividing by zero.
    normalized_epoch_loss = epoch_loss / max(len(train_loader), 1)
    print("Epoch {} ; Epoch loss: {} ".format(epoch+1,normalized_epoch_loss))

In [None]:
# Switch the model to evaluation mode before running the inference cell below.
model.eval()

In [None]:
# Decode one predicted answer span per test example.
output_words,output_id = [],[]
# Gradients are not needed for inference; disabling them saves memory and time.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # argmax along the last dimension gives one start/end index per example;
        # without `dim` the index would refer to the flattened batch.
        starts = torch.argmax(outputs["start_logits"], dim=-1)
        ends = torch.argmax(outputs["end_logits"], dim=-1)
        # Handle every example in the batch, not only the first one.
        for example_ids, start, end in zip(input_ids, starts, ends):
            span_ids = example_ids[int(start):int(end) + 1].tolist()
            output_tokens = tokenizer.convert_ids_to_tokens(span_ids)
            output_words.append(tokenizer.convert_tokens_to_string(output_tokens))