In [53]:
!ls ../input/chaii-hindi-and-tamil-question-answering

In [54]:
import pandas as pd
import transformers
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
import numpy as np
from transformers import TrainingArguments, Trainer
from transformers import default_data_collator
import collections
import random
import csv
# Single seed shared by every random number generator used in this notebook.
random_seed = 42

# Seed Python's `random`, NumPy and PyTorch (CPU generator and CUDA generator).
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
# Set the cuDNN "deterministic" flag.
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
%env WANDB_DISABLED=True

In [55]:
# Alternative checkpoints that were tried earlier:
# model_name = "deepset/xlm-roberta-base-squad2"
# model_name = '../input/xlm-roberta-squad2/deepset/xlm-roberta-base-squad2'
model_name = '../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2'

# The tokenizer and the question-answering model are loaded from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [4]:
# helper functions
def set_max_len_encd_stride(max_len, encd_stride):
    """Return the two tokenizer window settings unchanged.

    Args:
        max_len: value to use as the tokenizer's maximum sequence length.
        encd_stride: value to use as the tokenizer's stride.

    Returns:
        The tuple ``(max_len, encd_stride)``.
    """
    return max_len, encd_stride

## Preprocessing 

In [5]:
# Read the competition training set and shuffle every row with the notebook seed.
train_data_chai = (
    pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/train.csv", encoding="utf-8")
    .sample(frac=1, random_state=random_seed)
)

In [6]:
# Strip leading whitespace from the questions with the vectorised string accessor
# (it also leaves missing values alone instead of raising).
# The contexts are NOT stripped: `answer_start` is a character offset into the raw
# context, so removing leading characters would shift every answer position.
train_data_chai['question'] = train_data_chai['question'].str.lstrip()

In [7]:
# Hold out the last rows of the (already shuffled) frame for validation and remove
# them from the training frame, so evaluation never runs on examples the model was
# trained on. The hold-out must be sliced BEFORE the training frame is truncated.
# NOTE: this cell shrinks `train_data_chai`; re-run the loading cells before
# re-running it.
n_valid_rows = 10  # use e.g. 512 for a larger hold-out

valid_data_chai = train_data_chai[-n_valid_rows:].reset_index(drop=True)
train_data_chai = train_data_chai[:-n_valid_rows].reset_index(drop=True)

In [8]:
train_data_chai.head()

In [9]:
# External question-answering data read from the `mlqa_hindi.csv` and `xquad.csv`
# files of a separate pre-processing dataset.
data_mldq = pd.read_csv("../input/external-data-mlqa-xquad-preprocessing/mlqa_hindi.csv", encoding = "utf-8")
data_xquad = pd.read_csv("../input/external-data-mlqa-xquad-preprocessing/xquad.csv", encoding = "utf-8")

In [10]:
data_mldq.head()

In [11]:
# Strip leading whitespace from the external questions with the vectorised string
# accessor (it also leaves missing values alone instead of raising).
# The contexts are NOT stripped: `answer_start` is a character offset into the raw
# context, so removing leading characters would shift every answer position.
data_mldq['question'] = data_mldq['question'].str.lstrip()
data_xquad['question'] = data_xquad['question'].str.lstrip()

In [12]:
# data_xquad.head()

In [13]:
# Training frame = chaii rows followed by the MLQA rows. `data_xquad` is loaded but
# not included in the active line. Each source frame keeps its own index here; the
# index is renumbered in a later cell.
train_data = pd.concat([train_data_chai, data_mldq])
# train_data = pd.concat([train_data_chai, data_mldq, data_xquad])
# train_data = train_data_chai

In [14]:
valid_data_chai

In [15]:
# Tamil-only rows of the validation frame (displayed for inspection).
valid_data_chai_tamil = valid_data_chai.loc[valid_data_chai['language'] == 'tamil']
valid_data_chai_tamil

In [16]:
# Hindi-only rows of the validation frame (displayed for inspection).
valid_data_chai_hindi = valid_data_chai.loc[valid_data_chai['language'] == 'hindi']
valid_data_chai_hindi

In [17]:
train_data.head()

In [18]:
# Renumber the concatenated rows 0..n-1. The label-building code later looks answers
# up with `data_sample['answer_start'][sample_index]`, i.e. by integer index label,
# so the index has to be unique and positional.
train_data = train_data.reset_index(drop=True)
train_data

In [19]:
# train_data = train_data.sample(frac=1, random_state = random_seed)

In [20]:
# Convert the pandas frames to `datasets.Dataset` objects; their 'question' and
# 'context' columns are what gets passed to the tokenizer below.
train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(valid_data_chai)
train_dataset

In [21]:
# Tokenizer settings reused by every tokenizer call in this notebook:
# `max_len` is passed as `max_length`, `encoding_stride` as `stride`.
max_len, encoding_stride = set_max_len_encd_stride(384, 128)
print(max_len, encoding_stride, sep="\n")


In [22]:
# Both splits are tokenised with identical settings: only the context (second
# sequence) is truncated, overflowing context is returned as extra features, and
# every feature is padded to `max_len`. Character offsets are returned for each token.
_qa_tokenizer_kwargs = dict(
    truncation='only_second',
    max_length=max_len,
    stride=encoding_stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding='max_length',
)
train_encoding = tokenizer(train_dataset['question'], train_dataset['context'], **_qa_tokenizer_kwargs)
valid_encoding = tokenizer(valid_dataset['question'], valid_dataset['context'], **_qa_tokenizer_kwargs)

In [23]:
class proc_data():
    """Attach answer start/end token labels to a tokenised question-answering batch.

    The encoding object is expected to support ``encoding['input_ids']``,
    ``encoding.sequence_ids(i)``, ``encoding.pop(key)`` and item assignment, and to
    contain the ``'offset_mapping'`` and ``'overflow_to_sample_mapping'`` entries.
    """

    def handle_no_answer(self, data_encoding, answer_start, cls_idx):
        """Label the current feature with ``cls_idx`` when ``answer_start`` is empty.

        Returns:
            True when the CLS labels were appended, False when ``answer_start`` was
            non-empty and nothing was changed.
        """
        if len(answer_start) == 0:
            self.update_data_encoding(data_encoding, cls_idx, cls_idx)
            return True
        return False

    def correct_tokens(self, seq_ids):
        """Return the indices of the first and the last token whose sequence id is 1.

        Raises:
            ValueError: if no token has sequence id 1.
        """
        seq_ids = list(seq_ids)
        first_context_token = seq_ids.index(1)
        last_context_token = len(seq_ids) - 1 - seq_ids[::-1].index(1)
        return first_context_token, last_context_token

    def update_data_encoding(self, data_encoding, start, end):
        """Append one (start, end) label pair to the encoding's label lists."""
        data_encoding['start_positions'].append(start)
        data_encoding['end_positions'].append(end)

    def get_encodings(self, data_encoding, data_sample, tokenizer, offset_mapping, sample_mapping):
        """Compute one start/end token label pair per feature.

        Args:
            data_encoding: encoding that already holds empty ``'start_positions'``
                and ``'end_positions'`` lists; labels are appended to them in place.
            data_sample: source examples, indexable as
                ``data_sample['answer_start'][k]`` and ``data_sample['answer_text'][k]``.
            tokenizer: object exposing ``cls_token_id``.
            offset_mapping: per-feature list of ``(char_start, char_end)`` per token.
            sample_mapping: per-feature index of the example it was cut from.

        Returns:
            ``(data_encoding, offset_mapping, sample_mapping)``.
        """
        for i, offsets in enumerate(offset_mapping):
            input_ids = data_encoding['input_ids'][i]
            cls_idx = input_ids.index(tokenizer.cls_token_id)
            seq_ids = data_encoding.sequence_ids(i)
            sample_index = sample_mapping[i]

            answer_start_char_index = data_sample['answer_start'][sample_index]
            answer_end_char_index = answer_start_char_index + len(data_sample['answer_text'][sample_index])

            context_start, context_end = self.correct_tokens(seq_ids)

            answer_in_window = (
                offsets[context_start][0] <= answer_start_char_index
                and offsets[context_end][1] >= answer_end_char_index
            )
            if not answer_in_window:
                # The answer is not fully inside this window: point both labels at CLS.
                self.update_data_encoding(data_encoding, cls_idx, cls_idx)
                continue

            # Walk inwards from both ends of the context. Both scans are bounded by
            # the context span: tokens outside it (separators, padding) may carry
            # (0, 0) offsets, which would otherwise keep the scans running and push
            # the start label onto the last position of the feature.
            end_token = context_end
            while end_token >= context_start and offsets[end_token][1] >= answer_end_char_index:
                end_token -= 1
            start_token = context_start
            while start_token <= context_end and offsets[start_token][0] <= answer_start_char_index:
                start_token += 1

            self.update_data_encoding(data_encoding, start_token - 1, end_token + 1)
        return data_encoding, offset_mapping, sample_mapping

    def add_start_end_token_positions(self, data_encoding, data_sample, tokenizer, train = True):
        """Pop the offset/sample mappings out of the encoding and optionally add labels.

        ``data_encoding`` is modified in place: ``'offset_mapping'`` and
        ``'overflow_to_sample_mapping'`` are removed from it, and when ``train`` is
        true ``'start_positions'`` / ``'end_positions'`` are added.

        Returns:
            ``(data_encoding, offset_mapping, sample_mapping)``.
        """
        offset_mapping = data_encoding.pop('offset_mapping')
        sample_mapping = data_encoding.pop("overflow_to_sample_mapping")
        if train:
            data_encoding['start_positions'] = []
            data_encoding['end_positions'] = []

            data_encoding, offset_mapping, sample_mapping = self.get_encodings(data_encoding, data_sample, tokenizer, offset_mapping, sample_mapping)

        return data_encoding, offset_mapping, sample_mapping

In [24]:
# One shared helper instance; it is reused for the train, validation and test encodings.
proc_data_obj = proc_data()

In [25]:
# Adds start/end labels to `train_encoding` in place (train=True) and returns it
# together with the offset and sample mappings that were popped out of it.
train_encd_proc, train_offset_mapping, training_sample_mapping = proc_data_obj.add_start_end_token_positions(train_encoding, train_data, tokenizer, True)


In [26]:
# Same labelling for the validation features (train=True, so labels are added here too).
valid_encd_proc, valid_offset_mapping, valid_sample_mapping = proc_data_obj.add_start_end_token_positions(valid_encoding, valid_data_chai, tokenizer, True)

In [27]:
# valid_encd_proc.keys()

In [28]:
# print(list(train_encd_proc.keys()))
# print(train_encd_proc['input_ids'][0])
# print(train_encd_proc['input_ids'][1])

# Print the number of entries in each column, one per line, so the label lists can
# be compared with the number of tokenised features.
for column_name in ('start_positions', 'end_positions', 'input_ids'):
    print(len(train_encd_proc[column_name]))
# print(len(train_encd_proc['attention_masks']))


In [29]:
class ChaiiDataset(torch.utils.data.Dataset):
    """Dataset wrapper that serves one tokenised feature at a time as tensors.

    ``encodings`` must provide ``.items()`` yielding ``(name, column)`` pairs and
    expose the ``input_ids`` column as an attribute.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return ``{column name: tensor of that column's idx-th entry}``."""
        feature = {}
        for name, column in self.encodings.items():
            feature[name] = torch.tensor(column[idx])
        return feature

    def __len__(self):
        """Number of features, taken from the ``input_ids`` column."""
        input_ids = self.encodings.input_ids
        return len(input_ids)

In [30]:
# Wrap the labelled encodings so they can be passed to the Trainer as datasets.
train_dataset_ = ChaiiDataset(train_encd_proc)
valid_dataset_ = ChaiiDataset(valid_encd_proc)

In [31]:
train_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-5,  # library default is 5e-5
    gradient_accumulation_steps=8,  # 4 * 8 = 32 examples per optimizer step per device
    # `warmup_steps` is an absolute number of steps; passing 0.1 there gives
    # effectively no warm-up. A fraction of the schedule belongs in `warmup_ratio`.
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=1,
    # Checkpointing and evaluation run on the same schedule, which
    # `load_best_model_at_end` relies on.
    save_strategy='epoch',
    evaluation_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [32]:
# Trainer wiring: model, arguments, the two wrapped datasets and the default collator.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=default_data_collator,
    train_dataset=train_dataset_,
    eval_dataset=valid_dataset_,
)

In [33]:
# Fine-tune `model` with the arguments configured in `train_args`.
trainer.train()

## Test set: preprocessing and prediction

In [34]:
# Load the competition test set; only the questions are left-stripped.
test_data = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv", encoding="utf-8")
test_data['question'] = test_data['question'].apply(lambda question: question.lstrip())

test_dataset = Dataset.from_pandas(test_data)

# Tokenise with the same window settings (`max_len`, `encoding_stride`).
test_encoding = tokenizer(
    test_dataset['question'],
    test_dataset['context'],
    truncation='only_second',
    max_length=max_len,
    stride=encoding_stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding='max_length',
)

# train=False: no labels are produced. The offset and sample mappings are popped out
# of `test_encoding` itself, and that same object is returned as `test_encd_proc`.
test_encd_proc, test_offset_mapping, test_sample_mapping = proc_data_obj.add_start_end_token_positions(
    test_encoding, test_data, tokenizer, False
)
test_dataset_encd = ChaiiDataset(test_encd_proc)

In [35]:
test_data

In [36]:
test_encd_proc.keys()

In [37]:
test_encoding.keys()

In [38]:
# Run the model on the test features; `predictions.predictions[0]` and `[1]` are
# later passed to `post_processing` as the start and end logits.
predictions = trainer.predict(test_dataset_encd)

## Validation set prediction

In [39]:
# Tokenise the validation set again into a fresh encoding object: the earlier
# `valid_encoding` already had its offset and sample mappings popped out.
valid_encoding_ = tokenizer(
    valid_dataset['question'],
    valid_dataset['context'],
    truncation='only_second',
    max_length=max_len,
    stride=encoding_stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding='max_length',
)

In [40]:
valid_encoding_.keys()

In [41]:
# train=False: no labels are added; the offset and sample mappings are popped out of
# `valid_encoding_` and kept for post-processing.
valid_encd_proc_, valid_offset_mapping_,valid_sample_mapping_ = proc_data_obj.add_start_end_token_positions(valid_encoding_, valid_dataset, tokenizer, False)
# valid_encd_proc_.pop('id')
valid_dataset_encd = ChaiiDataset(valid_encd_proc_)

In [42]:
valid_encoding_.keys()

In [43]:
# Run the model on the unlabelled validation features.
valid_predictions = trainer.predict(valid_dataset_encd)

In [44]:
# TODO: review the commented-out code.
def get_default_corr_ans_dict():
    """Return a fresh placeholder answer: empty text with a score of 0.0."""
    return dict(text="", score=0.0)

def get_logits_mapping(start_logits, end_logits, offset_mapping, index, check_best):
    """Pick the highest-scoring start and end token indices of one feature.

    Args:
        start_logits, end_logits: per-feature logit rows, indexed by ``index``.
        offset_mapping: per-feature token offsets, indexed by ``index``.
        index: which feature to look at.
        check_best: how many top token indices to keep for each of start and end.

    Returns:
        ``(top start indices, top end indices, offsets, start logit row, end logit row)``
        where both index lists are plain lists ordered from highest to lowest logit.
    """
    feature_start_logits = start_logits[index]
    feature_end_logits = end_logits[index]

    # argsort is ascending: reverse it, then keep the first `check_best` entries.
    best_start_tokens = np.argsort(feature_start_logits)[::-1][:check_best].tolist()
    best_end_tokens = np.argsort(feature_end_logits)[::-1][:check_best].tolist()

    return best_start_tokens, best_end_tokens, offset_mapping[index], feature_start_logits, feature_end_logits

def post_processing(data, data_encoding, st_logits, en_logits , sample_mapping, offset_mapping, answer_length_limit = 30, check_best = 20):
    """Turn start/end logits into one answer string per example.

    Args:
        data: iterable of examples, each indexable by ``'context'`` and ``'id'``.
        data_encoding: encoding exposing ``sequence_ids(feature_index)``.
        st_logits, en_logits: start / end logits, one row per feature.
        sample_mapping: for each feature, the position in ``data`` of its example.
        offset_mapping: for each feature, the ``(char_start, char_end)`` of each token.
        answer_length_limit: maximum answer length in tokens (inclusive).
        check_best: number of top start and top end tokens tried per feature.

    Returns:
        ``OrderedDict`` mapping example id to the best-scoring answer text, or ``""``
        when no valid start/end pair exists for that example.
    """
    # Group the feature indices by the example they were cut from.
    features_of_example = collections.defaultdict(list)
    for feature_idx, example_idx in enumerate(sample_mapping):
        features_of_example[example_idx].append(feature_idx)

    predicted_answers = collections.OrderedDict()
    for example_idx, example in enumerate(data):
        feature_indices = features_of_example[example_idx]
        context = example['context']

        best_score = None
        best_answer = ""
        for feature_idx in feature_indices:
            top_starts, top_ends, offsets, start_scores, end_scores = get_logits_mapping(
                st_logits, en_logits, offset_mapping, feature_idx, check_best
            )
            seq_ids = data_encoding.sequence_ids(feature_idx)
            n_tokens = len(offsets)

            for start in top_starts:
                # Only tokens with sequence id 1 may start an answer.
                if start >= n_tokens or seq_ids[start] != 1:
                    continue
                for end in top_ends:
                    if end >= n_tokens or seq_ids[end] != 1:
                        continue
                    if end < start or (end - start) + 1 > answer_length_limit:
                        continue
                    score = start_scores[start] + end_scores[end]
                    # `>=` so that, among equal scores, the candidate seen last wins.
                    if best_score is None or score >= best_score:
                        best_score = score
                        best_answer = context[offsets[start][0]: offsets[end][1]]

        predicted_answers[example["id"]] = best_answer

    return predicted_answers

In [45]:
# Decode validation answers: predictions[0] is passed as the start logits and
# predictions[1] as the end logits.
final_predictions_valid = post_processing(valid_dataset, valid_encoding_, valid_predictions.predictions[0], valid_predictions.predictions[1], valid_sample_mapping_, valid_offset_mapping_)

In [46]:
valid_dataset

## Test Dataset

In [47]:
# Decode test answers: predictions[0] is passed as the start logits and
# predictions[1] as the end logits. The result maps example id -> answer text.
final_predictions = post_processing(test_dataset, test_encoding, predictions.predictions[0], predictions.predictions[1], test_sample_mapping, test_offset_mapping)

In [48]:
# View of the example ids, in the order the predictions were inserted.
key = final_predictions.keys()

In [49]:
# Print every (id, predicted answer) pair for a quick visual check.
for k in key:
    print((k, final_predictions[k]))

In [50]:
# Load the sample submission and display its column names for reference.
sample = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv')
sample.columns

In [51]:

# Build the submission frame directly from the predictions instead of writing a
# header-less CSV and reading it back: the round trip lets pandas re-infer dtypes
# (numeric-looking answers next to empty ones come back as floats such as 1945.0)
# and depended on the platform's default file encoding.
submission = [[str(k), str(final_predictions[k])] for k in key]
sub_df = pd.DataFrame(submission, columns=['id', 'PredictionString'])
print(sub_df)

# Answers are Hindi/Tamil text, so the encoding is stated explicitly.
sub_df.to_csv('submission.csv', index=False, encoding='utf-8')

In [52]:
!cat submission.csv