In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; later cells change into a folder under this mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install datasets sacremoses huggingface_hub tokenizers sentencepiece

Collecting datasets
  Downloading datasets-1.14.0-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 5.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.9 MB/s 
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.2 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.1 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 56.3 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)
[K     |█████████████████████████████

In [None]:
cd /content/drive/MyDrive/VLSP2021-MRC-BLANC

/content/drive/.shortcut-targets-by-id/1WQcuxpxeN4XEvs7OGyXFqtypm6n8iNvd/VLSP2021-MRC-BLANC


## Predict 10-fold

In [None]:
!python run_squad.py \
  --model_type xlm-roberta \
  --model_name_or_path ./models/finetuned10_models/xlm-roberta-large-fold_1 \
  --do_eval \
  --predict_file ./data/private_test_data/private_test_syllable.json \
  --per_gpu_train_batch_size 4 \
  --per_gpu_eval_batch_size 4 \
  --learning_rate 2e-5 \
  --num_train_epochs 3.0 \
  --max_seq_length 384 \
  --max_answer_length 500 \
  --max_query_length 128 \
  --doc_stride 128 \
  --geometric_p 0.7 \
  --window_size 2 \
  --lmb 0.4 \
  --logging_steps 500 \
  --save_steps 500 \
  --version_2_with_negative \
  --cache_dir  ./results/private_test_result/10_fold_new/xlm-roberta-large-fold_1
  --output_dir ./results/private_test_result/10_fold_new/xlm-roberta-large-fold_1

In [None]:
!python run_squad.py \
  --model_type rembert \
  --model_name_or_path ./models/finetuned10_models/rembert-fold_1 \
  --do_eval \
  --predict_file ./data/private_test_data/private_test_syllable.json \
  --per_gpu_eval_batch_size 2 \
  --learning_rate 2e-5 \
  --num_train_epochs 3.0 \
  --max_answer_length 500 \
  --max_seq_length 384 \
  --max_query_length 128 \
  --doc_stride 128 \
  --geometric_p 0.7 \
  --window_size 2 \
  --lmb 0.4 \
  --logging_steps 500 \
  --save_steps 0 \
  --version_2_with_negative \
  --overwrite_cache \
  --cache_dir ./results/private_test_result/10_fold_new/rembert-fold_1 \
  --output_dir ./results/private_test_result/10_fold_new/rembert-fold_1\

## Ensemble 10-fold

In [None]:
# Paths of the per-fold n-best prediction files: RemBERT folds 1-10 first, then XLM-R folds 1-10.
# NOTE(review): the prediction cells in this notebook write to .../10_fold_new/ while these
# paths point at .../10_fold/ — confirm which directory is intended.
input_nbest_files_rembert = [
    f"./results/private_test_result/10_fold/rembert-fold_{fold}/nbest_predictions_.json"
    for fold in range(1, 11)
]
input_nbest_files_xlm = [
    f"./results/private_test_result/10_fold/xlm-roberta-large-fold_{fold}/nbest_predictions_.json"
    for fold in range(1, 11)
]
input_nbest_files = [*input_nbest_files_rembert, *input_nbest_files_xlm]

In [None]:
import collections
import json

# Per-file ensemble weights, in the same order as input_nbest_files
# (ten RemBERT folds followed by ten XLM-R folds); the weights sum to 1.
best_cof = [0.5/10 for _ in range(10)] + [0.5/10 for _ in range(10)]
if len(best_cof) != len(input_nbest_files):
    # Fail early instead of hitting an IndexError (or silently ignoring weights) mid-loop.
    raise ValueError(
        f"expected one weight per n-best file, got {len(best_cof)} weights "
        f"for {len(input_nbest_files)} files")

# question id -> {(answer text, new_context): weighted sum of probabilities over all files}
all_nbest = collections.OrderedDict()
for weight, input_file in zip(best_cof, input_nbest_files):
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader, strict=False)
    for (key, entries) in input_data.items():
        if key not in all_nbest:
            all_nbest[key] = collections.defaultdict(float)
        for entry in entries:
            all_nbest[key][(entry["text"], entry["new_context"])] += weight * entry["probability"]

# Keep, for every question, the (text, new_context) pair with the highest accumulated score.
# max() returns the first maximal candidate, the same pick as a stable descending sort.
output_predictions = {}
for (key, entry_map) in all_nbest.items():
    output_predictions[key] = max(entry_map, key=entry_map.get)

In [None]:
# Save the ensembled prediction of every question id as pretty-printed JSON,
# keeping non-ASCII characters unescaped.
with open("/content/results_pred.json", mode="w", encoding='utf-8') as predictions_fp:
  json.dump(output_predictions, fp=predictions_fp, ensure_ascii=False, indent=4)

In [None]:
# Rebuild the private test set in SQuAD-style layout, replacing each question's context
# with the context chosen by the ensemble (second element of output_predictions[id]).
# The file is opened with a context manager and an explicit encoding so the handle is
# closed and decoding does not depend on the platform default.
with open('./data/private_test_data/private_test_syllable.json', encoding='utf-8') as reader:
  raw_data = json.load(reader)

new_data_train = {'version': "Top 1", 'data': []}
for dt in raw_data['data']:
  article = {"title": dt['title'], "paragraphs": []}
  new_data_train['data'].append(article)
  for para in dt['paragraphs']:
    for qa in para['qas']:
      prediction = output_predictions[qa['id']]
      answer = prediction[0]
      new_context = prediction[1]
      # Questions whose ensembled answer is empty are left out of the new file.
      if answer == '':
        continue

      # NOTE(review): every kept question is written with is_impossible=True and an empty
      # answers list — presumably fine because this file is only used for prediction; confirm.
      qas_ = [{'question': qa['question'], 'answers': [], 'id': qa['id'], 'is_impossible': True}]
      # One paragraph per question, since each question may get a different context.
      article["paragraphs"].append({'context': new_context, 'qas': qas_})


In [None]:
# Write the rebuilt dataset as pretty-printed JSON, keeping non-ASCII characters unescaped.
with open("./data/top1_data/private_data_top1.json", mode="w", encoding='utf-8') as top1_fp:
  json.dump(new_data_train, fp=top1_fp, ensure_ascii=False, indent=4)

## Predict 10-fold top1

In [None]:
!python run_squad.py \
  --model_type xlm-roberta \
  --model_name_or_path ./models/finetuned10_top1_models/xlm-roberta-large-fold_1 \
  --do_eval \
  --predict_file ./data/private_test_data/private_test_syllable.json \
  --per_gpu_train_batch_size 4 \
  --per_gpu_eval_batch_size 4 \
  --learning_rate 2e-5 \
  --num_train_epochs 3.0 \
  --max_seq_length 384 \
  --max_answer_length 500 \
  --max_query_length 128 \
  --doc_stride 128 \
  --geometric_p 0.7 \
  --window_size 2 \
  --lmb 0.4 \
  --logging_steps 500 \
  --save_steps 500 \
  --version_2_with_negative \
  --cache_dir  ./results/private_test_result/10_fold_top1/xlm-roberta-large-fold_1
  --output_dir ./results/private_test_result/10_fold_top1/xlm-roberta-large-fold_1

In [None]:
!python run_squad.py \
  --model_type rembert \
  --model_name_or_path ./models/finetuned10_top1_models/rembert-fold_1 \
  --do_eval \
  --predict_file ./data/private_test_data/private_test_syllable.json \
  --per_gpu_eval_batch_size 2 \
  --learning_rate 2e-5 \
  --num_train_epochs 3.0 \
  --max_answer_length 500 \
  --max_seq_length 384 \
  --max_query_length 128 \
  --doc_stride 128 \
  --geometric_p 0.7 \
  --window_size 2 \
  --lmb 0.4 \
  --logging_steps 500 \
  --save_steps 0 \
  --version_2_with_negative \
  --overwrite_cache \
  --cache_dir ./results/private_test_result/10_fold_top1/rembert-fold_1 \
  --output_dir ./results/private_test_result/10_fold_top1/rembert-fold_1\

## Ensemble 10-fold top1

In [None]:
# Paths of the per-fold n-best prediction files produced under 10_fold_top1.
input_nbest_files_xlm = [
    f"./results/private_test_result/10_fold_top1/xlm-roberta-large-fold_{fold}/nbest_predictions_.json"
    for fold in range(1, 11)
]
input_nbest_files_rembert = [
    f"./results/private_test_result/10_fold_top1/rembert-fold_{fold}/nbest_predictions_.json"
    for fold in range(1, 11)
]
# Gather all nbest_predictions_ files: RemBERT folds first, then XLM-R folds.
input_nbest_files = [*input_nbest_files_rembert, *input_nbest_files_xlm]

In [None]:
import collections
import json

# Per-file ensemble weights, in the same order as input_nbest_files
# (ten RemBERT folds followed by ten XLM-R folds); the weights sum to 1.
best_cof = [0.5/10 for _ in range(10)] + [0.5/10 for _ in range(10)]
if len(best_cof) != len(input_nbest_files):
    # Fail early instead of hitting an IndexError (or silently ignoring weights) mid-loop.
    raise ValueError(
        f"expected one weight per n-best file, got {len(best_cof)} weights "
        f"for {len(input_nbest_files)} files")

# question id -> {answer text: weighted sum of probabilities over all files}
all_nbest = collections.OrderedDict()
for weight, input_file in zip(best_cof, input_nbest_files):
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader, strict=False)
    for (key, entries) in input_data.items():
        if key not in all_nbest:
            all_nbest[key] = collections.defaultdict(float)
        for entry in entries:
            all_nbest[key][entry["text"]] += weight * entry["probability"]

# Keep, for every question, the answer text with the highest accumulated score.
# max() returns the first maximal candidate, the same pick as a stable descending sort.
output_predictions = {}
for (key, entry_map) in all_nbest.items():
    output_predictions[key] = max(entry_map, key=entry_map.get)

In [None]:
# Save the top1 ensemble answers as pretty-printed JSON, keeping non-ASCII characters unescaped.
file_out = "/content/results_pred_2.json"
with open(file_out, mode='w', encoding='utf-8') as predictions_fp:
    json.dump(output_predictions, fp=predictions_fp, indent=4, ensure_ascii=False)

## Assign null answers among duplicate answers using top1 models

In [None]:
# Reload both ensemble outputs from disk. Context managers close the handles and the explicit
# encoding matches the UTF-8 encoding these files were written with.
# pred_10fold values are [answer text, context] pairs (the tuples were serialized as JSON
# arrays); pred_10fold_top1 values are plain answer strings.
with open("/content/results_pred.json", encoding="utf-8") as reader:
    pred_10fold = json.load(reader)
with open("/content/results_pred_2.json", encoding="utf-8") as reader:
    pred_10fold_top1 = json.load(reader)

In [None]:
def group_ans_in_context(test_file):
  """Group question ids by the paragraph they belong to.

  Args:
    test_file: path to a JSON file with a top-level 'data' list of articles, each holding
      'paragraphs', each of which holds 'qas' entries carrying an 'id'.

  Returns:
    dict mapping a running paragraph index (0, 1, 2, ... across all articles, in file
    order) to the list of question ids of that paragraph. Paragraphs without questions
    map to an empty list.
  """
  # Context manager closes the file; explicit UTF-8 avoids depending on the locale default.
  with open(test_file, encoding='utf-8') as reader:
    articles = json.load(reader)['data']

  data_id = {}
  for article in articles:
    for para in article['paragraphs']:
      # len(data_id) is the next free running index, replacing a manual counter.
      data_id[len(data_id)] = [qa['id'] for qa in para['qas']]
  return data_id
 
def find_same_predict_answer(data_id, data ):
    """Find questions of the same paragraph that received identical predictions.

    Args:
        data_id: dict mapping a paragraph key to the list of its question ids.
        data: dict mapping a question id to its prediction; the first element of a
            prediction is compared against "" to skip empty answers.

    Returns:
        dict mapping each paragraph key that has at least one duplicated, non-empty
        prediction to a list of groups, each group being the list of question ids
        (in paragraph order) that share the same prediction.
    """
    dct_final = {}
    for context_key, question_ids in data_id.items():
        predictions = [data[qid] for qid in question_ids]
        duplicate_groups = []
        for qid in question_ids:
            current = data[qid]
            # Empty answers are never treated as duplicates of each other.
            if current[0] == "":
                continue
            group = [
                question_ids[pos]
                for pos, other in enumerate(predictions)
                if other == current
            ]
            # Every member of a group regenerates the same id list; keep it only once.
            if len(group) > 1 and group not in duplicate_groups:
                duplicate_groups.append(group)
        if duplicate_groups:
            dct_final[context_key] = duplicate_groups
    return dct_final

def find_answer_same_predict_ans(prediction, test_file_path):
    """Return the groups of question ids that share a prediction within one paragraph.

    Args:
        prediction: dict mapping a question id to its prediction.
        test_file_path: path of the test JSON file used to group question ids by paragraph.

    Returns:
        The result of find_same_predict_answer for the paragraph grouping of the test file.
    """
    # Group the question ids of each paragraph, then look for duplicated predictions per group.
    ids_by_context = group_ans_in_context(test_file_path)
    return find_same_predict_answer(ids_by_context, prediction)

# Duplicate-prediction groups of the 10-fold ensemble, keyed by paragraph index of the private test file.
same_ans = find_answer_same_predict_ans(
    prediction=pred_10fold,
    test_file_path="./data/private_test_data/private_test_syllable.json",
)

In [None]:
# Question ids for which the top1 ensemble predicted an empty answer.
list_null_top1 = [qid for qid, answer in pred_10fold_top1.items() if answer == ""]

# Among the questions that share a duplicated prediction, keep those the top1 ensemble left empty.
null_list_same_ans = [
    qid
    for groups in same_ans.values()
    for group in groups
    for qid in group
    if qid in list_null_top1
]

In [None]:
# Final answers: questions flagged in null_list_same_ans get an empty answer, every other
# question keeps the answer text (first element) of its 10-fold ensemble prediction.
# A set makes the membership test constant-time instead of scanning the list per question.
null_ids = set(null_list_same_ans)
new_data = {
    qid: "" if qid in null_ids else prediction[0]
    for qid, prediction in pred_10fold.items()
}

In [None]:
# Save the final answers as pretty-printed JSON, keeping non-ASCII characters unescaped.
file_out = "/content/results.json"
with open(file_out, mode='w', encoding='utf-8') as results_fp:
    json.dump(new_data, fp=results_fp, indent=4, ensure_ascii=False)