In [None]:
# Clone the jiant repository into the current working directory. Later cells read
# jiant/requirements-no-torch.txt from this checkout and add /content/jiant to
# sys.path (assumes the working directory is /content — TODO confirm).
!git clone https://github.com/nyu-mll/jiant.git

In [None]:
# Install the pinned PyTorch build (1.9.1, cu111 wheel) from the PyTorch wheel index.
!pip install torch==1.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Rewrite jiant's requirements file in place without any line that starts with
# "numpy", so the following `pip install -r` does not install numpy from it.
with open("jiant/requirements-no-torch.txt", "r") as f:
    lines = f.readlines()

with open("jiant/requirements-no-torch.txt", "w") as f:
    # Every other requirement line is written back unchanged, in the original order.
    f.writelines(
        requirement
        for requirement in lines
        if not requirement.strip("\n").startswith("numpy")
    )

In [None]:
# Install jiant's remaining dependencies. Run the previous cell first: it removes
# the numpy entry from requirements-no-torch.txt.
!pip install -r jiant/requirements-no-torch.txt

In [None]:
import sys
# Put the cloned jiant checkout first on the import path
# (assumes the repository was cloned to /content/jiant).
sys.path.insert(0, "/content/jiant")
# NOTE(review): IntProgress is not referenced anywhere else in the visible notebook;
# presumably imported to fail early when ipywidgets is unavailable — confirm.
from ipywidgets import IntProgress

In [None]:
import jiant.proj.main.tokenize_and_cache as tokenize_and_cache
import jiant.proj.main.export_model as export_model
import jiant.proj.main.scripts.configurator as configurator
import jiant.proj.main.runscript as main_runscript
import jiant.shared.caching as caching
import jiant.utils.python.io as py_io
import jiant.utils.display as display
import jiant.scripts.download_data.runscript as downloader
import os
from enum import Enum

**Add Google Drive and Google Sheets credentials**

In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive; later cells copy the Macedonian data from
# /gdrive/MyDrive and copy finished runs back to it.
drive.mount('/gdrive')

In [None]:
# Authenticate the Colab user, then build the gspread client `gc` that the
# results-logging cell uses to open the 'QA_results' spreadsheet.
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
# default() returns a pair; only the first element (the credentials) is used here.
creds, _ = default()
gc = gspread.authorize(creds)


**Define the parameters**

In [None]:
class CrossLingualType(Enum):
    """Selects the cross-lingual training setup used by the parameter cell.

    EN_to_MK: train on the English task only; evaluate on the Macedonian task.
    Combined: train on both the English and the Macedonian task; evaluate on
    the Macedonian task.
    """
    EN_to_MK = 1
    Combined = 2

In [None]:
# --- Experiment parameters -------------------------------------------------
TASK_NAME = "multirc"
# Absolute path: every later cell hard-codes /content/content/tasks/..., so the
# download target must not depend on the current working directory.
DATA_DIR = "/content/content/tasks"
MODEL_NAME = "RVN/BERTovski"
# Integer run id: names the output directory ./runs/<RUN_NAME> and selects the
# spreadsheet row (RUN_NAME + 2) in the results-logging cell.
RUN_NAME = 8
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 32
EVAL_STEPS = 1000
EPOCHS = 5
LEARNING_RATE = 1e-5
OPTIMIZER = "AdamW"
ADAM_EPSILON = 1e-8
MAX_GRAD_NORM = 1.2

CROSS_LINGUAL_TYPE = CrossLingualType.Combined

# --- Derived task lists ----------------------------------------------------
# Both setups evaluate on the Macedonian task and tokenize only the English
# train split; they differ in whether Macedonian data is also trained on.
TEST_SETS = [f"{TASK_NAME}_mk"]
TEST_SETS_STR = f"{TASK_NAME}_MK"
PHASES_EN = ["train"]

if CROSS_LINGUAL_TYPE == CrossLingualType.EN_to_MK:
    # Train on English only; Macedonian is needed for val/test only.
    TRAIN_SETS = [TASK_NAME]
    TRAIN_SETS_STR = f"{TASK_NAME}_EN"
    PHASES_MK = ["val", "test"]
else:
    # Combined: train on English and Macedonian together.
    TRAIN_SETS = [TASK_NAME, f"{TASK_NAME}_mk"]
    TRAIN_SETS_STR = f"{TASK_NAME}_EN, {TASK_NAME}_MK"
    PHASES_MK = ["train", "val", "test"]

In [None]:
# Quick check of the selected setup: prints "OK" only when EN_to_MK is active
# and prints nothing for any other value.
if CROSS_LINGUAL_TYPE == CrossLingualType.EN_to_MK:
    print("OK")

In [None]:
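# Optional cleanup (disabled): uncomment to delete the exported models, previous
# runs, and downloaded task data before starting over.
# !rm -r ./models
# !rm -r ./runs
# #!rm -r ./content/tasks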

**Download the model and English data, then copy the Macedonian data**

In [None]:
# Export the Hugging Face model named by MODEL_NAME to ./models/<MODEL_NAME>;
# the run configuration later reads model/model.p and model/config.json from there.
export_model.export_model(
    output_base_path=f"./models/{MODEL_NAME}",
    hf_pretrained_model_name_or_path=MODEL_NAME,
)

In [None]:
# Download the data for TASK_NAME into DATA_DIR. Later cells expect the result
# under /content/content/tasks (data/ and configs/ subfolders).
downloader.download_data([TASK_NAME], DATA_DIR)

In [None]:
# Copy the Macedonian dataset for the current task from Google Drive into the
# task data folder. Exactly one line should be active and it must match TASK_NAME.
# !mkdir -p /content/content/tasks/data/copa_mk && cp -r /gdrive/MyDrive/COPA/* /content/content/tasks/data/copa_mk
# !mkdir -p /content/content/tasks/data/boolq_mk && cp -r /gdrive/MyDrive/BoolQ/* /content/content/tasks/data/boolq_mk
!mkdir -p /content/content/tasks/data/multirc_mk && cp -r /gdrive/MyDrive/MultiRC/* /content/content/tasks/data/multirc_mk

In [None]:
# Create the Macedonian task config as a copy of the English one. Exactly one
# line should be active and it must match TASK_NAME.
# NOTE(review): the copy is byte-identical to the English config; the Macedonian
# tokenization cell says the config file must be changed first, so it presumably
# needs a manual edit to point at the *_mk data — confirm.
# !cp -r /content/content/tasks/configs/copa_config.json /content/content/tasks/configs/copa_mk_config.json
# !cp -r /content/content/tasks/configs/boolq_config.json /content/content/tasks/configs/boolq_mk_config.json
!cp -r /content/content/tasks/configs/multirc_config.json /content/content/tasks/configs/multirc_mk_config.json

**Tokenize and cache English data**

In [None]:
# Tokenize the English task with the MODEL_NAME tokenizer and cache the phases
# listed in PHASES_EN under ./cache/<TASK_NAME>.
tokenize_and_cache.main(
    tokenize_and_cache.RunConfiguration(
        task_config_path=f"/content/content/tasks/configs/{TASK_NAME}_config.json",
        hf_pretrained_model_name_or_path=MODEL_NAME,
        output_dir=f"./cache/{TASK_NAME}",
        phases=PHASES_EN,
        # 230 is the value used for multirc with makedonizer;
        # 160 was the earlier multirc setting.
        max_seq_length=230,
    )
)


In [None]:
# Spot-check the English cache: load chunk 0 of the train split and print the
# tokens of its first data row.
row = caching.ChunkedFilesDataCache(f"./cache/{TASK_NAME}/train").load_chunk(0)[0]["data_row"]
print(row.tokens)
# print(row.tokens_list)

**Tokenize and cache Macedonian data**

In [None]:
# Tokenize the Macedonian task and cache the phases listed in PHASES_MK under
# ./cache/<TASK_NAME>_mk.
# Don't forget to change the <TASK_NAME>_mk config file before running this cell!
tokenize_and_cache.main(
    tokenize_and_cache.RunConfiguration(
        task_config_path=f"/content/content/tasks/configs/{TASK_NAME}_mk_config.json",
        hf_pretrained_model_name_or_path=MODEL_NAME,
        output_dir=f"./cache/{TASK_NAME}_mk",
        phases=PHASES_MK,
        # Same sequence length as the English multirc cache.
        max_seq_length=230,
    )
)


In [None]:
# Spot-check the Macedonian cache: load chunk 0 of the val split and print the
# tokens of its first data row.
row = caching.ChunkedFilesDataCache(f"./cache/{TASK_NAME}_mk/val").load_chunk(0)[0]["data_row"]
print(row.tokens)
# print(row.tokens_list)

**Set the parameters**

In [None]:
# Build the multi-task run configuration: train on TRAIN_SETS, and use TEST_SETS
# for train-val, val and test alike.
_configurator = configurator.SimpleAPIMultiTaskConfigurator(
    task_config_base_path="/content/content/tasks/configs",
    task_cache_base_path="./cache",
    train_task_name_list=TRAIN_SETS,
    train_val_task_name_list=TEST_SETS,
    val_task_name_list=TEST_SETS,
    test_task_name_list=TEST_SETS,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    epochs=EPOCHS,
)
jiant_run_config = _configurator.create_config()

# Show the generated configuration for inspection.
display.show_json(jiant_run_config)

In [None]:
# Map both the English task and its Macedonian counterpart to the single task
# model named TASK_NAME.
jiant_run_config["taskmodels_config"]["task_to_taskmodel_map"] = dict.fromkeys(
    (TASK_NAME, f"{TASK_NAME}_mk"),
    TASK_NAME,
)

# Only the train phase was cached for the English task, so its cache entry is
# replaced by one that lists just that split.
jiant_run_config["task_cache_config_dict"][TASK_NAME] = {
    "train": f"./cache/{TASK_NAME}/train",
}

# Persist the patched configuration where the run-arguments cell expects it.
os.makedirs("./run_configs/", exist_ok=True)
py_io.write_json(jiant_run_config, "./run_configs/jiant_run_config.json")

In [None]:
# Assemble the jiant run arguments: train, validate, and save checkpoints and
# predictions under ./runs/<RUN_NAME>.
#
# ADAM_EPSILON and MAX_GRAD_NORM are forwarded explicitly. The results-logging
# cell records these constants, so they must be the values the run actually
# uses rather than jiant's defaults.
# NOTE(review): OPTIMIZER is still only logged, not forwarded — jiant's
# `optimizer_type` uses its own naming, so confirm the mapping before passing it.
run_args = main_runscript.RunConfiguration(
    jiant_task_container_config_path="./run_configs/jiant_run_config.json",
    output_dir=f"./runs/{RUN_NAME}",
    hf_pretrained_model_name_or_path=f"{MODEL_NAME}",
    model_path=f"./models/{MODEL_NAME}/model/model.p",
    model_config_path=f"./models/{MODEL_NAME}/model/config.json",
    learning_rate=LEARNING_RATE,
    adam_epsilon=ADAM_EPSILON,
    max_grad_norm=MAX_GRAD_NORM,
    eval_every_steps=EVAL_STEPS,
    do_train=True,
    do_val=True,
    do_save=True,
    do_save_last=True,
    do_save_best=True,
    keep_checkpoint_when_done=True,
    write_val_preds=True,
    write_test_preds=True,
    # Checkpoints are saved at the same cadence as evaluation.
    save_every_steps=EVAL_STEPS,
    force_overwrite=True
)

**Run the model**

In [None]:
# Start the jiant run with the arguments assembled in the previous cell.
main_runscript.run_loop(run_args)

In [None]:
# Worksheets of the 'QA_results' spreadsheet (index passed to get_worksheet):
#   Sheet 1: BoolQ (0)
#   Sheet 2: COPA (1)
#   Sheet 3: MultiRC (2)
#   Sheet 4: Multi-task (3)
#   Sheet 5: COPA cross-lingual (4)
#   Sheet 6: BoolQ cross-lingual (5)
#   Sheet 7: MultiRC cross-lingual (6)
# Check that you're opening the right sheet!
worksheet = gc.open('QA_results').get_worksheet(6)

# Run N is written to row N + 2. Move to the next row (bump RUN_NAME) before
# writing new results, otherwise the existing row is overwritten.
model_num = RUN_NAME + 2
cell_list = worksheet.range(f'A{model_num}:O{model_num}')

# One value per cell of columns A-O: the run settings followed by five literal
# metric values.
values = [
    RUN_NAME, MODEL_NAME, TRAIN_BATCH_SIZE, EPOCHS, LEARNING_RATE,
    OPTIMIZER, ADAM_EPSILON, MAX_GRAD_NORM,
    TRAIN_SETS_STR, TEST_SETS_STR,
    0.3245756938022362,
    0.708691844814702, 0.3245756938022362, 0.10598111227701994, 0.5431702753274525,
]

# Fill the row position by position, then send the updated cells to the sheet.
for idx in range(len(cell_list)):
    cell_list[idx].value = values[idx]
worksheet.update_cells(cell_list)

**Copy the best models to Google Drive**

In [None]:
# !cp /content/runs/8 -r /gdrive/MyDrive/BestCrossLingualMultiRC/
# !cp /content/runs/8 -r /gdrive/MyDrive/BestCrossLingualBoolQ/
!cp /content/runs/8 -r /gdrive/MyDrive/BestCrossLingualCOPA/