# Installation

In [None]:
# Mount Google Drive at /content/drive; force_remount=True is passed to drive.mount.
# A later cell writes the fine-tuned model under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Download data

In [4]:
# Download train_wiki.csv from Google Drive: the inner wget fetches the download page (saving cookies)
# and sed extracts its confirm= token; the outer wget then requests the file with that token and the
# saved cookies. The cookie file is deleted afterwards.
# NOTE(review): --no-check-certificate disables TLS certificate verification for the inner request.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1dB3X-Wx8qU_5nDG_pxAmLvo5H_sgnHrE' \
 -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1dB3X-Wx8qU_5nDG_pxAmLvo5H_sgnHrE" -O train_wiki.csv && rm -rf /tmp/cookies.txt

In [5]:
# Download test_wiki.csv from Google Drive: the inner wget fetches the download page (saving cookies)
# and sed extracts its confirm= token; the outer wget then requests the file with that token and the
# saved cookies. The cookie file is deleted afterwards.
# NOTE(review): --no-check-certificate disables TLS certificate verification for the inner request.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=11lqipq6ggrgCk8bVxQ4-uuPVMCKN5ebU' \
 -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=11lqipq6ggrgCk8bVxQ4-uuPVMCKN5ebU" -O test_wiki.csv && rm -rf /tmp/cookies.txt

In [6]:
# Download dev_wiki.csv from Google Drive: the inner wget fetches the download page (saving cookies)
# and sed extracts its confirm= token; the outer wget then requests the file with that token and the
# saved cookies. The cookie file is deleted afterwards.
# NOTE(review): --no-check-certificate disables TLS certificate verification for the inner request.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1bJo8TagTGKa0uyppQRqsHrKHyYO5tcZc' \
 -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1bJo8TagTGKa0uyppQRqsHrKHyYO5tcZc" -O dev_wiki.csv && rm -rf /tmp/cookies.txt

## For huggingface

In [7]:
# Clone transformers, then install it together with the seq2seq example requirements.
# NOTE(review): `pip install -e ".[dev]"` runs in the notebook's working directory, not inside the
# cloned transformers/ checkout (each `!` line is a separate shell command and no `cd` is issued) —
# confirm this editable install does what is intended.
# NOTE(review): transformers is installed again from GitHub on the last line; only pyarrow is
# version-pinned, so re-running later may pull different package versions.
!git clone https://github.com/huggingface/transformers 
!pip install -e ".[dev]"
!pip install -r /content/transformers/examples/seq2seq/requirements.txt
!pip install pyarrow==0.17.1
!pip install git+https://github.com/huggingface/transformers

## For Fairseq

In [8]:
# Download the pretrained mBART cc25 archive and unpack it.
# NOTE(review): wget saves into the current working directory while tar reads /content/..., and later
# cells read /content/mbart.cc25.v2/ — this assumes the working directory is /content; confirm.
!wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz
!tar -xzvf /content/mbart.cc25.v2.tar.gz

In [9]:
# Install the toolchain and libraries needed to build SentencePiece from source.
# -y answers apt's confirmation prompt automatically so the cell cannot stall or abort on "Run all".
!apt-get install -y cmake build-essential pkg-config libgoogle-perftools-dev

In [10]:
# Fetch the SentencePiece sources; the following cells build them (spm_encode is used later from build/src/).
!git clone https://github.com/google/sentencepiece.git 

In [None]:
# Create the out-of-tree build directory; -p makes the cell safe to re-run when build/ already exists.
!(cd sentencepiece && mkdir -p build)

In [11]:
# Configure, compile and install SentencePiece from sentencepiece/build, then run ldconfig.
# The steps are chained with &&, so a failure stops the remaining steps.
!(cd sentencepiece/build && cmake .. && make && make install && ldconfig -v)

In [12]:
# Clone fairseq and install it in editable mode from inside the checkout.
# NOTE(review): no commit or tag is pinned here; a commented-out cell further down checks out a
# revision from before "Dec 30 2020" — confirm which fairseq revision this notebook expects.
!git clone https://github.com/pytorch/fairseq
!(cd fairseq && pip install --editable ./)

# Data Preprocessing

In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
def _write_lines(file_path, texts):
  """Write one text per line to `file_path` (UTF-8), keeping the given order.

  Missing values become empty lines and embedded line breaks are replaced by
  spaces, so line i of the file always corresponds to item i of `texts`.
  """
  with open(file_path, 'w', encoding='utf-8') as out_file:
    for text in texts:
      line = '' if pd.isna(text) else ' '.join(str(text).splitlines())
      out_file.write(line + '\n')


def df_to_pairs(path, split='train', lib='huggingface', out_dir='/content/data'):
  """Export sentence pairs from a CSV file into per-split training files.

  The CSV must contain the columns `target_x`, `target_y`, `src` and `dst`.
  Judging by the fairseq branch's file names, `target_x`/`target_y` hold the
  Russian pair and `src`/`dst` the English pair — TODO confirm against the data.

  Args:
    path: Path of the CSV file to read.
    split: Split name used as the output file prefix (e.g. 'train', 'valid', 'test').
    lib: Output layout.
      'fairseq' writes `<split>.en.txt` (src, dst) and `<split>.ru.txt`
      (target_x, target_y) as tab-separated files without header or index.
      'huggingface' writes `<split>.source` (target_x followed by src) and
      `<split>.target` (target_y followed by dst), one example per line.
    out_dir: Directory the files are written to; created if it does not exist.

  Raises:
    ValueError: If `lib` is neither 'fairseq' nor 'huggingface'.
  """
  if lib not in ('fairseq', 'huggingface'):
    raise ValueError("lib must be 'fairseq' or 'huggingface', got %r" % (lib,))

  pairs_df = pd.read_csv(path)
  os.makedirs(out_dir, exist_ok=True)

  if lib == 'fairseq':
    # NOTE(review): the spm_encode cell in this notebook reads <split>.en / <split>.ru,
    # while this branch writes <split>.en.txt / <split>.ru.txt — confirm which names are intended.
    ru_pairs = pairs_df[['target_x', 'target_y']]
    en_pairs = pairs_df[['src', 'dst']]
    en_pairs.to_csv(os.path.join(out_dir, split + '.en.txt'), sep='\t', index=False, header=False)
    ru_pairs.to_csv(os.path.join(out_dir, split + '.ru.txt'), sep='\t', index=False, header=False)
    return

  sources = pd.concat([pairs_df['target_x'], pairs_df['src']], ignore_index=True)
  targets = pd.concat([pairs_df['target_y'], pairs_df['dst']], ignore_index=True)

  # One shared permutation for both sides: shuffling them independently would
  # pair every source line with an unrelated target line.
  order = np.random.permutation(len(sources))

  # Plain line-per-example files: no CSV index column, header row or quoting.
  _write_lines(os.path.join(out_dir, split + '.source'), sources.iloc[order])
  _write_lines(os.path.join(out_dir, split + '.target'), targets.iloc[order])

In [None]:
# Export each downloaded CSV under its split name, using df_to_pairs' default output layout.
for csv_file, split_name in (
    ('/content/train_wiki.csv', 'train'),
    ('/content/dev_wiki.csv', 'valid'),
    ('/content/test_wiki.csv', 'test'),
):
  df_to_pairs(csv_file, split=split_name)

# FairSeq

In [None]:
# Encode every split/language with the mBART SentencePiece model.
# Each command runs in the foreground (no trailing `&`) so that all *.spm.* files are complete
# before the fairseq-preprocess cell reads them.
# Paths are written out in full because each `!` line runs in its own shell, so the shell
# variables sketched in the commented-out lines below would not carry over between lines.
# NOTE(review): the inputs /content/data/<split>.en and <split>.ru are not produced by df_to_pairs
# as written (its fairseq branch writes <split>.en.txt / <split>.ru.txt) — confirm how they are created.
# !SPM="/content/sentencepiece/build/src/spm_encode"
# !BPE_MODEL="/content/mbart.cc25.v2/sentence.bpe.model"
# !DATA_DIR="/content/data"
# !SRC=en
# !TGT=ru
!/content/sentencepiece/build/src/spm_encode --model=/content/mbart.cc25.v2/sentence.bpe.model < /content/data/train.en > /content/data/train.spm.en
!/content/sentencepiece/build/src/spm_encode --model=/content/mbart.cc25.v2/sentence.bpe.model < /content/data/train.ru > /content/data/train.spm.ru
!/content/sentencepiece/build/src/spm_encode --model=/content/mbart.cc25.v2/sentence.bpe.model < /content/data/valid.en > /content/data/valid.spm.en
!/content/sentencepiece/build/src/spm_encode --model=/content/mbart.cc25.v2/sentence.bpe.model < /content/data/valid.ru > /content/data/valid.spm.ru
!/content/sentencepiece/build/src/spm_encode --model=/content/mbart.cc25.v2/sentence.bpe.model < /content/data/test.en > /content/data/test.spm.en
!/content/sentencepiece/build/src/spm_encode --model=/content/mbart.cc25.v2/sentence.bpe.model < /content/data/test.ru > /content/data/test.spm.ru

In [13]:
# Binarize the SentencePiece-encoded train/valid/test splits (en -> ru) with fairseq-preprocess,
# using the mBART dict.txt as both source and target dictionary.
# The output directory (--destdir) is the same /content/data directory that holds the inputs.
# The two commented-out lines below are placeholders (presumably from a template); the command
# itself uses literal paths.
# PREPROCESSED_DATA_DIR=/directory/to/save/preprocessed/data
# DICT=/path/to/downloaded/mbart/model/directory/dict.txt
!fairseq-preprocess \
  --source-lang en \
  --target-lang ru \
  --trainpref /content/data/train.spm \
  --validpref /content/data/valid.spm \
  --testpref /content/data/test.spm \
  --destdir /content/data \
  --thresholdtgt 0 \
  --thresholdsrc 0 \
  --srcdict /content/mbart.cc25.v2/dict.txt \
  --tgtdict /content/mbart.cc25.v2/dict.txt \
  --workers 70

In [14]:
# !(cd /content/fairseq  &&  git checkout `git rev-list -1 --before="Dec 30 2020" master`)

In [15]:
# !(cd /content/fairseq/ && pip install --editable . && python setup.py build develop)

In [None]:
# Fine-tune mBART (en -> ru) with fairseq-train, restoring weights from /content/mbart.cc25.v2/model.pt
# and saving checkpoints to /content/checkpoints.
# stdout is redirected to train_log.txt and the job is sent to the background with `&`;
# stderr is not redirected.
# NOTE(review): the `!langs=...` and `!CUDA_VISIBLE_DEVICES=...` lines each run in their own shell,
# so these assignments do not reach the fairseq-train command below; the language list is passed
# literally via --langs instead. Confirm whether CUDA_VISIBLE_DEVICES was meant to take effect.
# PRETRAIN=/path/to/downloaded/mbart/model/directory/model.pt
!langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN
!CUDA_VISIBLE_DEVICES=0,1,2,3
# !SAVE_DIR=/path/to/save/model/checkpoint
!fairseq-train /content/data \
  --encoder-normalize-before --decoder-normalize-before \
  --arch mbart_large --layernorm-embedding \
  --task translation_from_pretrained_bart \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
  --lr-scheduler polynomial_decay --lr 3e-05 --warmup-updates 2500 --total-num-update 54725  \
  --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
  --max-tokens 1024 --update-freq 2 \
  --source-lang en --target-lang ru \
  --batch-size 16 \
  --validate-interval 1 \
  --patience 3 \
  --max-epoch 25 \
  --save-interval 5 --keep-last-epochs 10 --keep-best-checkpoints 2 \
  --seed 42 --log-format simple --log-interval 500 \
  --restore-file /content/mbart.cc25.v2/model.pt \
  --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \
  --ddp-backend no_c10d \
  --langs ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN \
  --scoring bleu \
  --save-dir /content/checkpoints > train_log.txt &

Traceback (most recent call last):
  File "/usr/local/bin/fairseq-train", line 33, in <module>
    sys.exit(load_entry_point('fairseq', 'console_scripts', 'fairseq-train')())
  File "/usr/local/bin/fairseq-train", line 25, in importlib_load_entry_point
    return next(matches).load()
  File "/usr/local/lib/python3.6/dist-packages/importlib_metadata/__init__.py", line 96, in load
    module = import_module(match.group('module'))
  File "/usr/lib/python3.6/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 994, in _gcd_import
  File "<frozen importlib._bootstrap>", line 971, in _find_and_load
  File "<frozen importlib._bootstrap>", line 955, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 665, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 678, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_rem

# HuggingFace


In [None]:
from tqdm.auto import tqdm
import torch
from torch import nn
from torch.utils.data import DistributedSampler, RandomSampler

import logging
import os
import sys
import copy
from dataclasses import dataclass, field
from typing import Optional
import numpy as np

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    HfArgumentParser,
    MBartTokenizer,
    TrainingArguments,
    set_seed,
    Trainer,
)

from transformers.trainer_utils import EvaluationStrategy
from transformers.optimization import (
    Adafactor,
    AdamW,
    get_constant_schedule,
    get_constant_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup,
)

from typing import Any, Dict, Optional, Tuple, Union

import torch
from torch import nn
from torch.utils.data import DistributedSampler, RandomSampler

from transformers import PreTrainedModel, Trainer, logging
from transformers.file_utils import is_torch_tpu_available

(I used the legacy seq2seq example scripts because they are what I had worked with before.)

(Also, to make it run without problems, I commented out the git-related lines in the scripts under "transformers/examples/legacy/seq2seq/".)

In [None]:
# Fine-tune google/mt5-small with the legacy seq2seq finetune_trainer.py script on the
# <split>.source / <split>.target files in /content/data, for one epoch, with the embeddings and
# encoder frozen. The output directory is on the mounted Google Drive.
# The trailing commented-out options are disabled alternatives and are not passed to the script.
!python '/content/transformers/examples/legacy/seq2seq/finetune_trainer.py' \
    --data_dir '/content/data' \
    --model_name_or_path 'google/mt5-small' \
    --tokenizer_name  'google/mt5-small' \
    --output_dir "/content/drive/MyDrive/rsse_model" \
    --num_train_epochs 1 --do_train \
    --n_val 5000 \
    --freeze_embeds --freeze_encoder \
    # --overwrite_output_dir 
        # --model_name_or_path 'google/mt5-small' \

2021-02-21 15:56:35.544452: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
02/21/2021 15:56:38 - INFO - __main__ -   Training/evaluation parameters Seq2SeqTrainingArguments(output_dir='/content/drive/MyDrive/rsse_model', overwrite_output_dir=False, do_train=True, do_eval=None, do_predict=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, logging_dir='runs/Feb21_15-56-38_745c92ba1a0d', logging_strategy=<LoggingStrategy.STEPS: 'steps'>, logging_first_step=False, logging_step