In [1]:
#! pip3 install torch==1.5.0 transformers==3.4.0
#! pip install pickle5
#! pip install datasets
#! conda install faiss-gpu cudatoolkit=10.0 -c pytorch

# Train XLM-R with a Weighted Loss Strategy on sentence translation pairs

In this notebook, you can train the XLM-R model with a Weighted Loss Strategy on sentence translation pairs.

In [1]:
# Hugging Face training utilities and the sequence-classification model loader.
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
import json
# The pickle5 package is bound to the name `pickle` for the rest of the notebook.
import pickle5 as pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch.utils.data as data_utils
import torch
import sys
import os
# Append the parent of the current working directory to the import path so the
# project-local `src` package imported below can be resolved.
sys.path.append(os.path.dirname((os.path.abspath(''))))

# Seed NumPy's global random generator; no other library is seeded in this cell.
np.random.seed(42)
# Project-local dataset wrapper, metric function and trainer used by the cells below.
from src.models.train_text_encoder import Torch_dataset_mono, compute_metrics, WeightedLossTrainer
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
binary_dataset_path = "/content/drive/MyDrive/CLIR/europarl_data/feature_dataframe.json"
path = "/content/drive/MyDrive/CLIR/europarl_data/europarl_english_german.pkl"
model_used = "xlm-roberta-base"

# Load the (source, target) id pairs. The JSON object is expected to hold a
# "source_id" and a "target_id" mapping whose values are the ids of each pair.
with open(binary_dataset_path, 'r', encoding='utf-8') as pairs_file:
    binary_dataset = json.load(pairs_file)

# Load the sentence table that the ids index into (by row position, via .iloc).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open(path, 'rb') as corpus_file:
    data = pickle.load(corpus_file)

current_source_id = list(binary_dataset["source_id"].values())
current_target_id = list(binary_dataset["target_id"].values())

# Build the pair table in a single step. Only the required text column is
# selected before the positional lookup, and the index is reset so that both
# text columns line up row by row with the id lists.
new_training_set = pd.DataFrame({
    "source_id": current_source_id,
    "target_id": current_target_id,
    "text_source": data["text_source"].iloc[current_source_id].reset_index(drop=True),
    "text_target": data["text_target"].iloc[current_target_id].reset_index(drop=True),
})

# Label: 1 when source and target ids are equal (a true translation pair),
# 0 otherwise. Vectorised comparison instead of a row-wise apply.
new_training_set["Translation"] = (
    new_training_set["source_id"] == new_training_set["target_id"]
).astype(int)

# Release the large intermediates; only new_training_set is needed downstream.
del binary_dataset, data

In [8]:
# Hold out the leading 5% of rows for evaluation and train on the remainder.
# The split is purely positional (no shuffling), so it follows the row order
# of new_training_set.
test_size = 0.05
cutoff = int(len(new_training_set) * test_size)

test_dataset = new_training_set.iloc[:cutoff]
train_dataset = new_training_set.iloc[cutoff:]

# Preview the beginning of the training split.
train_dataset.head(33)

Unnamed: 0,source_id,target_id,text_source,text_target,Translation
30002,1000,13515,There remain too many uncertainties regarding ...,"Es war mir ein besonderes Anliegen, dass Zahle...",0
30009,1000,2413,There remain too many uncertainties regarding ...,Herr Präsident! Indien hat in fast jeder Hinsi...,0
1000,1000,1000,There remain too many uncertainties regarding ...,Er enthält nach wie vor zu viele Unsicherheite...,1
30000,1000,15193,There remain too many uncertainties regarding ...,"Ich möchte anregen, von der Möglichkeit gemäß ...",0
30007,1000,9109,There remain too many uncertainties regarding ...,Sie haben eine etwas andere Betrachtungsweise ...,0
30001,1000,15135,There remain too many uncertainties regarding ...,In den vergangenen zehn Jahren untermauerte di...,0
30005,1000,4883,There remain too many uncertainties regarding ...,Die Kommission pflichtet Herrn Mitchell auch d...,0
30003,1000,8020,There remain too many uncertainties regarding ...,"Wir verfügen über keinen Mechanismus, durch de...",0
30006,1000,15594,There remain too many uncertainties regarding ...,"Zunächst einmal zu Herrn Posselts Frage, ob da...",0
30008,1000,10057,There remain too many uncertainties regarding ...,Wir waren deshalb so unnachgiebig bei manchen ...,0


In [9]:
# Report the split sizes while both splits are still pandas DataFrames.
print(
    "Size of training set: {}".format(len(train_dataset)),
    "Size of test set: {}".format(len(test_dataset)),
    sep="\n",
)

# Wrap each split with the project's Torch_dataset_mono helper.
# NOTE(review): the names are rebound from DataFrame to dataset object, so this
# cell is not safe to re-run without first re-running the split cell.
# NOTE(review): the recorded output shows downloads during this step -
# presumably tokenizer files fetched by Torch_dataset_mono; confirm in src.
train_dataset = Torch_dataset_mono(train_dataset)
test_dataset = Torch_dataset_mono(test_dataset)

Size of training set: 209000
Size of test set: 11000


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [10]:
# Same imports as the notebook's first code cell, repeated here unchanged.
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification

save_model_path = "../model/model_downsampled"
save_log_path = "../model/log_downsampled"

# Checkpoint used both to initialise the model weights and to resume the
# trainer. It is derived from save_model_path once so that the two uses
# cannot drift apart.
resume_checkpoint = save_model_path + "/checkpoint-12000"

# Binary classifier (2 labels) initialised from the saved checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(resume_checkpoint, num_labels=2)


#transformers.logging.set_verbosity_info()
training_args = TrainingArguments(
    output_dir=save_model_path,          # output directory
    #overwrite_output_dir=True,
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=11,  # batch size per device during training
    per_device_eval_batch_size=11,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    warmup_steps=400,                # number of warmup steps for learning rate scheduler
    logging_dir=save_log_path,            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
    evaluation_strategy="steps",     # evaluate on a step schedule (see eval_steps)
    eval_steps=1000,                 # run evaluation every 1000 steps
    save_steps=1000                  # write a checkpoint every 1000 steps
)

# WeightedLossTrainer is the project's trainer from src.models.train_text_encoder.
trainer = WeightedLossTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    compute_metrics=compute_metrics,     # metric function applied at each evaluation
    eval_dataset=test_dataset            # held-out split used for evaluation
)

# Resume training from the checkpoint. The path is passed positionally, as in
# the original call; the returned TrainOutput is displayed as the cell result.
trainer.train(resume_checkpoint)



Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1,Log Loss
13000,0.439534,0.066144,0.987455,0.879208,0.982301,0.9279,0.066144
14000,0.357364,0.023718,0.996273,0.970297,0.9889,0.97951,0.023718
15000,0.035782,0.025231,0.996182,0.981188,0.977318,0.979249,0.025231
16000,0.087091,0.02908,0.993273,0.987129,0.942344,0.964217,0.02908
17000,0.000305,0.019167,0.996182,0.976238,0.982072,0.979146,0.019167
18000,0.383691,0.021048,0.997,0.980198,0.987039,0.983607,0.021048
19000,0.000409,0.019357,0.996818,0.980198,0.985075,0.98263,0.019357




Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1,Log Loss
13000,0.439534,0.066144,0.987455,0.879208,0.982301,0.9279,0.066144
14000,0.357364,0.023718,0.996273,0.970297,0.9889,0.97951,0.023718
15000,0.035782,0.025231,0.996182,0.981188,0.977318,0.979249,0.025231
16000,0.087091,0.02908,0.993273,0.987129,0.942344,0.964217,0.02908
17000,0.000305,0.019167,0.996182,0.976238,0.982072,0.979146,0.019167
18000,0.383691,0.021048,0.997,0.980198,0.987039,0.983607,0.021048
19000,0.000409,0.019357,0.996818,0.980198,0.985075,0.98263,0.019357


TrainOutput(global_step=19000, training_loss=0.038409096165707236)