# Train XLM-R with a Downsampling Strategy on sentence translation pairs

In this notebook, you can train the XLM-R model with a downsampling strategy on sentence translation pairs.

In [1]:
#! pip3 install torch==1.5.0 transformers==3.4.0
#! pip install pickle5
#! pip install datasets
#! pip install faiss-gpu cudatoolkit=10.0 -c pytorch

In [1]:
# Hugging Face training loop and the sequence-classification model class used below.
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
import json
# pickle5 is bound to the name `pickle`, so later `pickle.load` calls go through pickle5.
import pickle5 as pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch.utils.data as data_utils
import torch
import sys
import os
# Put the parent of the current working directory on the import path so the
# `src` package imported below can be resolved. This must run before that import.
# (Assumes the notebook is started from a sub-folder of the repository root — TODO confirm.)
sys.path.append(os.path.dirname((os.path.abspath(''))))

# Seed NumPy's global RNG only; torch is not seeded here.
np.random.seed(42)
# Project-local dataset wrapper and metrics callback consumed by the training cells.
from src.models.train_text_encoder import Torch_dataset_mono, compute_metrics
# Colab only: uncomment to mount Google Drive (the training cell writes under /content/drive).
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


## Load Data





In [3]:
# Input locations, relative to the notebook's working directory.
# The original literals read "..data/..." (missing the slash after ".."), which
# points at a non-existent "..data" folder; the training cell already uses the
# "../model/..." form, so the data paths are aligned with it here.
binary_dataset_path = "../data/processed/feature_dataframe.json"
path = "../data/processed/europarl_english_german.pkl"
model_used = "xlm-roberta-base"


# Read the candidate-pair table. It is indexed below as
# binary_dataset["source_id"] / binary_dataset["target_id"], each a mapping
# whose values are row positions into the sentence table.
with open(binary_dataset_path, 'r') as myfile:
    binary_dataset = json.load(myfile)

# Load the sentence table (a DataFrame with "text_source" / "text_target" columns).
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
with open(path, 'rb') as f:
    data = pickle.load(f)

current_source_id = list(binary_dataset["source_id"].values())
current_target_id = list(binary_dataset["target_id"].values())

# Assemble one row per candidate pair. The ids are positional (iloc) lookups,
# and reset_index aligns both text columns on a fresh 0..n-1 index.
new_training_set = pd.DataFrame({
    "source_id": current_source_id,
    "target_id": current_target_id,
    "text_source": data["text_source"].iloc[current_source_id].reset_index(drop=True),
    "text_target": data["text_target"].iloc[current_target_id].reset_index(drop=True),
})

# Label: 1 when source and target ids coincide (a true translation pair), else 0.
# Vectorized comparison replaces the row-wise apply and yields the same int labels.
new_training_set["Translation"] = (
    new_training_set["source_id"] == new_training_set["target_id"]
).astype("int64")

# Free the large intermediates; only new_training_set is needed downstream.
del binary_dataset
del data

# Train XLM-R with Downsampling strategy

In [7]:
def downsample(frame, label_col="Translation", random_state=42):
    """Balance `frame` by sampling every class down to the smallest class size.

    NOTE(review): `downsample` was called in this cell but neither defined nor
    imported anywhere in the visible notebook, so a fresh kernel raised
    NameError. This implementation is reconstructed from the recorded cell
    output (class sizes printed before/after, classes equalised, rows
    shuffled). If the project already ships a `downsample` helper, import that
    one instead and confirm it matches.

    Args:
        frame: DataFrame containing a label column.
        label_col: Name of the column holding the class labels.
        random_state: Seed used for both the per-class sampling and the final shuffle.

    Returns:
        A new, shuffled DataFrame in which every class has the same number of
        rows. The input frame is not modified.
    """
    class_sizes = frame[label_col].value_counts().sort_index()
    for label, size in class_sizes.items():
        print("Class {} size: {}".format(label, size))

    # Nothing to balance on an empty frame.
    if class_sizes.empty:
        return frame.copy()

    target_size = int(class_sizes.min())
    sampled_parts = [
        frame[frame[label_col] == label].sample(n=target_size, random_state=random_state)
        for label in class_sizes.index
    ]
    # Shuffle so that the classes are interleaved rather than stacked.
    balanced = pd.concat(sampled_parts).sample(frac=1, random_state=random_state)

    print("After Downsampling:")
    for label, size in balanced[label_col].value_counts().sort_index().items():
        print("Class {} size: {}".format(label, size))
    return balanced


# Hold out 5% of the pairs for evaluation; the test split keeps its natural class imbalance.
train_df, test_df = train_test_split(new_training_set, test_size=.05, random_state=42)

# Only the training split is balanced.
train_df = downsample(train_df)
print(train_df)
print("Size of training set: {}".format(len(train_df)))
print("Size of test set: {}".format(len(test_df)))

# Wrap the frames in the project's torch dataset. `train_dataset` and
# `test_dataset` are the names the training cell consumes.
train_dataset = Torch_dataset_mono(train_df)
test_dataset = Torch_dataset_mono(test_df)

Class 0 size: 189889
Class 1 size: 19111
After Downsampling:
Class 0 size: 19111
Class 1 size: 19111
        source_id  ...  Translation
173441      15344  ...            0
117955       9795  ...            0
153422      13342  ...            0
8182         8182  ...            1
18397       18397  ...            1
...           ...  ...          ...
85143        6514  ...            0
150201      13020  ...            0
17955       17955  ...            1
79156        5915  ...            0
28860         886  ...            0

[38222 rows x 5 columns]
Size of training set: 38222
Size of test set: 11000


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [None]:
# Output locations for checkpoints and logs (Google Drive paths as mounted in Colab).
save_model_path = "/content/drive/MyDrive/model_correct_downsampling"
save_log_path = "/content/drive/MyDrive/log_correct_downsampling"

# Checkpoint that both initialises the model weights and is handed to
# trainer.train() below so that training continues from it.
resume_checkpoint = "../model/model_correct_downsampling/checkpoint-14000"

model = AutoModelForSequenceClassification.from_pretrained(
    resume_checkpoint,
    num_labels=2,
)

# Optional while debugging: transformers.logging.set_verbosity_info()
training_args = TrainingArguments(
    # Where results are written (add overwrite_output_dir=True to reuse a non-empty directory).
    output_dir=save_model_path,
    logging_dir=save_log_path,
    # Schedule and optimisation.
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    warmup_steps=400,
    # Log every 100 steps; evaluate and save a checkpoint every 1000 steps.
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,      # balanced training split
    eval_dataset=test_dataset,        # held-out split
    compute_metrics=compute_metrics,  # project-defined metrics callback
)

trainer.train(resume_checkpoint)



Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1,Log Loss
15000,0.017484,0.044747,0.993636,0.992886,0.939423,0.965415,0.044747
16000,0.017647,0.033347,0.994636,0.987805,0.953876,0.970544,0.033348
17000,0.018798,0.054991,0.992909,0.992886,0.932252,0.961614,0.054992
18000,0.023601,0.027531,0.995364,0.979675,0.968844,0.974229,0.027531
19000,0.026354,0.028442,0.996182,0.99187,0.966337,0.978937,0.028443




Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1,Log Loss
15000,0.017484,0.044747,0.993636,0.992886,0.939423,0.965415,0.044747
16000,0.017647,0.033347,0.994636,0.987805,0.953876,0.970544,0.033348
17000,0.018798,0.054991,0.992909,0.992886,0.932252,0.961614,0.054992
18000,0.023601,0.027531,0.995364,0.979675,0.968844,0.974229,0.027531
19000,0.026354,0.028442,0.996182,0.99187,0.966337,0.978937,0.028443
20000,0.009744,0.035508,0.995273,0.994919,0.954191,0.974129,0.035509
21000,0.000202,0.046436,0.994,0.995935,0.940499,0.967423,0.046437
22000,0.009824,0.035435,0.995545,0.995935,0.956098,0.97561,0.035437


