In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip3 install torch==1.5.0 transformers==3.4.0
! pip install faiss-gpu cudatoolkit=10.0 -c pytorch

Collecting torch==1.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/76/58/668ffb25215b3f8231a550a227be7f905f514859c70a65ca59d28f9b7f60/torch-1.5.0-cp37-cp37m-manylinux1_x86_64.whl (752.0MB)
[K     |████████████████████████████████| 752.0MB 23kB/s 
[?25hCollecting transformers==3.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 39.9MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/35/e7/edf655ae34925aeaefb7b7fcc3dd0887d2a1203ee6b0df4d1170d1a19d4f/tokenizers-0.9.2-cp37-cp37m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 38.7MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp3

## Load Data, tokenize and split into train/val/test



In [2]:
import pickle

path = "/content/drive/MyDrive/CLIR/europarl_data/dataset_duc.pkl"
model_used = "xlm-roberta-base"

# Load Data
with open(path, 'rb') as f:
    data = pickle.load(f)
data.head()

Unnamed: 0,text_source,text_target,Translation
0,"Mention was made of citizenship, but I felt th...","Zwar wird der Bürgersinn erwähnt, doch wurde m...",1
1,"Yesterday, Mr Barroso rightly said that we nee...",Kommissionspräsident Barroso hat gestern zu Re...,1
2,Some of them even gestured to me to resort to ...,Jemand forderte mich sogar mit einer Geste auf...,1
3,Despite the excellent work of Mr Hernández Mol...,Trotz der ausgezeichneten Arbeit des Ausschuss...,1
4,"(GA) Mr President, I also welcome the Taoiseac...",(GA) Herr Präsident! Auch ich möchte den Premi...,1


In [3]:
import torch
from transformers import AutoTokenizer


class Torch_dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
        sentence_pairs = data.apply(lambda row: [row["text_source"], row["text_target"]], axis=1).tolist()
        self.encodings = tokenizer(sentence_pairs, padding="max_length", truncation="longest_first", return_tensors="pt")
        self.labels = data["Translation"].tolist()

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
      

In [4]:
from sklearn.model_selection import train_test_split
import torch.utils.data as data_utils
import torch
import numpy as np


train, test = train_test_split(data, test_size=.2)



# Convert to pytorch data
'''train_target = torch.tensor(train["Translation"].values.astype(np.float32))
train_data = torch.tensor(train["tokenized_sequence_pair"].values.tolist())
train_tensor = data_utils.TensorDataset(train_data, train_target) 

# Convert to pytorch data
test_target = torch.tensor(test["Translation"].values.astype(np.float32))
test_data = torch.tensor(test["tokenized_sequence_pair"].values.tolist())
test_tensor = data_utils.TensorDataset(test_data, test_target) '''

train_dataset = Torch_dataset(train)
test_dataset = Torch_dataset(test)

In [75]:
test_dataset.__len__()

4000

## Train Model

In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [6]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

trainer.train()

  del sys.path[0]


Step,Training Loss
10,0.704225
20,0.688133
30,0.703396
40,0.707666
50,0.698685
60,0.697302
70,0.690036
80,0.661416
90,0.625328
100,0.637036


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


TrainOutput(global_step=2000, training_loss=0.19823226928710938)

## Evaluate Model/Predict

In [12]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    eval_dataset=test_dataset,             # evaluation dataset
    compute_metrics=compute_metrics,
    test_dataset=test_dataset
)

# Evaluate on Test Set
trainer.evaluate()

  del sys.path[0]


{'eval_accuracy': 0.98925, 'eval_loss': 0.059095073491334915}

In [21]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,             # evaluation dataset
    compute_metrics=compute_metrics,
)

# Do Prediction on Test Set
predictions = trainer.predict(test_dataset)

  del sys.path[0]


In [48]:
import pandas
pandas.set_option('display.max_colwidth', None)

def logit2prob(logit):
  odds = np.exp(logit)
  prob = odds/(1+odds)
  return prob

def prob2label(prod):
  return (prod > 0.5)

pred_logit = [pred[1] for pred in predictions.predictions]
pred_prob = logit2prob(pred_logit)
pred_label = prob2label(pred_prob)

test['prediction_prob'] = pred_prob
test['prediction'] = pred_label

In [65]:
# Show some predictions
test.head(n=10)

Unnamed: 0,text_source,text_target,Translation,prediction,prediction_prob
6333,But on balance it is impossible to agree with ...,"Da muß die Kommission beobachten, und notfalls...",0,False,0.035325
4976,"Mr President, on issues concerning the third p...","Ich begrüße es, dass die Bekämpfung des Mensch...",0,False,0.02623
4276,We are organising campaigns throughout the Eur...,Derzeit organisieren wir Kampagnen in ganz Eur...,1,True,0.98349
3423,That cannot be allowed to happen!,Das darf nicht sein!,1,True,0.983445
1795,Regarding the issue of Christianity versus Isl...,In Valencia wurden in großem Umfang Immobilien...,0,False,0.026791
6468,The two most important examples of this are th...,Die zwei wichtigsten Beispiele dafür sind die ...,1,True,0.983754
7107,My question to Council was tabled as No 3. Whe...,Meine Anfrage an den Rat hatte die Nr. 3. Als ...,1,True,0.983761
3425,"In conclusion, I would like to express my hope...","Ich will damit sagen, dass das, was ich heute ...",0,False,0.029201
9601,"Nowadays, everyone must have the necessary rea...",Unsere Bürger möchten weder mit krebserregende...,0,False,0.027699
8874,"For this frankness I am grateful, for it gives...","Ich bin dankbar für diese Offenheit, weil sie ...",1,True,0.983705


In [76]:
# Show wrong predictions
test.loc[test['Translation'] != test['prediction']]

Unnamed: 0,text_source,text_target,Translation,prediction,prediction_prob
6533,"It also confirms the fact that the issue of the price of drugs is at the heart of the debates on access to treatment and the importance of research and development and, above all, it stresses the need to focus efforts on diseases that particularly affect the South, as well as forgotten illnesses.",,1,False,0.02809
4246,"We should be encouraged by the fact that the issue is back where it belongs, that is at scientific and at veterinary level.",,1,False,0.028881
7997,Are we prepared to tolerate that for the sake of a slice of salami in our sandwich?,"Während wir hier unsere Aussprache führen, stirbt ein Pferd infolge einer solchen unwürdigen Behandlung von Tieren.",1,False,0.027997
3553,,Viertens: Die demographische Entwicklung macht uns zu schaffen.,1,False,0.033435
1603,The next item is the vote.,Als nächster Punkt folgt die Abstimmungsstunde.,1,False,0.168404
868,"I give the floor to the rapporteur, Mr Herman.",Vorschlag für einen Beschluß des Rates über die Vertretung und die Festlegung von Standpunkten der Gemeinschaft auf internationaler Ebene im Zusammenhang mit der Wirtschafts- und Währungsunion (KOM(98)0637 - C4-0638/98-00/0785(COS)). Zunächst hat Herr Herman in seiner Eigenschaft als Berichterstatter das Wort.,1,False,0.02729
5674,,"(IT) Frau Präsidentin, Herr Wathelet, Herr Kommissar, meine Damen und Herren!",1,False,0.18827
3480,"We do not wish to sanction but rather to assist. Not to assist for the sake of assisting, but to assist so that the commitments may be met.","Wir wollen keine Strafen verhängen, sondern Hilfe leisten, keine Hilfe um der Hilfe willen, sondern zur Unterstützung der Entwicklungsländer bei ihren Bemühungen um Einhaltung ihrer Verpflichtungen.",1,False,0.155421
332,The matter has been deferred.,Das Thema ist vertagt.,1,False,0.420807
6126,Another criterion should be that the young people who take part do not find themselves subjected to repressive measures on their return home.,Außerdem dürfen die Teilnehmer bei ihrer Rückkehr keinen Repressalien ausgesetzt werden.,1,False,0.039022
