# Read data

In [1]:
# Directory holding the "*.train" corpus files; a later cell chdir's into it
# and globs for those files there.
data_path = "da-corpora"

In [2]:
import pickle
from copy import deepcopy

import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModel

In [3]:
import glob, os

# One list of raw lines per "*.train" corpus file found in `data_path`.
files = []
# NOTE: the working directory is changed on purpose; the pickle files written
# and read further down use bare file names and therefore land in this
# directory. Because `data_path` is relative, re-running this cell in the same
# kernel will fail on the second chdir.
os.chdir(data_path)
for file in glob.glob("*.train"):
    print(file)
    # Context manager closes each handle right away instead of leaking it
    # until garbage collection.
    with open(file, 'r') as handle:
        files.append(handle.readlines())

cl-JRC-Acquis.en-et.docs.train
cl-OpenSubtitles.en-et.docs.train
cl-EMEA.en-et.docs.train
cl-Europarl.en-et.docs.train


In [4]:
# Reduce files to English sentences but keep the doc id of every line.
# Each line is split on tabs: field 0 is stored as the doc id, field 1 as the
# sentence (assumes every line has at least two tab-separated fields — a line
# without a tab raises IndexError; TODO confirm against the corpus format).

sent_index = []

for f in files:
    doc_ids = []
    for i, line in enumerate(f):
        # Split once and reuse the fields; no deep copy of the corpus needed.
        fields = line.split('\t')
        doc_ids.append(fields[0])
        # `files` is rewritten in place so it now holds sentences only.
        f[i] = fields[1]
    sent_index.append(doc_ids)

In [5]:
# Flatten the per-file lists into two corpus-wide, position-aligned lists.
# NOTE: `sent_index` is rebound from nested to flat here, so this cell must
# not be executed twice in a row.
sentences = [
    sentence
    for per_file_sentences in files
    for sentence in per_file_sentences
]
sent_index = [
    doc_id
    for per_file_ids in sent_index
    for doc_id in per_file_ids
]

In [6]:
# The nested per-file lists are no longer needed once `sentences` has been
# built from them; drop the name so the outer structure can be reclaimed.
del files

In [7]:
# Sanity check: there must be exactly one doc id per sentence, since later
# cells pair the two lists by position.
assert len(sentences) == len(sent_index)

In [8]:
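# Debug shortcut: uncomment to embed only the first 100 sentences.
# `sent_index` would need the same truncation to stay aligned with the embeddings.
#sentences = sentences[0:100]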

# Embed sentences

In [9]:
# Load the tokenizer and the encoder from the same pretrained checkpoint name
# so that token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModel.from_pretrained('xlm-roberta-base')

In [15]:
def tiny_value_of_dtype(dtype: torch.dtype):
    """Return a small positive constant associated with a floating dtype.

    Args:
        dtype: A torch dtype.

    Returns:
        ``1e-13`` for ``torch.float`` / ``torch.double`` and ``1e-4`` for
        ``torch.half``.

    Raises:
        TypeError: If ``dtype`` is not a floating point dtype, or is a
            floating point dtype other than the three listed above.
    """
    if not dtype.is_floating_point:
        raise TypeError("Only supports floating point dtypes.")

    # Lookup table instead of an if/elif chain; half precision gets a much
    # larger constant than single/double precision.
    tiny_by_dtype = {
        torch.float: 1e-13,
        torch.double: 1e-13,
        torch.half: 1e-4,
    }
    if dtype not in tiny_by_dtype:
        raise TypeError("Does not support dtype " + str(dtype))
    return tiny_by_dtype[dtype]
        
def masked_mean(
    vector: torch.Tensor, mask: torch.BoolTensor, dim: int, keepdim: bool = False
) -> torch.Tensor:
    """Average ``vector`` along ``dim`` using only positions where ``mask`` is True.

    Args:
        vector: Values to average.
        mask: Boolean tensor; entries that are False are excluded from the
            mean. It is combined with ``vector`` via ``masked_fill``, so it
            has to be broadcastable against it.
        dim: Dimension to reduce.
        keepdim: Whether the reduced dimension is kept in the result.

    Returns:
        Sum of the unmasked values divided by the number of unmasked
        positions. The divisor is clamped from below to a tiny positive
        value, so a slice with no unmasked positions yields 0 instead of NaN.
    """
    # Masked-out entries contribute nothing to the sum.
    zeroed = vector.masked_fill(~mask, 0.0)
    total = zeroed.sum(dim=dim, keepdim=keepdim)

    # Number of entries that actually took part in each sum.
    kept = mask.sum(dim=dim, keepdim=keepdim).float()

    floor = tiny_value_of_dtype(torch.float)
    return total / kept.clamp(min=floor)

In [17]:
class Embedder:
    """Turns batches of raw sentences into fixed-size sentence vectors.

    Every sentence is tokenized, passed through ``model``, and represented by
    the attention-masked mean of the token vectors taken from one entry of
    the model's ``hidden_states`` output.
    """

    def __init__(self, model, tokenizer, device=None, layer=7, max_length=100):
        """Store the model/tokenizer pair and move the model to ``device``.

        Args:
            model: Model that accepts ``input_ids``, ``attention_mask``,
                ``output_hidden_states`` and ``return_dict`` and returns a
                mapping with a ``'hidden_states'`` entry.
            tokenizer: Tokenizer matching ``model``. This argument is the one
                actually stored on the instance (previously a misspelled
                parameter was ignored in favour of a global variable).
            device: Device to run on (string or ``torch.device``). Defaults
                to CUDA when available, otherwise CPU.
            layer: Index into ``hidden_states`` used for the sentence vector.
            max_length: Sentences are truncated to this many tokens.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        self.model = model.to(self.device)
        # Inference only: make sure dropout and similar layers are disabled.
        self.model.eval()

        self.tokenizer = tokenizer
        self.layer = layer
        self.max_length = max_length

    def embed_batch(self, batch):
        """Embed a list of sentences.

        Args:
            batch: List of strings.

        Returns:
            NumPy array with one row per input sentence (presumably of shape
            ``(len(batch), hidden_size)`` — confirm for the model in use).
        """
        encoded = self.tokenizer(
            batch,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=self.max_length,
        )

        with torch.no_grad():
            res = self.model(
                input_ids=encoded.input_ids.to(self.device),
                attention_mask=encoded.attention_mask.to(self.device),
                output_hidden_states=True,
                return_dict=True,
            )
            # Bring the selected hidden state back to the CPU for pooling.
            hiddens = res['hidden_states'][self.layer].cpu()

        # Padding positions (attention_mask == 0) are excluded from the mean;
        # the mask gets a trailing axis so it broadcasts over the hidden dim.
        token_mask = encoded.attention_mask.unsqueeze(2).bool()
        hiddens_sent = masked_mean(vector=hiddens, mask=token_mask, dim=1)
        return hiddens_sent.numpy()


In [18]:
# Wrap the pretrained model and tokenizer loaded above in the sentence embedder.
embedder = Embedder(model, tokenizer)

In [None]:
%%time

# Number of sentences sent through the model per forward pass.
bs = 1000

# Cut the corpus into consecutive slices of at most `bs` sentences.
chunks = []
for start in range(0, len(sentences), bs):
    chunks.append(sentences[start:start + bs])

# One embedding per sentence, in the same order as `sentences`.
sent_emb = []

for i, chunk in enumerate(chunks):
    print(f"{i} / {len(chunks)}")
    chunk_emb = embedder.embed_batch(chunk)
    # Extending with the array adds its rows one by one.
    sent_emb.extend(chunk_emb)

0 / 1912
1 / 1912
2 / 1912
3 / 1912
4 / 1912
5 / 1912
6 / 1912
7 / 1912
8 / 1912
9 / 1912
10 / 1912
11 / 1912
12 / 1912
13 / 1912
14 / 1912
15 / 1912
16 / 1912
17 / 1912
18 / 1912
19 / 1912
20 / 1912
21 / 1912
22 / 1912
23 / 1912
24 / 1912
25 / 1912
26 / 1912
27 / 1912
28 / 1912
29 / 1912
30 / 1912
31 / 1912
32 / 1912
33 / 1912
34 / 1912
35 / 1912
36 / 1912
37 / 1912
38 / 1912
39 / 1912
40 / 1912
41 / 1912
42 / 1912
43 / 1912
44 / 1912
45 / 1912
46 / 1912
47 / 1912
48 / 1912
49 / 1912
50 / 1912
51 / 1912
52 / 1912
53 / 1912
54 / 1912
55 / 1912
56 / 1912
57 / 1912
58 / 1912
59 / 1912
60 / 1912
61 / 1912
62 / 1912
63 / 1912
64 / 1912
65 / 1912
66 / 1912
67 / 1912
68 / 1912
69 / 1912
70 / 1912
71 / 1912
72 / 1912
73 / 1912
74 / 1912
75 / 1912
76 / 1912
77 / 1912
78 / 1912
79 / 1912
80 / 1912
81 / 1912
82 / 1912
83 / 1912
84 / 1912
85 / 1912
86 / 1912
87 / 1912
88 / 1912
89 / 1912
90 / 1912
91 / 1912
92 / 1912
93 / 1912
94 / 1912
95 / 1912
96 / 1912
97 / 1912
98 / 1912
99 / 1912
100 / 1912

In [23]:
# Persist sentence embeddings and their doc ids. The context managers make
# sure each file is flushed and closed as soon as it has been written.
with open("sent_emb.pkl", 'wb') as fh:
    pickle.dump(sent_emb, fh)
with open("sent_index.pkl", 'wb') as fh:
    pickle.dump(sent_index, fh)

In [24]:
# Reload the cached sentence embeddings and doc ids.
# NOTE: unpickling can execute arbitrary code — only load files that this
# notebook produced itself.
with open("sent_emb.pkl", 'rb') as fh:
    sent_emb = pickle.load(fh)
with open("sent_index.pkl", 'rb') as fh:
    sent_index = pickle.load(fh)

# Embed docs

In [25]:
# Maps a doc id to the list of sentence embeddings belonging to that document
# (filled by the following cells).
doc2embs = {}

In [26]:
# Give every doc id its own fresh, empty list of sentence embeddings.
doc2embs.update((doc_id, []) for doc_id in sent_index)

In [27]:
# Route each sentence embedding to its document; `sent_index` and `sent_emb`
# are paired by position. Re-running this cell appends every embedding again.
for pos in range(len(sent_index)):
    doc2embs[sent_index[pos]].append(sent_emb[pos])

In [28]:
# A document's embedding is the element-wise mean (along axis 0) of the
# sentence embeddings collected for it.
doc2emb = {
    doc_id: np.mean(embs, axis=0)
    for doc_id, embs in doc2embs.items()
}

In [29]:
# Document embeddings as a plain list, in the dict's iteration order.
doc_emb = list(doc2emb.values())

In [30]:
# Doc ids taken from the same dict, so position k here corresponds to
# position k of the values of `doc2emb`.
doc_index = list(doc2emb.keys())

In [31]:
# Persist document embeddings and their ids. The context managers make sure
# each file is flushed and closed as soon as it has been written.
with open("doc_emb.pkl", 'wb') as fh:
    pickle.dump(doc_emb, fh)
with open("doc_index.pkl", 'wb') as fh:
    pickle.dump(doc_index, fh)

In [32]:
# Reload the cached document embeddings and ids.
# NOTE: unpickling can execute arbitrary code — only load files that this
# notebook produced itself.
with open("doc_emb.pkl", 'rb') as fh:
    doc_emb = pickle.load(fh)
with open("doc_index.pkl", 'rb') as fh:
    doc_index = pickle.load(fh)