In [1]:
import glob, os
import pickle
from copy import deepcopy

import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModel

# Read data

In [2]:
# Directory holding the "*-cl" corpus files and the pickled k-means models.
# Defaults to the original location, but can be redirected with the
# DA_CORPORA_DIR environment variable so the notebook runs on other machines.
data_path = os.environ.get("DA_CORPORA_DIR", "/home/maksym/da-corpora")
# Every later cell uses relative paths, so switch the working directory once.
os.chdir(data_path)

In [3]:
# Read every "*-cl" file of the working directory into memory.
# files[k] is the list of raw lines of the k-th file, in glob order.
files = []
for file in glob.glob("*-cl"):
    print(file)
    # The context manager closes the handle as soon as the lines are read,
    # instead of leaving it open until garbage collection.
    with open(file, 'r') as handle:
        files.append(handle.readlines())

cl-Europarl.en-et.docs.test-cl
cl-EMEA.en-et.docs.test-cl
cl-Europarl.en-et.docs.dev-cl
cl-JRC-Acquis.en-et.docs.dev-cl
cl-EMEA.en-et.docs.dev-cl
cl-OpenSubtitles.en-et.docs.dev-cl
cl-OpenSubtitles.en-et.docs.test-cl
cl-JRC-Acquis.en-et.docs.test-cl


In [4]:
# Reduce files to English sentences but keep the doc ids:
# column 0 of every tab-separated line goes to sent_index,
# column 1 replaces the line inside files.
sent_index = []
for f in files:
    columns = [line.split('\t') for line in f]
    sent_index.append([cols[0] for cols in columns])
    # Slice assignment keeps the same list objects inside `files`.
    f[:] = [cols[1] for cols in columns]

# Embed

In [5]:
# Load the encoder and its tokenizer from the same pretrained checkpoint.
checkpoint = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [6]:
# same class as in the embed notebook
class Embedder:
    """Turn batches of sentences into fixed-size vectors.

    Each sentence vector is the mean over axis 1 of entry 7 of the
    encoder's ``hidden_states``. The model is moved to the GPU on
    construction and inputs are moved there on every call, so a CUDA
    device is required.
    """

    def __init__(self, model, tokeinzer):
        """Store the encoder and its tokenizer.

        Args:
            model: encoder exposing ``.cuda()`` and callable as
                ``model(input_ids, output_hidden_states=True, return_dict=True)``.
            tokeinzer: tokenizer callable on a list of strings. The
                misspelled parameter name is kept so that existing
                callers are not broken.
        """
        self.model = model.cuda()
        # Use the argument that was passed in. The previous code read a
        # global named `tokenizer` here and silently ignored the parameter,
        # which only worked when such a global happened to exist.
        self.tokenizer = tokeinzer

    def embed_batch(self, batch):
        """Embed a list of sentences.

        Args:
            batch: list of strings; inputs are truncated to 100 tokens and
                padded to the longest sequence of the batch.

        Returns:
            A numpy array with one row per sentence.
        """
        input_ids = self.tokenizer(batch,
                                   return_tensors='pt',
                                   truncation=True,
                                   padding=True,
                                   max_length=100)['input_ids']
        # NOTE(review): only input_ids are forwarded (no attention mask), so
        # padding positions presumably take part in the mean below. Left
        # unchanged on purpose: the pre-fitted k-means models loaded later in
        # this notebook were presumably trained on embeddings produced exactly
        # this way -- confirm before changing.
        with torch.no_grad():
            res = self.model(input_ids.cuda(), output_hidden_states=True, return_dict=True)
        return res['hidden_states'][7].mean(1).cpu().detach().numpy()


In [7]:
# Build the embedder; its constructor moves the model to the GPU via model.cuda().
embedder = Embedder(model, tokenizer)

In [8]:
%%time

# Embed every file in batches of `bs` sentences.
# files_emb[k] ends up holding one vector per sentence of files[k].
bs = 1000
files_emb = []
for sentences in files:
    batch_starts = range(0, len(sentences), bs)
    n_batches = len(batch_starts)

    vectors = []
    for i, start in enumerate(batch_starts):
        print(f"{i} / {n_batches}")
        vectors.extend(embedder.embed_batch(sentences[start:start + bs]))

    files_emb.append(vectors)

0 / 4
1 / 4
2 / 4
3 / 4
0 / 4
1 / 4
2 / 4
3 / 4
0 / 4
1 / 4
2 / 4
3 / 4
0 / 4
1 / 4
2 / 4
3 / 4
0 / 4
1 / 4
2 / 4
3 / 4
0 / 4
1 / 4
2 / 4
3 / 4
0 / 4
1 / 4
2 / 4
3 / 4
0 / 4
1 / 4
2 / 4
3 / 4
CPU times: user 48.1 s, sys: 14.4 s, total: 1min 2s
Wall time: 1min 2s


# Cluster

### Sent

In [9]:
# Load the pre-fitted sentence-level clustering models (4, 16 and 64
# clusters, judging by the file names). Each file is opened in a context
# manager so its handle is closed right after unpickling.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("kmeans_sent_4.pkl", 'rb') as fh:
    kmeans_sent_4 = pickle.load(fh)
with open("kmeans_sent_16.pkl", 'rb') as fh:
    kmeans_sent_16 = pickle.load(fh)
with open("kmeans_sent_64.pkl", 'rb') as fh:
    kmeans_sent_64 = pickle.load(fh)

In [10]:
# Corpus file names in glob order. They are zipped with files_emb later,
# so this listing has to come back in the same order as when the files
# were read into `files`.
fnames = glob.glob("*-cl")

In [11]:
# Show the names: the next cell zips them with files_emb, so their order
# must match the order in which the files were read.
fnames

['cl-Europarl.en-et.docs.test-cl',
 'cl-EMEA.en-et.docs.test-cl',
 'cl-Europarl.en-et.docs.dev-cl',
 'cl-JRC-Acquis.en-et.docs.dev-cl',
 'cl-EMEA.en-et.docs.dev-cl',
 'cl-OpenSubtitles.en-et.docs.dev-cl',
 'cl-OpenSubtitles.en-et.docs.test-cl',
 'cl-JRC-Acquis.en-et.docs.test-cl']

In [12]:
# For every corpus file, run each sentence-level model's predict() on the
# file's sentence embeddings and write one label per line to
# "<corpus file>.clusters_sent_<k>.txt".
# zip() pairs embeddings with names purely by position, so files_emb and
# fnames must be in the same (glob) order.
sent_models = ((4, kmeans_sent_4), (16, kmeans_sent_16), (64, kmeans_sent_64))

for fe, fn in zip(files_emb, fnames):
    for n_clusters, kmeans in sent_models:
        with open(f'{fn}.clusters_sent_{n_clusters}.txt', 'w') as out:
            out.writelines("%s\n" % label for label in kmeans.predict(fe))
    


### Doc

In [13]:
# Load the pre-fitted document-level clustering models (4, 16 and 64
# clusters, judging by the file names). Each file is opened in a context
# manager so its handle is closed right after unpickling.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("kmeans_doc_4.pkl", 'rb') as fh:
    kmeans_doc_4 = pickle.load(fh)
with open("kmeans_doc_16.pkl", 'rb') as fh:
    kmeans_doc_16 = pickle.load(fh)
with open("kmeans_doc_64.pkl", 'rb') as fh:
    kmeans_doc_64 = pickle.load(fh)

In [14]:
# Re-collect the corpus file names in glob order. They are zipped with
# files_emb further down, so the order has to match the order in which the
# files were read into `files`.
fnames = glob.glob("*-cl")

In [15]:
# Show the names: they are zipped with files_emb further down, so their
# order must match the order in which the files were read.
fnames

['cl-Europarl.en-et.docs.test-cl',
 'cl-EMEA.en-et.docs.test-cl',
 'cl-Europarl.en-et.docs.dev-cl',
 'cl-JRC-Acquis.en-et.docs.dev-cl',
 'cl-EMEA.en-et.docs.dev-cl',
 'cl-OpenSubtitles.en-et.docs.dev-cl',
 'cl-OpenSubtitles.en-et.docs.test-cl',
 'cl-JRC-Acquis.en-et.docs.test-cl']

In [16]:
# sent_index holds one list of doc ids per corpus file, so this is the number of files.
len(sent_index)

8

In [20]:
from collections import defaultdict 

In [21]:
# Group the sentence vectors of each file by document id.
# doc2embs_list[k] maps doc id -> list of the vectors of that doc's sentences.
doc2embs_list = []

for sent_ids, sent_embs in zip(sent_index, files_emb):
    vectors_by_doc = defaultdict(list)
    for row, doc_id in enumerate(sent_ids):
        # Row `row` of the embeddings belongs to the sentence on line `row`.
        vectors_by_doc[doc_id].append(sent_embs[row])
    doc2embs_list.append(vectors_by_doc)

In [37]:
# Collapse each document's list of sentence vectors into a single mean vector
# (the dictionaries are updated in place).
for d2embs in doc2embs_list:
    for doc_id, embs in d2embs.items():
        # Only values that are still lists have not been averaged yet.
        # Skipping everything else makes the cell safe to re-run: averaging an
        # already averaged vector over axis 0 would reduce it to a scalar.
        if isinstance(embs, list):
            d2embs[doc_id] = np.mean(embs, 0)

In [38]:
# Give every sentence line the embedding stored for its document.
# NOTE: this rebinds files_emb -- from here on it holds the per-document
# values from doc2embs_list, no longer the sentence-level vectors.
files_emb = [
    [doc2embs_list[i][doc_id] for doc_id in doc_ids]
    for i, doc_ids in enumerate(sent_index)
]
        

In [40]:
# For every corpus file, run each document-level model's predict() on the
# file's embeddings and write one label per line to
# "<corpus file>.clusters_doc_<k>.txt".
# zip() pairs embeddings with names purely by position, so files_emb and
# fnames must be in the same (glob) order.
doc_models = ((4, kmeans_doc_4), (16, kmeans_doc_16), (64, kmeans_doc_64))

for fe, fn in zip(files_emb, fnames):
    for n_clusters, kmeans in doc_models:
        with open(f'{fn}.clusters_doc_{n_clusters}.txt', 'w') as out:
            out.writelines("%s\n" % label for label in kmeans.predict(fe))
    


In [42]:
from collections import Counter

In [45]:
# Per corpus file, print how many entries the 4-cluster document-level
# model assigns to each label.
for fe, fn in zip(files_emb, fnames):
    print(fn)
    label_counts = Counter(kmeans_doc_4.predict(fe))
    print(sorted(label_counts.items()))
    

cl-Europarl.en-et.docs.test-cl
[(0, 11), (1, 9), (2, 3066), (3, 21)]
cl-EMEA.en-et.docs.test-cl
[(0, 683), (1, 2532), (2, 46), (3, 54)]
cl-Europarl.en-et.docs.dev-cl
[(2, 3716)]
cl-JRC-Acquis.en-et.docs.dev-cl
[(0, 1506), (1, 168), (2, 1326), (3, 5)]
cl-EMEA.en-et.docs.dev-cl
[(0, 1616), (1, 1592), (2, 120), (3, 20)]
cl-OpenSubtitles.en-et.docs.dev-cl
[(3, 3044)]
cl-OpenSubtitles.en-et.docs.test-cl
[(3, 3085)]
cl-JRC-Acquis.en-et.docs.test-cl
[(0, 924), (1, 58), (2, 2208)]
