In [115]:
from transformers import CamembertTokenizer, CamembertModel
import torch
import pandas as pd
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt

In [31]:
model = CamembertModel.from_pretrained('sloberta', output_hidden_states = True)
model.eval()

Some weights of the model checkpoint at sloberta were not used when initializing CamembertModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertModel were not initialized from the model checkpoint at sloberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

CamembertModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(32005, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dr

In [3]:
tokenizer = CamembertTokenizer.from_pretrained('sloberta')

In [103]:
df = pd.read_csv("all_manually_annotated.csv", index_col=0)
df.sentence1 = df.sentence1.apply(lambda x: " ".join(x.lower().split()))
df.sentence2 = df.sentence2.apply(lambda x: " ".join(x.lower().split()))
df_pos = df.loc[df["class"] == 1]
df_neg = df.loc[df["class"] == 0]
df_pos

Unnamed: 0,class,word,sentence1,sentence2
1,1,kap,ko pa zdajle ta video vidim me je pa skor kap ...,čingola da te rasvetlim bila je možganska kap ...
7,1,kap,"opis: mezgec franc, ostrovica 7, z značilno ka...",jih je blo...res... kapo dol pred njo...kot kl...
9,1,kap,silver umrl zaradi srčne kapi,"so me prosili za recept zanjo, me je skoraj ka..."
10,1,kap,univerze južna karolina predstavil raziskavo o...,z jetri ali so pred časom preživeli možgansko ...
14,1,kap,tako da smo hodili kar v anorakih in si nadeli...,"kapa nike, sport 2000, 2..999 sit"
...,...,...,...,...
558,1,vila,nekdaj zelo razvpita via veneto. še danes je r...,dediščine. hosting tega potrebuje zaradi posta...
559,1,vila,namesto penziona oliva pride vila čeranič***,"...] obširen borov gozd, posejan z vilami , km..."
560,1,vila,parc. št. 317/3 (s t.i. » vilo vogler«) ter pa...,"vložilo veliko denarja v obnovo vile, ki je po..."
561,1,vila,"ridi a... ratis), trnovlje pri vili : v štandr...",tokrat je le naredil izjemo. spal je v znameni...


In [106]:
def find_word(sent: str, word: str, encod: list):
    sent = sent.lower()
    word = word.lower()
    s_split = sent.split(" ")
    ref = []
    for i in range(len(s_split)):
        if word in s_split[i]:
            ref = tokenizer.encode(s_split[i])[1:-1]
    # print("-------------------------")
    # print(word, sent)
    # print([tokenizer._convert_id_to_token(idx) for idx in encod])
    # print([tokenizer._convert_id_to_token(idx) for idx in ref])
    t = ()
    for i in range(len(encod) - len(ref)):
        if encod[i:i+len(ref)] == ref:
            t = (i, len(ref))
    # print(t)
    return t

In [107]:
pos_diff = []
neg_diff = []

for i in df_pos.index:
    print(i)
    e1 = tokenizer.encode(df_pos["sentence1"][i])
    e2 = tokenizer.encode(df_pos["sentence2"][i])
    t1 = [tokenizer._convert_id_to_token(idx) for idx in e1]
    t2 = [tokenizer._convert_id_to_token(idx) for idx in e2]

    p1 = find_word(df_pos["sentence1"][i], df_pos["word"][i], e1)
    p2 = find_word(df_pos["sentence2"][i], df_pos["word"][i], e2)

    with torch.no_grad():
        token_ids = torch.tensor(e1).unsqueeze(0)
        vec1 = model(token_ids)[0][0][p1[0]:p1[0]+p1[1]].numpy()

        token_ids = torch.tensor(e2).unsqueeze(0)
        vec2 = model(token_ids)[0][0][p2[0]:p2[0]+p2[1]].numpy()
    m1 = np.mean(vec1, axis=0)
    m2 = np.mean(vec2, axis=0)
    res = spatial.distance.cosine(m1, m2)
    pos_diff.append(res)

for i in df_neg.index:
    print(i)
    e1 = tokenizer.encode(df_neg["sentence1"][i])
    e2 = tokenizer.encode(df_neg["sentence2"][i])
    t1 = [tokenizer._convert_id_to_token(idx) for idx in e1]
    t2 = [tokenizer._convert_id_to_token(idx) for idx in e2]

    p1 = find_word(df_neg["sentence1"][i], df_neg["word"][i], e1)
    p2 = find_word(df_neg["sentence2"][i], df_neg["word"][i], e2)

    with torch.no_grad():
        token_ids = torch.tensor(e1).unsqueeze(0)
        vec1 = model(token_ids)[0][0][p1[0]:p1[0]+p1[1]].numpy()

        token_ids = torch.tensor(e2).unsqueeze(0)
        vec2 = model(token_ids)[0][0][p2[0]:p2[0]+p2[1]].numpy()
    m1 = np.mean(vec1, axis=0)
    m2 = np.mean(vec2, axis=0)
    res = spatial.distance.cosine(m1, m2)
    neg_diff.append(res)

1
7
9
10
14
15
18
19
21
23
24
27
29
34
37
38
42
44
47
48
49
51
52
54
59
60
61
63
65
66
69
75
76
78
79
84
86
87
89
94
95
96
97
98
100
104
108
110
116
117
118
120
121
122
124
130
132
134
140
143
150
154
155
156
158
159
160
161
163
164
165
169
170
171
173
174
176
177
178
179
181
182
183
184
188
191
192
194
197
198
199
201
205
209
211
215
219
221
236
238
246
248
250
253
254
255
256
258
259
260
262
263
264
266
267
268
269
270
271
272
273
274
277
278
281
286
287
288
289
290
292
293
295
299
302
303
304
305
311
312
313
314
316
319
321
322
328
330
338
339
340
343
344
346
361
367
373
376
379
381
382
383
387
388
391
392
398
406
407
408
410
413
418
420
423
425
426
435
436
437
443
444
452
453
458
459
461
465
466
469
473
479
483
484
486
487
493
495
497
498
499
500
501
503
505
507
508
509
510
511
512
515
516
519
523
529
533
534
536
537
543
545
547
548
549
550
551
553
555
557
558
559
560
561
562
0
2
3
4
5
6
8
11
12
13
16
17
20
22
25
26
28
30
31
32
33
35
36
39
40
41
43
45
46
50
53
55
56
57
58
62
64
67


In [108]:
print(pos_diff)

[0.1984514594078064, 0.12522941827774048, 0.2790430188179016, 0.04555028676986694, 0.3045370578765869, 0.22638201713562012, 0.09362548589706421, 0.11762714385986328, 0.17041808366775513, 0.23228400945663452, 0.13727205991744995, 0.18299657106399536, 0.18083405494689941, 0.2283814549446106, 0.1679520606994629, 0.23694181442260742, 0.13129925727844238, 0.27282917499542236, 0.26738518476486206, 0.4935457706451416, 0.18435275554656982, 0.2513570785522461, 0.28291600942611694, 0.10375416278839111, 0.22308778762817383, 0.12436413764953613, 0.146839439868927, 0.09777981042861938, 0.21679949760437012, 0.07107490301132202, 0.26673680543899536, 0.2895112633705139, 0.2322484254837036, 0.3847319483757019, 0.2706790566444397, 0.22896736860275269, 0.1597813367843628, 0.1517520546913147, 0.253340482711792, 0.5861902236938477, 0.18491631746292114, 0.24220794439315796, 0.22174084186553955, 0.15714776515960693, 0.11929458379745483, 0, 0.1457226276397705, 0, 0, 0, 0, 0.050098419189453125, 0, 0, 0, 0, 0, 

In [111]:
test1 = list(filter(lambda a: a != 0, neg_diff))
test1

[0.22642028331756592,
 0.08742767572402954,
 0.3150207996368408,
 0.4057033061981201,
 0.4303714632987976,
 0.24218475818634033,
 0.26036709547042847,
 0.3097383975982666,
 0.2721598744392395,
 0.16493719816207886,
 0.30629873275756836,
 0.21964067220687866,
 0.42639774084091187,
 0.22052240371704102,
 0.5796404182910919,
 0.32620400190353394,
 0.3555590510368347,
 0.3998594284057617,
 0.5124630331993103,
 0.22470486164093018,
 0.37188881635665894,
 0.33760082721710205,
 0.2722521424293518,
 0.2094528079032898,
 0.2941167950630188,
 0.36723679304122925,
 0.38135039806365967,
 0.4375753402709961,
 0.20782411098480225,
 0.44180864095687866,
 0.5077777206897736,
 0.5036455988883972,
 0.3188434839248657,
 0.5013689696788788,
 0.29860955476760864,
 0.1792352795600891,
 0.3976130485534668,
 0.3613291382789612,
 0.21388602256774902,
 0.32297587394714355,
 0.1005294919013977,
 0.4449079632759094,
 0.2628830075263977,
 0.15967309474945068,
 0.26440006494522095,
 0.3273389935493469,
 0.351040363

In [120]:
print(np.mean(pos_diff, axis=0))
print(max(pos_diff))
print(min(pos_diff))
print()
print(np.mean(test1, axis=0))
print(max(test1))
print(min(test1))

print()
print(np.std(pos_diff, axis=0))
print(np.std(test1, axis=0))
#0.16

0.08830528310004701
0.5861902236938477
0

0.25600204226516543
0.7955758422613144
0.0805174708366394

0.11009421949562004
0.1125905883986405
