# Fast text - TripleM

## Load Libraries

In [57]:
import zipfile
import gensim
import numpy as np
import json
import random
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralCoclustering
from scipy.stats import gaussian_kde
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering, SpectralClustering, DBSCAN
import torch
from transformers import CamembertTokenizer, CamembertModel


## Load sloBERTa 2.0 model

In [60]:
tokenizer = CamembertTokenizer.from_pretrained("src/pretrained models/BERT/sloBERTa2/sloberta.2.0.transformers")
model = CamembertModel.from_pretrained("src/pretrained models/BERT/sloBERTa2/sloberta.2.0.transformers", output_hidden_states = True)
model.eval()

Some weights of the model checkpoint at /mydrive/sloBERTa2/sloberta.2.0.transformers were not used when initializing CamembertModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertModel were not initialized from the model checkpoint at /mydrive/sloBERTa2/sloberta.2.0.transformers and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably T

CamembertModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(32005, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dr

## Load data

In [None]:
with open('/mydrive/data.json') as json_file:
    data = json.load(json_file)
with open('/mydrive/data_lema.json') as json_file:
    data_lema = json.load(json_file)
with open('/mydrive/data_pos.json') as json_file:
    data_len = json.load(json_file)

words = np.load("/mydrive/words.npy")

## Prepare sentences

In [63]:
min_number_of_words = 8
min_neighbor_distance = 6

In [188]:
number_of_sentences_to_use = 3000
our_word = "golf"

for keyword in [our_word]:
    all_sentences2=data[keyword][:min(number_of_sentences_to_use, len(data[keyword]))]
    all_sentences_lema2=data_lema[keyword][:min(number_of_sentences_to_use, len(data[keyword]))]

    all_sentences = []
    all_sentences_lema = []
    for sentence, sentence_lema in zip(all_sentences2, all_sentences_lema2):
      if len(sentence) >= min_number_of_words:
        if sentence_lema not in all_sentences_lema:
          all_sentences.append(sentence)
          all_sentences_lema.append(sentence_lema)



    print(len(all_sentences))
    
    all_embeddings=np.zeros((len(all_sentences),768))
    #all_embeddings = []
    
    for i in range(len(all_sentences)): #iterate through the sentences for the given keyword
        sentence = ' '.join(all_sentences_lema[i])
        keyword_position = all_sentences_lema[i].index(keyword)

        marked_text = "[CLS] " + sentence + " [SEP]"

        tokenized_text = tokenizer.tokenize(marked_text)
        segments_ids = [1] * len(tokenized_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        with torch.no_grad():
          outputs = model(tokens_tensor, segments_tensors)
          hidden_states = outputs[2]
          token_embeddings = torch.stack(hidden_states, dim=0)
          token_embeddings = torch.squeeze(token_embeddings, dim=1)
          token_embeddings = token_embeddings.permute(1,0,2)

          token_vecs_sum = []

          for token in token_embeddings:
              sum_vec = torch.sum(token[-4:], dim=0)
              
              token_vecs_sum.append(sum_vec)

          all_embeddings[i,:]=token_vecs_sum[keyword_position].numpy()
          
    
    break #this break means that we terminate on the first word
    
#~np.all(all_embeddings == 0, axis=1) this checks if there are any only-zero rows meaning that we did not have any word embedding for that sentence

2064


In [189]:
#Some utility functions which are used
def cos_similarity(x, y):
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def remove_diplicate_sentences(a):
    b_set = set(map(tuple,a))  #need to convert the inner lists to tuples so they are hashable
    b = list(map(list,b_set)) #Now convert tuples back into lists (maybe unnecessary?)
    return b

In [190]:
distances={}
for i in range(all_embeddings.shape[0]):
    for j in range(i+1,all_embeddings.shape[0],1):
        distances[str(i)+'-'+str(j)] = cos_similarity(all_embeddings[i,:], all_embeddings[j,:])

In [191]:
distances = dict(sorted(distances.items(), key=lambda x:x[1]))

## Analyze worst 50 distances

In [192]:
distances_keys = list(distances.keys())
already_were = [] #to je zato ker se stavke ponavljajo in ne zanimajo nas iste kombinacije znova
count=0
for key in distances_keys[0:20]:
    print(distances[key])
    first, second = map(int, key.split('-'))
    first_sent, second_sent = ' '.join(all_sentences[first]), ' '.join(all_sentences[second])
    
    if first_sent+second_sent in already_were:
        continue
    
    count+=1
    already_were.append(first_sent+second_sent)
    print(first_sent+'\n'+second_sent)
    print('-----')
    if count==20:
        break

-0.026531199134910272
7.00 Atletika Svetovno prvenstvo Atene Grčija prenos 9.30 Tenis ATP turnir Cincinnati ZDA 11.00 Cart indycar Lexington Ohio ZDA 12.00 Motociklizem Euro Open prenos 14.00 Atletika Svetovno prvenstvo Atene Grčija 15.00 Golf prenos 16.30 Atletika Svetovno prvenstvo Atene Grčija prenos 20.00 Touring Car 20.30 Cart indycar Lexington Ohio ZDA 21.00 Cart indycar Lexington Ohio ZDA prenos dirke 23.00 Atletika Svetovno prvenstvo Atene Grčija 0.00 Tenis ATP turnir Cincinnati ZDA 2.00 Konec
GOLF letnik 1979 dobro ohranjen prodam za 144.000 SIT # 041 652 056
-----
-0.0245271390703069
V noči na minuli ponedeljek je bil s parkirišča v Ul. V. prekomorske brigade v Celju ukraden osebni avtomobil znamke VW Golf serije III kovinsko sive barve reg. Št. CE V0 800
GOLF letnik 1979 dobro ohranjen prodam za 144.000 SIT # 041 652 056
-----
-0.024135835647854022
GOLF letnik 1979 dobro ohranjen prodam za 144.000 SIT # 041 652 056
Najprej v audija 35 letnega državljana Bosne in Hercegovine 

In [193]:
distances = dict(sorted(distances.items(), key=lambda x:x[1]))

In [194]:
distances_keys = list(distances.keys())
already_were = [] #to je zato ker se stavke ponavljajo in ne zanimajo nas iste kombinacije znova
count=0
for key in distances_keys[0:20]:
    print(distances[key])
    first, second = map(int, key.split('-'))
    first_sent, second_sent = ' '.join(all_sentences[first]), ' '.join(all_sentences[second])
    
    if first_sent+second_sent in already_were:
        continue
    
    count+=1
    already_were.append(first_sent+second_sent)
    print(first_sent+'\n'+second_sent)
    print('-----')
    if count==20:
        break

-0.026531199134910272
7.00 Atletika Svetovno prvenstvo Atene Grčija prenos 9.30 Tenis ATP turnir Cincinnati ZDA 11.00 Cart indycar Lexington Ohio ZDA 12.00 Motociklizem Euro Open prenos 14.00 Atletika Svetovno prvenstvo Atene Grčija 15.00 Golf prenos 16.30 Atletika Svetovno prvenstvo Atene Grčija prenos 20.00 Touring Car 20.30 Cart indycar Lexington Ohio ZDA 21.00 Cart indycar Lexington Ohio ZDA prenos dirke 23.00 Atletika Svetovno prvenstvo Atene Grčija 0.00 Tenis ATP turnir Cincinnati ZDA 2.00 Konec
GOLF letnik 1979 dobro ohranjen prodam za 144.000 SIT # 041 652 056
-----
-0.0245271390703069
V noči na minuli ponedeljek je bil s parkirišča v Ul. V. prekomorske brigade v Celju ukraden osebni avtomobil znamke VW Golf serije III kovinsko sive barve reg. Št. CE V0 800
GOLF letnik 1979 dobro ohranjen prodam za 144.000 SIT # 041 652 056
-----
-0.024135835647854022
GOLF letnik 1979 dobro ohranjen prodam za 144.000 SIT # 041 652 056
Najprej v audija 35 letnega državljana Bosne in Hercegovine 

## Analyze best 50 distances

In [195]:
distances_keys = list(distances.keys())
already_were = [] #to je zato ker se stavke ponavljajo in ne zanimajo nas iste kombinacije znova
count=0
for key in distances_keys[-20:]:
    print(distances[key])
    first, second = map(int, key.split('-'))
    first_sent, second_sent = ' '.join(all_sentences[first]), ' '.join(all_sentences[second])
    if first_sent+second_sent in already_were:
        continue
    
    count+=1
    already_were.append(first_sent+second_sent)
    print(first_sent+'\n'+second_sent)
    print('-----')
    if count==20:
        break

0.9994905229022459
GOLF JX D letnik 1990 registriran do 9 2000 5V bel dobro ohranjen prodam # 068 73 105
GOLF 1.4 CL letnik 1993 42.000 km 4 vrata rdeč ohranjen prodam # 068 44 784
-----
0.9995049472332599
GOLF III letnik 1992 1.8 bencin prodam # 068 76 488
GOLF D II 5V prodam # 031 791 479
-----
0.9995055147274595
GOLF JX D letnik 1990 registriran do 9 2000 5V bel dobro ohranjen prodam # 068 73 105
GOLF JX D letnik 1991 kovinsko temno sive barve 5V 5 prestav dobro ohranjen prodam + 068 22 742
-----
0.9995302171876991
GOLF III letnik 1992 1.8 bencin prodam # 068 76 488
GOLF JXD letnik 1989 prodam # 068 84 035
-----
0.9995509752950834
GOLF letnik 1994 1.3 prodam # 07 3083650
GOLF D letnik 1989 prodam # 07 49 78 006
-----
0.9995511690905442
GOLF B letnik 1987 registriran do 12 98 rdeč prodam # 068 74 020
GOLF JXD letnik 1986 moder registriran do marca 2000 prodam # 068 50 077
-----
0.9995524176361613
GOLF D II 5V prodam # 031 791 479
GOLF 1.3 letnik 1986 5V prodam # 068 81 148
-----
0.99

## Construct similarity distance matrix

In [196]:
#print(all_embeddings.shape)
distance_matrix=(pairwise_distances(all_embeddings,metric="cosine"))
similarity_matrix=1-distance_matrix


#density plot
similarity_matrix_flatten = similarity_matrix.reshape(len(similarity_matrix)**2)
density = gaussian_kde(similarity_matrix_flatten)
density.covariance_factor = lambda : .5
len(similarity_matrix_flatten)
#plt.plot(similarity_matrix_flatten, density(similarity_matrix_flatten))
#plt.show()


4260096

In [197]:
np.max(distance_matrix)

1.0265311991349102

In [198]:
np.min(similarity_matrix)

-0.0265311991349102

In [199]:
best_distances_indices=[]

best_scores_indices=[]
for i in list(distances.keys()):
    first, second = i.split('-')
    
    best_scores_indices.append(first)
    best_scores_indices.append(second)


best_scores_indices=(np.unique(best_scores_indices))
best_scores_indices = best_scores_indices.astype(int)
all_embeddings_selected = all_embeddings[best_scores_indices, :]

distance_matrix=(pairwise_distances(all_embeddings_selected,metric="cosine"))

In [200]:
all_sentences_selected = []
for i in best_scores_indices:
    all_sentences_selected.append(all_sentences[i])

## Clustering

In [201]:
clusters = AgglomerativeClustering(affinity='precomputed', linkage='complete', n_clusters=2).fit(distance_matrix)
print(Counter(clusters.labels_))

Counter({0: 1862, 1: 202})


In [202]:

import pandas as pd
dataset = np.hstack((all_embeddings_selected, np.transpose([clusters.labels_])))
columns = list(range(dataset.shape[1]))
columns[-1] = "y"
df = pd.DataFrame(dataset, columns = columns)
df.to_csv(f"/mydrive/results/{our_word}_hierarhical_sloBERTa.csv", index = False)


In [203]:
for i,sent in enumerate(all_sentences):
  print(i,":", ' '.join(sent))

0 : JESENICE Na letošnjem dobrodelnem mednarodnem turnirju v golfu na Bledu so se člani Lions clubs International Bled odločili za zbiranje denarja v človekoljubne namene
1 : Preklopni mobilnik vsebuje igri ZioGolf golf in Metalion strelska s podporo igram v treh dimenzijah pa zagotavlja igralno izkušnjo ki je v mobilnikih za zdaj sila redka
2 : Ti gospodje igrajo zdaj golf ali se vozijo z jahtami po morju
3 : Potem so pri Renaultu ugotovili da bi bilo pametno narediti tudi manjši enoprostorski avtomobil na osnovi najbolj prodajanega razreda v Evropi golf astra megane in prav na osnovi megana so naredili scenic
4 : Poleg turana in do neke mere tudi cadyja life je pred kratkim poslal na trg še golf plus ki je za nekaj centimetrov višji in večji kot golf
5 : Prav zato ker Amonovi vedo da želi sodoben turist ob vinski tradiciji in kulinaričnih užitkih še dodatna doživetja se ob njihovi kleti razprostira razgibano igrišče za golf
6 : O njem se ve le malo baje ima rad golf in stare pisalne 

In [204]:
for i,label in enumerate(clusters.labels_):
    if label==1:
        print(i,':', ' '.join(all_sentences_selected[i]))

12 : Golf je že leta pri Slovencih zelo priljubljen tudi zato ker ga lahko uporablja družina navsezadnje meri v dolžino zajetnih 4,2 metra
30 : Golf za telo in dušo Lilijana Kenda lastnica galerije Lala pravi da je golf odlična rekreacija
49 : Golf je ena največjih dejavnosti v turistični industriji na svetu
53 : GOLF letnik 1989 prodam # 031 641 549
67 : Golf Istra projekt golfskih igrišč v Sečovljah izvaja pod blagovno znamko Olazabal ocenjen pa je na 10,5 milijonov evrov
69 : GOLF JX D letnik 1990 registriran do 9 2000 5V bel dobro ohranjen prodam # 068 73 105
71 : Golf danes stane toliko kot pred dvajsetimi leti vsebuje pa bistveno več elektronike
99 : Golf v Sloveniji še zmeraj velja za neke vrste buržuazni šport
112 : Golf je vse bolj razširjeno družabno športna panoga s katero se ukvarjajo predvsem petičneži
131 : Golf ima korenine na Škotskem ali bolje rečeno Škotska je dala svetu golf
144 : Golf je ob novem rojstvu znova zrasel za 57 milimetrov v dolžino 24 v širino in 39 v vi