# sloBERTa - TripleM

## Load Libraries

In [43]:
import zipfile
import gensim
import numpy as np
import json
import random
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralCoclustering
from scipy.stats import gaussian_kde
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering, SpectralClustering, DBSCAN
import torch
from transformers import CamembertTokenizer, CamembertModel


## Load sloBERTa 2.0 model

In [None]:
# Local directory holding the sloBERTa 2.0 checkpoint in transformers format;
# the tokenizer and the model are both read from it.
sloberta_dir = "src/pretrained models/BERT/sloBERTa2/sloberta.2.0.transformers"

tokenizer = CamembertTokenizer.from_pretrained(sloberta_dir)
# output_hidden_states=True makes the model return the hidden states of every
# layer, which the embedding extraction further down reads via outputs[2].
model = CamembertModel.from_pretrained(sloberta_dir, output_hidden_states=True)
# Switch to inference mode (disables dropout).
model.eval()

## Load data

In [None]:
# Load the corpus. Each JSON file is opened explicitly as UTF-8 so that the
# decoding of non-ASCII (e.g. Slovenian) characters does not depend on the
# platform's default encoding.
# `data` and `data_lema` are indexed by keyword further down and are expected
# to hold, per keyword, parallel lists of sentences (surface forms / lemmas).
with open('src/corpus/data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
with open('src/corpus/data_lema.json', encoding='utf-8') as json_file:
    data_lema = json.load(json_file)
# NOTE(review): this is read from data_pos.json but bound to `data_len`;
# the name is kept unchanged because other cells may reference it -- confirm
# whether `data_pos` was intended.
with open('src/corpus/data_pos.json', encoding='utf-8') as json_file:
    data_len = json.load(json_file)

words = np.load("src/corpus/words.npy")

## Prepare sentences

In [None]:
# Sentences with fewer than this many words are skipped when the sentences
# for the keyword are prepared.
min_number_of_words = 8
# NOTE(review): not referenced by any of the cells visible here -- presumably
# a threshold used by another part of the notebook; confirm or remove.
min_neighbor_distance = 6

In [None]:
number_of_sentences_to_use = 3000
our_word = "golf"


def _keyword_embedding(lemmas, keyword, tokenizer, model):
    """Return the contextual embedding of ``keyword`` inside one sentence.

    The sentence is tokenized word by word so that the sub-word pieces that
    belong to the keyword are known exactly; a word index is *not* a valid
    index into the sub-word token sequence. The tokenizer's own start/end
    special tokens are added (instead of the BERT-style "[CLS]"/"[SEP]"
    strings, which this tokenizer would split like ordinary text).

    Args:
        lemmas: list of lemma strings forming the sentence.
        keyword: lemma whose embedding is wanted; its first occurrence is used.
        tokenizer: tokenizer providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token_id`` and ``sep_token_id``.
        model: model loaded with ``output_hidden_states=True``.

    Returns:
        1-D numpy array: sum of the last four hidden layers, averaged over
        the keyword's sub-word pieces.

    Raises:
        ValueError: if ``keyword`` is not in ``lemmas`` or tokenizes to no
            pieces at all.
    """
    keyword_position = lemmas.index(keyword)

    pieces = []
    span_start = span_end = 0
    for position, word in enumerate(lemmas):
        word_pieces = tokenizer.tokenize(word)
        if position == keyword_position:
            span_start = len(pieces)
            span_end = span_start + len(word_pieces)
        pieces.extend(word_pieces)
    if span_start == span_end:
        raise ValueError(f"keyword {keyword!r} produced no tokens")

    token_ids = (
        [tokenizer.cls_token_id]
        + tokenizer.convert_tokens_to_ids(pieces)
        + [tokenizer.sep_token_id]
    )
    with torch.no_grad():
        # No attention mask is passed: for a single unpadded sentence the
        # default (attend to every token) is what we want.
        outputs = model(torch.tensor([token_ids]))
    hidden_states = outputs[2]  # one (1, seq_len, hidden) tensor per layer

    # Sum the last four layers, then drop the batch dimension.
    summed_layers = torch.stack(tuple(hidden_states[-4:]), dim=0).sum(dim=0)[0]
    # +1 skips the start-of-sequence token prepended above.
    keyword_vectors = summed_layers[span_start + 1:span_end + 1]
    return keyword_vectors.mean(dim=0).numpy()


for keyword in [our_word]:
    all_sentences2 = data[keyword][:number_of_sentences_to_use]
    all_sentences_lema2 = data_lema[keyword][:number_of_sentences_to_use]

    # Keep sentences that are long enough, actually contain the keyword lemma
    # and whose lemmatized form has not been seen yet.
    all_sentences = []
    all_sentences_lema = []
    seen_lemma_sentences = set()  # tuples, for O(1) duplicate checks
    for sentence, sentence_lema in zip(all_sentences2, all_sentences_lema2):
        if len(sentence) < min_number_of_words:
            continue
        if keyword not in sentence_lema:
            continue
        lemma_key = tuple(sentence_lema)
        if lemma_key in seen_lemma_sentences:
            continue
        seen_lemma_sentences.add(lemma_key)
        all_sentences.append(sentence)
        all_sentences_lema.append(sentence_lema)

    print(len(all_sentences))

    # One row per kept sentence; the width follows the loaded model instead
    # of a hard-coded 768.
    all_embeddings = np.zeros((len(all_sentences), model.config.hidden_size))
    for i, sentence_lema in enumerate(all_sentences_lema):
        all_embeddings[i, :] = _keyword_embedding(sentence_lema, keyword, tokenizer, model)

    break  # only the first keyword is processed
    
# ~np.all(all_embeddings == 0, axis=1) marks the rows that are NOT all zeros; an all-zero row means no keyword embedding was stored for that sentence.

In [None]:
# Utility functions used by the cells below
def cos_similarity(x, y):
    """Return the cosine similarity of the vectors ``x`` and ``y``.

    Args:
        x: 1-D array-like of numbers.
        y: 1-D array-like of numbers, same length as ``x``.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        If either vector has zero norm the similarity is undefined; 0.0 is
        returned instead of NaN so that the scores stay sortable.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        return 0.0
    return np.dot(x, y) / norm_product

def remove_diplicate_sentences(a):
    """Return the sentences of ``a`` without duplicates.

    Args:
        a: iterable of sentences, each an iterable of hashable items
            (e.g. a list of word strings).

    Returns:
        A list of lists holding each distinct sentence once, in order of
        first appearance (so the result is deterministic across runs).
    """
    # Lists are not hashable, so use tuples as dict keys; a dict keeps
    # insertion order, unlike a set.
    unique_sentences = dict.fromkeys(map(tuple, a))
    return [list(sentence) for sentence in unique_sentences]

In [None]:
# Cosine similarity for every unordered pair of sentence embeddings, keyed by
# "<row>-<col>" with row < col. Despite the name, the values are similarities.
sentence_count = all_embeddings.shape[0]
distances = {
    str(row) + '-' + str(col): cos_similarity(all_embeddings[row, :], all_embeddings[col, :])
    for row in range(sentence_count)
    for col in range(row + 1, sentence_count)
}

In [None]:
# Rebuild the dict ordered by ascending score (dicts preserve insertion
# order), so the first keys are the least similar pairs.
pairs_by_score = sorted(distances.items(), key=lambda item: item[1])
distances = dict(pairs_by_score)

## Analyze the 20 least similar sentence pairs

In [None]:
# Show the sentence pairs with the lowest scores. `distances` is expected to
# be ordered by ascending score already (see the sorting cell before this one).
max_pairs_to_show = 20
distances_keys = list(distances.keys())
# Sentences repeat in the corpus and the same combination of texts is not
# interesting twice, so remember which text pairs were already shown.
already_were = set()
count = 0
for key in distances_keys:
    # Walk all keys (not just a fixed slice) so that skipped duplicates do
    # not reduce the number of pairs shown.
    if count == max_pairs_to_show:
        break
    first, second = map(int, key.split('-'))
    first_sent, second_sent = ' '.join(all_sentences[first]), ' '.join(all_sentences[second])

    # A tuple keeps the two texts apart; plain concatenation could make two
    # different pairs look identical.
    sentence_pair = (first_sent, second_sent)
    if sentence_pair in already_were:
        continue

    count += 1
    already_were.add(sentence_pair)
    # Print the score only for pairs that are actually shown.
    print(distances[key])
    print(first_sent+'\n'+second_sent)
    print('-----')

In [None]:
# Order the dict by ascending score (a repeat of the earlier sorting cell;
# harmless if `distances` is already sorted).
pairs_by_score = sorted(distances.items(), key=lambda item: item[1])
distances = dict(pairs_by_score)

In [None]:
# Show the sentence pairs with the lowest scores. `distances` is expected to
# be ordered by ascending score already (see the sorting cell before this one).
max_pairs_to_show = 20
distances_keys = list(distances.keys())
# Sentences repeat in the corpus and the same combination of texts is not
# interesting twice, so remember which text pairs were already shown.
already_were = set()
count = 0
for key in distances_keys:
    # Walk all keys (not just a fixed slice) so that skipped duplicates do
    # not reduce the number of pairs shown.
    if count == max_pairs_to_show:
        break
    first, second = map(int, key.split('-'))
    first_sent, second_sent = ' '.join(all_sentences[first]), ' '.join(all_sentences[second])

    # A tuple keeps the two texts apart; plain concatenation could make two
    # different pairs look identical.
    sentence_pair = (first_sent, second_sent)
    if sentence_pair in already_were:
        continue

    count += 1
    already_were.add(sentence_pair)
    # Print the score only for pairs that are actually shown.
    print(distances[key])
    print(first_sent+'\n'+second_sent)
    print('-----')

## Analyze the 20 most similar sentence pairs

In [None]:
# Show the sentence pairs with the highest scores. `distances` is expected to
# be ordered by ascending score, so the keys are walked from the end; the
# output therefore starts with the most similar pair.
max_pairs_to_show = 20
distances_keys = list(distances.keys())
# Sentences repeat in the corpus and the same combination of texts is not
# interesting twice, so remember which text pairs were already shown.
already_were = set()
count = 0
for key in reversed(distances_keys):
    # Walk all keys (not just a fixed slice) so that skipped duplicates do
    # not reduce the number of pairs shown.
    if count == max_pairs_to_show:
        break
    first, second = map(int, key.split('-'))
    first_sent, second_sent = ' '.join(all_sentences[first]), ' '.join(all_sentences[second])

    # A tuple keeps the two texts apart; plain concatenation could make two
    # different pairs look identical.
    sentence_pair = (first_sent, second_sent)
    if sentence_pair in already_were:
        continue

    count += 1
    already_were.add(sentence_pair)
    # Print the score only for pairs that are actually shown.
    print(distances[key])
    print(first_sent+'\n'+second_sent)
    print('-----')

## Construct the distance and similarity matrices

In [None]:
#print(all_embeddings.shape)
# Pairwise cosine distances between all sentence embeddings, and the matching
# similarities (similarity = 1 - distance).
distance_matrix = pairwise_distances(all_embeddings, metric="cosine")
similarity_matrix = 1 - distance_matrix


#density plot
# Every entry of the square matrix as one flat sample (diagonal and both
# symmetric halves included).
similarity_matrix_flatten = similarity_matrix.ravel()
# The bandwidth factor is passed at construction time: assigning
# `covariance_factor` afterwards has no effect unless the covariance is
# recomputed, so the intended factor of 0.5 was silently ignored.
density = gaussian_kde(similarity_matrix_flatten, bw_method=0.5)
len(similarity_matrix_flatten)
#plt.plot(similarity_matrix_flatten, density(similarity_matrix_flatten))
#plt.show()


In [None]:
# Largest pairwise cosine distance (displayed as the cell result).
distance_matrix.max()

In [None]:
# Smallest pairwise cosine similarity (displayed as the cell result).
similarity_matrix.min()

In [None]:
best_distances_indices = []

# Collect every sentence index that occurs in a key of `distances`.
# The indices are converted to int *before* de-duplicating and sorting:
# sorting them as strings would order them lexicographically
# ('0', '1', '10', '100', ...) and silently permute the selected rows
# relative to `all_sentences`.
selected_indices = set()
for pair_key in distances:
    first, second = map(int, pair_key.split('-'))
    selected_indices.add(first)
    selected_indices.add(second)

best_scores_indices = np.array(sorted(selected_indices), dtype=int)
all_embeddings_selected = all_embeddings[best_scores_indices, :]

# Cosine distance matrix restricted to the selected embeddings; this is the
# matrix the clustering step consumes.
distance_matrix = pairwise_distances(all_embeddings_selected, metric="cosine")

In [None]:
# Sentences in the same order as the rows of the selected embeddings.
all_sentences_selected = [all_sentences[index] for index in best_scores_indices]

## Clustering

In [None]:
# Complete-linkage agglomerative clustering into two clusters on the
# precomputed cosine distance matrix.
# scikit-learn renamed the `affinity` parameter to `metric` and later removed
# `affinity`; try the current name first and fall back for older releases,
# where the unknown keyword raises TypeError.
clustering_options = {'linkage': 'complete', 'n_clusters': 2}
try:
    clusterer = AgglomerativeClustering(metric='precomputed', **clustering_options)
except TypeError:
    clusterer = AgglomerativeClustering(affinity='precomputed', **clustering_options)
clusters = clusterer.fit(distance_matrix)
# Number of sentences assigned to each cluster label.
print(Counter(clusters.labels_))

In [None]:

import pandas as pd

# Append the cluster label as a last column to the selected embeddings and
# write the table to CSV; feature columns are numbered, the label is "y".
labels_column = clusters.labels_.reshape(-1, 1)
dataset = np.hstack((all_embeddings_selected, labels_column))
columns = [*range(dataset.shape[1] - 1), "y"]
df = pd.DataFrame(dataset, columns=columns)
df.to_csv(f"/mydrive/results/{our_word}_hierarhical_sloBERTa.csv", index=False)


In [None]:
# List every prepared sentence together with its index in `all_sentences`.
for sentence_index, sentence_words in enumerate(all_sentences):
    print(f"{sentence_index} : {' '.join(sentence_words)}")

In [None]:
# Print the sentences assigned to cluster 1. The number shown is the position
# within `all_sentences_selected` / `clusters.labels_`.
for position, cluster_label in enumerate(clusters.labels_):
    if cluster_label != 1:
        continue
    print(f"{position} : {' '.join(all_sentences_selected[position])}")