In [29]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper
from allennlp.modules.elmo import Elmo, batch_to_ids
from sklearn.model_selection import KFold

from allennlp.modules.token_embedders.elmo_token_embedder import ElmoTokenEmbedder

import warnings
from typing import Dict

import torch
from allennlp.data import Token, Vocabulary, TokenIndexer, Tokenizer
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import (
    SingleIdTokenIndexer,
    TokenCharactersIndexer,
    ELMoTokenCharactersIndexer,
    PretrainedTransformerIndexer,
    PretrainedTransformerMismatchedIndexer,
)
from allennlp.data.tokenizers import (
    CharacterTokenizer,
    PretrainedTransformerTokenizer,
    SpacyTokenizer,
    WhitespaceTokenizer,
)
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import (
    Embedding,
    TokenCharactersEncoder,
    ElmoTokenEmbedder,
    PretrainedTransformerEmbedder,
    PretrainedTransformerMismatchedEmbedder,
)
from allennlp.nn import util as nn_util

import zipfile
import numpy as np
import json
import random
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import AgglomerativeClustering, SpectralClustering, DBSCAN
from collections import Counter
import matplotlib.pyplot as plt

## Testing pretrained elmo (can be truncated later)

In [14]:
options_file_path="../../slovenian-elmo/options.json"
weight_file_path="../../slovenian-elmo/slovenian-elmo-weights.hdf5"

In [16]:
tokenizer: Tokenizer = WhitespaceTokenizer()
token_indexer: TokenIndexer = ELMoTokenCharactersIndexer()
vocab = Vocabulary()
text = "Kako je stari"
tokens = tokenizer.tokenize(text)
print("ELMo tokens:", tokens)
text_field = TextField(tokens, {"elmo_tokens": token_indexer})
text_field.index(vocab)
token_tensor = text_field.as_tensor(text_field.get_padding_lengths())
print("ELMo tensors:", token_tensor)

# We're using a tiny, toy version of ELMo to demonstrate this.
elmo_embedding = ElmoTokenEmbedder(
    options_file=options_file_path,
    weight_file=weight_file_path)

embedder = BasicTextFieldEmbedder(token_embedders={"elmo_tokens": elmo_embedding})

tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = embedder(tensor_dict)
print("ELMo embedded tokens:", embedded_tokens)

ELMo tokens: [Kako, je, stari]
ELMo tensors: {'elmo_tokens': {'elmo_tokens': tensor([[259,  76,  98, 108, 112, 260, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 107, 102, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 116, 117,  98, 115, 106, 260, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261]])}}
ELMo embedded tokens: tensor([[[-0.1121, -0.0000, -0.1658, 

In [22]:
from allennlp.modules.elmo import Elmo, batch_to_ids

# Note the "1", since we want only 1 output representation for each token.
elmo = Elmo(options_file=options_file_path,
            weight_file=weight_file_path, num_output_representations=1, dropout=0)

# use batch_to_ids to convert sentences to character ids
sentences = [['Kako', 'je', 'stari'], ['Kako', 'je', 'stari'], ['Kako', 'je', 'stari']]
character_ids = batch_to_ids(sentences)

embeddings = elmo(character_ids)
print(embeddings['elmo_representations'])

[tensor([[[-0.0560, -0.0472, -0.0829,  ...,  0.0517, -0.0407,  0.1257],
         [-0.2654,  0.2074,  0.1101,  ..., -0.1543,  0.2586, -0.0537],
         [ 0.0599,  0.1776, -0.4385,  ..., -0.2170,  0.2408, -0.1374]],

        [[-0.0560, -0.0472, -0.0829,  ...,  0.0517, -0.0407,  0.1257],
         [-0.2654,  0.2074,  0.1101,  ..., -0.1543,  0.2586, -0.0537],
         [ 0.0599,  0.1776, -0.4385,  ..., -0.2170,  0.2408, -0.1374]],

        [[-0.0560, -0.0472, -0.0829,  ...,  0.0517, -0.0407,  0.1257],
         [-0.2654,  0.2074,  0.1101,  ..., -0.1543,  0.2586, -0.0537],
         [ 0.0599,  0.1776, -0.4385,  ..., -0.2170,  0.2408, -0.1374]]],
       grad_fn=<CopySlices>)]


In [28]:
len(embeddings['elmo_representations'][0].detach().numpy())

3

## Now the real deal

In [103]:
#Some utility functions which are used
def cos_similarity(x, y):
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def remove_diplicate_sentences(a):
    b_set = set(map(tuple,a))  #need to convert the inner lists to tuples so they are hashable
    b = list(map(list,b_set)) #Now convert tuples back into lists (maybe unnecessary?)
    return b

def convert_to_lowercase(sentences):
    res=[]
    for i in sentences:
        res.append(list(map(str.lower,i)))
    return res

## Load model

In [None]:
# Note the "1", since we want only 1 output representation for each token.
elmo = Elmo(options_file=options_file_path,
            weight_file=weight_file_path, num_output_representations=1, dropout=0)

# use batch_to_ids to convert sentences to character ids
sentences = [['Kako', 'je', 'stari'], ['Kako', 'je', 'stari'], ['Kako', 'je', 'stari']]
character_ids = batch_to_ids(sentences)

embeddings = elmo(character_ids)
print(embeddings['elmo_representations'])

## Load all sentences

In [32]:
with open('../../ccGigaFida/results/data.json') as json_file:
    data = json.load(json_file)
with open('../../ccGigaFida/results/data_lema.json') as json_file:
    data_lema = json.load(json_file)
with open('../../ccGigaFida/results/data_pos.json') as json_file:
    data_len = json.load(json_file)

words = np.load("../../ccGigaFida/words.npy")
words

array(['leto', 'dan', 'konec', 'svet', 'stran', 'mesto', 'šola', 'ura',
       'beseda', 'pot', 'red', 'zakon', 'zadeva', 'srce', 'tema',
       'resnica', 'moški', 'vloga', 'kraj', 'stanje', 'škoda', 'film',
       'večer', 'vrh', 'jutro', 'kazen', 'oblast', 'račun', 'novica',
       'milijon', 'par', 'krog', 'tip', 'punca', 'sila', 'vir', 'las',
       'akcija', 'meter', 'prst', 'kri', 'stik', 'grad', 'znak', 'lik',
       'direktor', 'vodja', 'raven', 'kolo', 'rob', 'gost', 'duh',
       'praznik', 'vest', 'korist', 'vedenje', 'tek', 'kup', 'otok',
       'razstava', 'bitje', 'motor', 'karta', 'nevarnost', 'hitrost',
       'kos', 'zob', 'stroj', 'kamen', 'župan', 'šef', 'vrtec', 'kot',
       'deček', 'avgust', 'tok', 'jezero', 'klop', 'čelo', 'hip', 'kupec',
       'pojav', 'čaj', 'postava', 'dolg', 'standard', 'jesen', 'rak',
       'grob', 'plus', 'les', 'vez', 'polica', 'minus', 'plan', 'posoda',
       'restavracija', 'jok', 'krilo', 'sol', 'rod', 'stres', 'trditev',
       'f

In [150]:
min_number_of_words=8
for keyword in ['rak']:
    all_sentences2=data[keyword][:500]
    all_sentences_lema2=data_lema[keyword][:500]

    all_sentences = []
    all_sentences_lema = []
    for sentence, sentence_lema in zip(all_sentences2, all_sentences_lema2):
        if len(sentence) >= min_number_of_words and sentence_lema not in all_sentences_lema:
            all_sentences.append(sentence)
            all_sentences_lema.append(sentence_lema)
    
    #all_sentences = convert_to_lowercase(all_sentences)
    all_sentences = convert_to_lowercase(all_sentences)
    all_embeddings=np.zeros((len(all_sentences),1024))
    
    character_ids = batch_to_ids(all_sentences)
    embeddings = elmo(character_ids) #rip RAM
    
    for i in range(len(all_sentences)): #iterate through the sentences for the given keyword
        
        keyword_position = all_sentences_lema[i].index(keyword)
        all_embeddings[i,:]=embeddings['elmo_representations'][0].detach().numpy()[i][keyword_position]
        
    
    break #this break means that we terminate on the first word
    
del embeddings 

## Calculate pairwise similarity

In [151]:
#The bigger it is the more similar the senctences 
similarities={}
for i in range(all_embeddings.shape[0]):
    for j in range(i+1,all_embeddings.shape[0],1):
        similarities[str(i)+'-'+str(j)] = cos_similarity(all_embeddings[i,:], all_embeddings[j,:])
        
similarities = dict(sorted(similarities.items(), key=lambda x:x[1]))

## Analyze worst 50 distances

In [152]:
similarities_keys = list(similarities.keys())
for key in similarities_keys[0:5]:
    first_sentence_idx, second_sentence_idx = map(int, key.split('-'))
        
    first_sent, second_sent = ' '.join(all_sentences[first_sentence_idx]), ' '.join(all_sentences[second_sentence_idx])
    
    print("1)"+first_sent+'\n2)'+second_sent)
    print('-----')

1)eden od njih je sedel v čolnu in svetil s plamenico njegov tovariš pa je grabil rake z roko in jih metal v čoln
2)vod. 1. pasteur sedmak eda 2. ovratnik tina turner 3. stereotiper lineal 4. todo epoleta cj as 5. omika kaša rak alva 6. jonija rt ava nika 7. abas nojevo jajce ab 8. ni tridentčanke 9. klokotanje ntare ns 10. astana jako uvodnik 11
-----
1)eden od njih je sedel v čolnu in svetil s plamenico njegov tovariš pa je grabil rake z roko in jih metal v čoln
2)to se zgodi ker večina zdravil proti raku prizadene delovanje kostnega mozga in zmanjša njegovo sposobnost za tvorbo belih krvnih celic ki premagujejo številne vrste okužb
-----
1)eden od njih je sedel v čolnu in svetil s plamenico njegov tovariš pa je grabil rake z roko in jih metal v čoln
2)za ustrezno zdravljenje je treba rak na prostati diagnosticirati dovolj zgodaj saj je ozdravljiv le v začetnem stadiju
-----
1)eden od njih je sedel v čolnu in svetil s plamenico njegov tovariš pa je grabil rake z roko in jih metal v č

In [153]:
similarities_keys = list(similarities.keys())
for key in similarities_keys[-5:]:
    first_sentence_idx, second_sentence_idx = map(int, key.split('-'))
        
    first_sent, second_sent = ' '.join(all_sentences[first_sentence_idx]), ' '.join(all_sentences[second_sentence_idx])
    
    print("1)"+first_sent+'\n2)'+second_sent)
    print('-----')

1)lani smo začeli s projektom mi podpiramo prihodnost ki je namenjen otrokom obolelim za rakom
2)zbran denar tudi z dražbo umetniških del bodo namenili za igrala otrokom obolelih za rakom
-----
1)moja žena boluje za kostnim rakom operacija stane dobre dva milijona tolarjev
2)zraven sebe je imel napis moja žena boluje za kostnim rakom operacija stane dobre dva milijona tolarjev
-----
1)to se mogoče zdi presenetljivo glede na to da nekateri virusi nedvomno povzročajo raka
2)izidi bi bili lahko presenetljivi glede na to da nekateri virusi nedvomno povzročajo raka
-----
1)pomemben dosežek v boju zoper raka na materničnem vratu pomeni cepivo izdelano posebej za preprečevanje nastanka tumorjev v področju spolovil in rodil
2)pomemben dosežek v boju zoper raka na materničnem vratu pomeni cepivo izdelano posebej za preprečevanje nastanka tumorjev
-----
1)kljub temu obstaja domneva da se zmanjša tveganje za nastanek nekaterih pediatričnih rakov pri otrocih ki so bili izpostavljeni okužbam v prvi

## Similarity and distance matrix

In [154]:
#print(all_embeddings.shape)
distance_matrix=(pairwise_distances(all_embeddings,metric="cosine"))
similarity_matrix=1-distance_matrix

#density plot
#similarity_matrix_flatten = similarity_matrix.reshape(len(similarity_matrix)**2)
#density = gaussian_kde(similarity_matrix_flatten)
#density.covariance_factor = lambda : .5
#len(similarity_matrix_flatten)
#plt.plot(similarity_matrix_flatten, density(similarity_matrix_flatten))
#plt.show()

In [None]:
#(Not mandatory) !!!
'''best_scores_indices=[]
for i in list(distances.keys()):
    first, second = i.split('-')
    if distances[i]>0.90:
        best_scores_indices.append(first)
        best_scores_indices.append(second)


best_scores_indices=(np.unique(best_scores_indices))
best_scores_indices = best_scores_indices.astype(int)
all_embeddings_selected = all_embeddings[best_scores_indices, :]

distance_matrix_selected=(pairwise_distances(all_embeddings_selected,metric="cosine"))
print(len(all_embeddings_selected))

all_sentences_selected = []
for i in best_scores_indices:
    all_sentences_selected.append(all_sentences[i])'''

# Clustering

## Agglomerative clustering

In [159]:
# ‘complete’, ‘average’, ‘single’
clusters = AgglomerativeClustering(affinity='precomputed', linkage='single', n_clusters=2).fit(distance_matrix)
print(Counter(clusters.labels_))
# ‘complete’, ‘average’, ‘single’
clusters = AgglomerativeClustering(affinity='precomputed', linkage='average', n_clusters=2).fit(distance_matrix)
print(Counter(clusters.labels_))
# ‘complete’, ‘average’, ‘single’
clusters = AgglomerativeClustering(affinity='precomputed', linkage='complete', n_clusters=3).fit(distance_matrix)
print(Counter(clusters.labels_))

Counter({0: 356, 1: 1})
Counter({0: 337, 1: 20})
Counter({1: 314, 0: 41, 2: 2})


In [162]:
for i,label in enumerate(clusters.labels_):
    if label==2:
        print(i,':', ' '.join(all_sentences[i]))

125 : pav čep rak dar pek rep raca zidar pes top repa poper pismo slap račun večer pesem ščep rudar denar polenta potop raketa gobar pikapolonica odcep radirka dimnikar lupa sapa kopito kopica kaplja lopata toplice
355 : vod. 1. pasteur sedmak eda 2. ovratnik tina turner 3. stereotiper lineal 4. todo epoleta cj as 5. omika kaša rak alva 6. jonija rt ava nika 7. abas nojevo jajce ab 8. ni tridentčanke 9. klokotanje ntare ns 10. astana jako uvodnik 11


## 2. Spectral Clustering

In [163]:
spectral = SpectralClustering(2).fit_predict(distance_matrix)
#spectral = SpectralClustering(2).fit(all_embeddings)
Counter(spectral)
#spectral



Counter({0: 327, 1: 30})

In [164]:
print(Counter(spectral))
for i,label in enumerate(spectral):
    if label==1:
        print(i,':', ' '.join(all_sentences[i]))
        

Counter({0: 327, 1: 30})
17 : ob številnih receptih iz domačih in tujih logov nas nato popelje skozi bogastvo sladkovodnih in morskih rib mehkužcev glavonožcev školjk in rakov celo konzerviranih rib ter nas na prijeten način navdušuje za uživanje vsega tega bogastva hrane iz vode
21 : kalcij je sestavina kalcijevega karbonata caco3 ki je nujno potreben za razvoj apnenčastega ogrodja koral je v oklepih rakov pa tudi v polžjih hišicah in školjčnih lupinah
23 : ščetinozobke je torej treba postopoma privaditi na nove vrste hrane ki je dostopna v akvarističnih trgovinah ali ribarnicah zlasti na odrasle solinske rakce ali artemije mnogoščetince in druge črve koščke školjčnega mesa ter kozice in druge majhne rake
24 : vse vrste sladkovodnih rakov so precej plašne živali in se dobro počutijo le v akvariju kjer imajo dovolj primernih skrivališč
25 : pri pravilni prehrani in drugih ugodnih razmerah se raki levijo v rednih časovnih presledkih
26 : v akvariju s kakovostno morsko vodo je gojenje te