In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper
from allennlp.modules.elmo import Elmo, batch_to_ids
from sklearn.model_selection import KFold

from allennlp.modules.token_embedders.elmo_token_embedder import ElmoTokenEmbedder

import warnings
from typing import Dict

import torch
from allennlp.data import Token, Vocabulary, TokenIndexer, Tokenizer
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import (
    SingleIdTokenIndexer,
    TokenCharactersIndexer,
    ELMoTokenCharactersIndexer,
    PretrainedTransformerIndexer,
    PretrainedTransformerMismatchedIndexer,
)
from allennlp.data.tokenizers import (
    CharacterTokenizer,
    PretrainedTransformerTokenizer,
    SpacyTokenizer,
    WhitespaceTokenizer,
)
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import (
    Embedding,
    TokenCharactersEncoder,
    ElmoTokenEmbedder,
    PretrainedTransformerEmbedder,
    PretrainedTransformerMismatchedEmbedder,
)
from allennlp.nn import util as nn_util

In [26]:
# Build an AllenNLP ELMo token embedder from the option and weight files
# stored in the ../slovenian-elmo directory.
elmo = ElmoTokenEmbedder(
    weight_file="../slovenian-elmo/slovenian-elmo-weights.hdf5",
    options_file="../slovenian-elmo/options.json",
)

In [30]:
# --- 1. Tokenise a sample sentence on whitespace ----------------------------
text = "Kako je stari"
tokenizer: Tokenizer = WhitespaceTokenizer()
tokens = tokenizer.tokenize(text)
print("ELMo tokens:", tokens)

# --- 2. Index the tokens with the ELMo character indexer --------------------
# The indexer is registered under the key "elmo_tokens"; the embedder built
# below must use the same key so the two can be matched up.
token_indexer: TokenIndexer = ELMoTokenCharactersIndexer()
vocab = Vocabulary()
text_field = TextField(
    tokens=tokens,
    token_indexers={"elmo_tokens": token_indexer},
)
text_field.index(vocab)
token_tensor = text_field.as_tensor(
    text_field.get_padding_lengths()
)
print("ELMo tensors:", token_tensor)

# --- 3. Embed the indexed tokens --------------------------------------------
# The embedder is built from the option/weight files in ../slovenian-elmo.
elmo_embedding = ElmoTokenEmbedder(
    weight_file="../slovenian-elmo/slovenian-elmo-weights.hdf5",
    options_file="../slovenian-elmo/options.json",
)
embedder = BasicTextFieldEmbedder(
    token_embedders={"elmo_tokens": elmo_embedding}
)

# Wrap the single sentence in a list to form a batch before embedding.
tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = embedder(tensor_dict)
print("ELMo embedded tokens:", embedded_tokens)

ELMo tokens: [Kako, je, stari]
ELMo tensors: {'elmo_tokens': {'elmo_tokens': tensor([[259,  76,  98, 108, 112, 260, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 107, 102, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 116, 117,  98, 115, 106, 260, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261]])}}
ELMo embedded tokens: tensor([[[-0.1121, -0.0944, -0.0000, 

In [36]:
# Display the token list produced by the whitespace tokenizer for `text`.
tokens

[Kako, je, stari]

In [54]:
from allennlp.modules.elmo import Elmo, batch_to_ids

# Paths to the ELMo option and weight files. These must be plain strings:
# a trailing comma after the literal would silently turn the value into a
# 1-tuple.
options_file = "../slovenian-elmo/options.json"
weight_file = "../slovenian-elmo/slovenian-elmo-weights.hdf5"

# Note the "1", since we want only 1 output representation for each token.
# NOTE: this rebinds `elmo`, which an earlier cell assigned to an
# ElmoTokenEmbedder; from here on `elmo` is an `Elmo` module.
elmo = Elmo(
    options_file=options_file,
    weight_file=weight_file,
    num_output_representations=1,
    dropout=0,
)

# use batch_to_ids to convert sentences to character ids
sentences = [['Kako', 'je', 'stari'], ['Kako', 'je', 'stari']]
character_ids = batch_to_ids(sentences)

# The result is a dict; 'elmo_representations' holds a list with one tensor
# per requested output representation (a single tensor here).
embeddings = elmo(character_ids)
print(embeddings['elmo_representations'])

[tensor([[[-0.0560, -0.0472, -0.0829,  ...,  0.0517, -0.0407,  0.1257],
         [-0.2654,  0.2074,  0.1101,  ..., -0.1543,  0.2586, -0.0537],
         [ 0.0599,  0.1776, -0.4385,  ..., -0.2170,  0.2408, -0.1374]],

        [[-0.0560, -0.0472, -0.0829,  ...,  0.0517, -0.0407,  0.1257],
         [-0.2654,  0.2074,  0.1101,  ..., -0.1543,  0.2586, -0.0537],
         [ 0.0599,  0.1776, -0.4385,  ..., -0.2170,  0.2408, -0.1374]]],
       grad_fn=<CopySlices>)]


In [56]:
# The ELMo output tensor requires grad, and PyTorch refuses to convert such a
# tensor to NumPy directly; detach it from the autograd graph first.
embeddings['elmo_representations'][0].detach().numpy()

## Real deal

In [41]:
import zipfile
import numpy as np
import json
import random
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import AgglomerativeClustering, SpectralClustering, DBSCAN
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Utility functions used in the rest of the notebook
def cos_similarity(x, y):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        x: First vector (any array-like accepted by ``np.dot``).
        y: Second vector, same length as ``x``.

    Returns:
        ``dot(x, y) / (||x|| * ||y||)``. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of NaN so that
        all-zero vectors do not propagate NaNs or emit a division warning.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        # 0/0 would yield NaN plus a RuntimeWarning; treat as "no similarity".
        return 0.0
    return np.dot(x, y) / norm_product

def remove_diplicate_sentences(a):
    """Return the sentences of ``a`` with exact duplicates removed.

    The (misspelt) function name is kept so existing callers keep working.

    Args:
        a: Iterable of sentences, each an iterable of hashable items
           (e.g. a list of token strings).

    Returns:
        A new list of lists holding each distinct sentence once, in order of
        first occurrence. Going through a plain ``set`` would lose that order
        and make the result vary between runs.
    """
    # Lists are unhashable, so key on tuples; dict.fromkeys keeps the
    # first-seen order while discarding repeats.
    unique_sentences = dict.fromkeys(map(tuple, a))
    return [list(sentence) for sentence in unique_sentences]

## Load all sentences

In [42]:
# Read the JSON files explicitly as UTF-8. Without `encoding`, open() uses the
# platform's locale encoding, which can corrupt non-ASCII characters or raise
# a decode error (the word list below contains characters such as 'š', 'č').
with open('../ccGigaFida/results/data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
with open('../ccGigaFida/results/data_lema.json', encoding='utf-8') as json_file:
    data_lema = json.load(json_file)
# NOTE(review): this variable is loaded from data_pos.json but is named
# `data_len`; the name is kept in case later cells use it. Confirm whether
# `data_pos` was intended.
with open('../ccGigaFida/results/data_pos.json', encoding='utf-8') as json_file:
    data_len = json.load(json_file)

# Array of words stored alongside the corpus; displayed as the cell output.
words = np.load("../ccGigaFida/words.npy")
words

array(['leto', 'dan', 'konec', 'svet', 'stran', 'mesto', 'šola', 'ura',
       'beseda', 'pot', 'red', 'zakon', 'zadeva', 'srce', 'tema',
       'resnica', 'moški', 'vloga', 'kraj', 'stanje', 'škoda', 'film',
       'večer', 'vrh', 'jutro', 'kazen', 'oblast', 'račun', 'novica',
       'milijon', 'par', 'krog', 'tip', 'punca', 'sila', 'vir', 'las',
       'akcija', 'meter', 'prst', 'kri', 'stik', 'grad', 'znak', 'lik',
       'direktor', 'vodja', 'raven', 'kolo', 'rob', 'gost', 'duh',
       'praznik', 'vest', 'korist', 'vedenje', 'tek', 'kup', 'otok',
       'razstava', 'bitje', 'motor', 'karta', 'nevarnost', 'hitrost',
       'kos', 'zob', 'stroj', 'kamen', 'župan', 'šef', 'vrtec', 'kot',
       'deček', 'avgust', 'tok', 'jezero', 'klop', 'čelo', 'hip', 'kupec',
       'pojav', 'čaj', 'postava', 'dolg', 'standard', 'jesen', 'rak',
       'grob', 'plus', 'les', 'vez', 'polica', 'minus', 'plan', 'posoda',
       'restavracija', 'jok', 'krilo', 'sol', 'rod', 'stres', 'trditev',
       'f

In [45]:
min_number_of_words=5
for keyword in ['gol']:
    all_sentences2=data[keyword][:100]
    all_sentences_lema2=data_lema[keyword][:100]

    all_sentences = []
    all_sentences_lema = []
    for sentence, sentence_lema in zip(all_sentences2, all_sentences_lema2):
        if len(sentence) >= min_number_of_words and sentence_lema not in all_sentences_lema:
            all_sentences.append(sentence)
            all_sentences_lema.append(sentence_lema)
            
    all_embeddings=np.zeros((len(all_sentences),100))
    
    for i in range(len(all_sentences)): #iterate through the sentences for the given keyword
        keyword_position = all_sentences_lema[i].index(keyword)
        
        