# ELMO - TripleM

## Install requirements

In [1]:
!pip3 install allennlp
!pip uninstall google-cloud
!pip install --upgrade google-cloud-storage

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting allennlp
  Downloading allennlp-2.9.3-py3-none-any.whl (719 kB)
[K     |████████████████████████████████| 719 kB 5.2 MB/s 
Collecting huggingface-hub>=0.0.16
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 2.9 MB/s 
[?25hCollecting typer>=0.4.1
  Downloading typer-0.4.1-py3-none-any.whl (27 kB)
Collecting jsonnet>=0.10.0
  Downloading jsonnet-0.18.0.tar.gz (592 kB)
[K     |████████████████████████████████| 592 kB 52.0 MB/s 
Collecting cached-path<1.2.0,>=1.0.2
  Downloading cached_path-1.1.2-py3-none-any.whl (26 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 42.6 MB/s 
Collecting tensorboardX>=1.2
  Downloading tensorboardX-2.5-py2.py3-none-any.whl (125 kB)
[K     |████████████████

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper
from allennlp.modules.elmo import Elmo, batch_to_ids
from sklearn.model_selection import KFold

from allennlp.modules.token_embedders.elmo_token_embedder import ElmoTokenEmbedder

import warnings
from typing import Dict

import torch
from allennlp.data import Token, Vocabulary, TokenIndexer, Tokenizer
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import (
    SingleIdTokenIndexer,
    TokenCharactersIndexer,
    ELMoTokenCharactersIndexer,
    PretrainedTransformerIndexer,
    PretrainedTransformerMismatchedIndexer,
)
from allennlp.data.tokenizers import (
    CharacterTokenizer,
    PretrainedTransformerTokenizer,
    SpacyTokenizer,
    WhitespaceTokenizer,
)
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import (
    Embedding,
    TokenCharactersEncoder,
    ElmoTokenEmbedder,
    PretrainedTransformerEmbedder,
    PretrainedTransformerMismatchedEmbedder,
)
from allennlp.nn import util as nn_util

import zipfile
import numpy as np
import json
import random
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import AgglomerativeClustering, SpectralClustering, DBSCAN
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive; the model
# and corpus paths used further down all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Files of the pretrained Slovenian ELMo model on the mounted Drive:
# the options file and the weight file that are handed to `Elmo`.
options_file_path, weight_file_path = (
    "/content/gdrive/MyDrive/NLP_TripleM/slovenian-elmo/options.json",
    "/content/gdrive/MyDrive/NLP_TripleM/slovenian-elmo/slovenian-elmo-weights.hdf5",
)

## Utility functions

In [None]:
def cos_similarity(x, y):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        x: First vector (anything `np.dot` / `np.linalg.norm` accept).
        y: Second vector of the same length as `x`.

    Returns:
        `dot(x, y) / (||x|| * ||y||)`. When either vector has zero norm the
        ratio is undefined; 0.0 is returned instead of NaN so that results
        stay sortable and comparable.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(x, y) / norm_product

def remove_diplicate_sentences(a):
    """Drop repeated sentences, keeping the first occurrence of each.

    Args:
        a: Iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A list of sentences (each a list of tokens) without duplicates, in
        the order in which each sentence first appears in `a`.
    """
    # Tokens are packed into tuples so a sentence can be used as a dict key;
    # dict.fromkeys keeps insertion order, unlike a set, so the result is
    # reproducible from run to run.
    unique_sentences = dict.fromkeys(map(tuple, a))
    return [list(sentence) for sentence in unique_sentences]

def convert_to_lowercase(sentences):
    """Return a copy of `sentences` with every token lower-cased.

    Args:
        sentences: Iterable of sentences, each an iterable of `str` tokens.

    Returns:
        A new list of lists holding the lower-cased tokens; the input is
        left untouched.
    """
    return [[str.lower(token) for token in sentence] for sentence in sentences]

## Load model

In [None]:
# A single output representation is requested because only one embedding
# per token is needed; dropout is set to 0.
elmo = Elmo(
    options_file=options_file_path,
    weight_file=weight_file_path,
    num_output_representations=1,
    dropout=0,
)

## Load data

In [None]:
# The JSON corpus files are opened with an explicit UTF-8 encoding: the text is
# Slovenian and contains non-ASCII letters, so relying on the platform default
# encoding could mis-decode it.

# `data` and `data_lema` are indexed by keyword further down; `data_lema`
# presumably holds the lemmatised form of the same sentences (verify).
with open('/content/gdrive/MyDrive/NLP_TripleM/ccGigaFida/results/data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
with open('/content/gdrive/MyDrive/NLP_TripleM/ccGigaFida/results/data_lema.json', encoding='utf-8') as json_file:
    data_lema = json.load(json_file)
# NOTE(review): this is read from data_pos.json but stored as `data_len` —
# confirm which name is intended before relying on it.
with open('/content/gdrive/MyDrive/NLP_TripleM/ccGigaFida/results/data_pos.json', encoding='utf-8') as json_file:
    data_len = json.load(json_file)

# Array of the target words stored alongside the corpus.
words = np.load("/content/gdrive/MyDrive/NLP_TripleM/ccGigaFida/words.npy")
words

array(['leto', 'dan', 'konec', 'svet', 'stran', 'mesto', 'šola', 'ura',
       'beseda', 'pot', 'red', 'zakon', 'zadeva', 'srce', 'tema',
       'resnica', 'moški', 'vloga', 'kraj', 'stanje', 'škoda', 'film',
       'večer', 'vrh', 'jutro', 'kazen', 'oblast', 'račun', 'novica',
       'milijon', 'par', 'krog', 'tip', 'punca', 'sila', 'vir', 'las',
       'akcija', 'meter', 'prst', 'kri', 'stik', 'grad', 'znak', 'lik',
       'direktor', 'vodja', 'raven', 'kolo', 'rob', 'gost', 'duh',
       'praznik', 'vest', 'korist', 'vedenje', 'tek', 'kup', 'otok',
       'razstava', 'bitje', 'motor', 'karta', 'nevarnost', 'hitrost',
       'kos', 'zob', 'stroj', 'kamen', 'župan', 'šef', 'vrtec', 'kot',
       'deček', 'avgust', 'tok', 'jezero', 'klop', 'čelo', 'hip', 'kupec',
       'pojav', 'čaj', 'postava', 'dolg', 'standard', 'jesen', 'rak',
       'grob', 'plus', 'les', 'vez', 'polica', 'minus', 'plan', 'posoda',
       'restavracija', 'jok', 'krilo', 'sol', 'rod', 'stres', 'trditev',
       'f

In [None]:
min_number_of_words=8
# Exactly one target word is processed per run; the cells further down work on
# the `all_sentences` / `all_embeddings` built here for that word.
keyword = 'golf'

# Only the first 3000 sentences (and their lemma sequences) are considered.
all_sentences2 = data[keyword][:3000]
all_sentences_lema2 = data_lema[keyword][:3000]

# Keep a sentence when it has at least `min_number_of_words` tokens and its
# lemma sequence has not been kept already. Seen sequences are tracked in a set
# of tuples so the duplicate check does not rescan the whole list each time.
all_sentences = []
all_sentences_lema = []
seen_lemma_sequences = set()
for sentence, sentence_lema in zip(all_sentences2, all_sentences_lema2):
    lemma_key = tuple(sentence_lema)  # assumes the lemmas are hashable (strings) — TODO confirm
    if len(sentence) >= min_number_of_words and lemma_key not in seen_lemma_sequences:
        seen_lemma_sequences.add(lemma_key)
        all_sentences.append(sentence)
        all_sentences_lema.append(sentence_lema)

all_sentences = convert_to_lowercase(all_sentences)
# One row per kept sentence; each row receives the 1024 values of the
# keyword's ELMo vector in that sentence.
all_embeddings = np.zeros((len(all_sentences), 1024))

# Sentences are embedded one at a time: pushing the whole list through ELMo in
# a single batch ran out of RAM.
embeddings = None  # keeps the final `del` valid even when no sentence was kept
with torch.no_grad():  # inference only — no autograd graph is needed
    for i in range(len(all_sentences)):
        character_ids = batch_to_ids([all_sentences[i]])
        embeddings = elmo(character_ids)

        # The keyword is located in the lemma sequence and that index is used
        # on the token representations; this assumes tokens and lemmas are
        # aligned one to one. `.index` raises ValueError if the lemma is absent.
        keyword_position = all_sentences_lema[i].index(keyword)

        # Batch of size one, hence the [0] after converting to NumPy.
        all_embeddings[i, :] = embeddings['elmo_representations'][0].detach().numpy()[0][keyword_position]

# Release the last model output; only `all_embeddings` is used afterwards.
del embeddings

## Calculate pairwise similarities

In [None]:
# Cosine distance between every pair of keyword embeddings, and the matching
# similarity (1 - distance). The larger the similarity, the more alike the
# two sentences are.
distance_matrix = pairwise_distances(all_embeddings, metric="cosine")
similarity_matrix = 1 - distance_matrix

# `similarities` maps "i-j" (with i < j) to the similarity of sentences i and j,
# ordered from the least to the most similar pair. The values are read from the
# upper triangle of `similarity_matrix` instead of being recomputed pair by
# pair in a Python loop, which is quadratic in interpreter-level calls.
pair_rows, pair_cols = np.triu_indices(all_embeddings.shape[0], k=1)
pair_values = similarity_matrix[pair_rows, pair_cols]
# Stable sort: pairs with equal similarity keep their (i, j) enumeration order.
ascending_order = np.argsort(pair_values, kind="stable")
similarities = {
    f"{int(pair_rows[k])}-{int(pair_cols[k])}": pair_values[k]
    for k in ascending_order
}

## Best and worst scores

In [None]:
# Show the first ten entries of `similarities`; since the dict is sorted in
# ascending order these are the least similar sentence pairs.
similarities_keys = list(similarities.keys())
for pair_key in similarities_keys[:10]:
    # Keys have the form "<i>-<j>", the indices of the two sentences.
    idx_a, idx_b = (int(part) for part in pair_key.split('-'))
    first_sent = ' '.join(all_sentences[idx_a])
    second_sent = ' '.join(all_sentences[idx_b])

    print(similarities[pair_key])
    print("1)"+first_sent+'.\n2)'+second_sent+".")
    print('-----')

0.1325801865482292
1)enega od golfov iz omejene serije teh ljudskih športnikov smo preizkusili tudi mi.
2)zaradi opisane ameriške logike je znanost prenehala biti posvečen azil elitni intelektualni golf klub in je postala enako negotova in kruta tekma kot ameriška hokejska liga znanstvenik se v svoji ligi obdrži le toliko časa dokler dosega kompetitivne rezultate.
-----
0.14564062047174417
1)enega od golfov iz omejene serije teh ljudskih športnikov smo preizkusili tudi mi.
2)9.00 izven kontrole 9.30 ameriški gladiatorji 11.15 stoke 11.45 normal oddaja za invalidne osebe 12.15 nogomet 2. nemška zvezna liga 13.00 world soccer 13.30 šport moči 14.30 motobike 15.30 motorvision 16.30 warm up 18.00 namizni tenis 1. nemška zvezna liga 19.00 golf us masters 20.00 gillette world sport special 20.30 biljard 21.30 knockout magazin 22.15 wrestling spw 0.00 redki športi 0.30 redki športi 1.00 best direct.
-----
0.14946752852991377
1)predsednik prvega celjskega golf kluba je borut sedovnik.
2)morda 

## Agglomerative clustering

In [None]:
# Fit a two-cluster agglomerative model on the precomputed cosine distances
# once per linkage criterion and print the resulting cluster sizes.
# `clusters` ends up holding the last fit ('complete'), which later cells use.
# NOTE(review): newer scikit-learn releases take `metric` instead of
# `affinity` — confirm against the installed version before upgrading.
for linkage_criterion in ('single', 'average', 'complete'):
    clusters = AgglomerativeClustering(
        n_clusters=2,
        affinity='precomputed',
        linkage=linkage_criterion,
    ).fit(distance_matrix)
    print(Counter(clusters.labels_))

Counter({0: 2063, 1: 1})
Counter({0: 2013, 1: 51})
Counter({1: 1271, 0: 793})


In [None]:
import pandas as pd

# Name the export after the keyword whose embeddings were actually computed.
# A separately hard-coded word can drift out of sync with `keyword` and label
# the file with the wrong word.
our_word = keyword

# Embedding columns followed by one extra column with the cluster label of each
# sentence (taken from the most recent `clusters` fit).
labels_column = np.asarray(clusters.labels_).reshape(-1, 1)
dataset = np.hstack((all_embeddings, labels_column))

# Feature columns are numbered 0..n-1; the label column is called "y".
columns = list(range(dataset.shape[1]))
columns[-1] = "y"
df = pd.DataFrame(dataset, columns = columns)
df.to_csv(f"/content/gdrive/MyDrive/NLP_TripleM/for_classification/{our_word}_hierarhical_ELMO.csv", index = False)

In [None]:
# List every sentence assigned to cluster 0, prefixed by its index.
for idx, cluster_id in enumerate(clusters.labels_):
    if cluster_id != 0:
        continue
    print(idx, ':', ' '.join(all_sentences[idx]))

2 : na zatožni klopi se mora braniti zaradi spolne zlorabe mladoletnika
3 : po njegovem prepričanju želijo ustvariti nekakšno ravnovesje potem ko so na zatožno klop že spravili miloševića in milutinovića
4 : profesor sedi spredaj poleg voznika tovarišica inge se brezskrbno stiska z mundyjem na zadnji klopi
5 : vi pa mi že nehajte delati kurčevo senco reče in kljubovalno obsedi na klopi dokler ga profesor s saševo pomočjo ne dvigne na noge in odvede v avto
6 : kupi časopis sede na klop in opazuje svet ki hiti mimo prešine ga ali tudi svet opazuje njega
7 : zadnja klop pri audiju a3 seveda ni tako udobna in prostorna kot v osmici
8 : prtljažnik pa je za ta golfov razred pričakovano velik in meri 350 litrov pri podrti zadnji klopi pa dobite 1100 litrski prostor
9 : zadnje minute so minile v splošni norišnici na tribunah in na slovenski klopi v 13. medsebojni tekmi pa je slovenija sinoči dosegla četrto zmago
10 : in slabo uslugo bi si naredili če bi se začeli v vladnih klopeh ozirati po do

In [None]:
# Print the last ten kept sentences, one per line, as a quick sanity check.
last_sentences = all_sentences[-10:]
for tokens in last_sentences:
    print(' '.join(tokens))

tam je gol stal carlin mož in divje masturbiral pripoveduje mark
discipline gospodar ni vsiljeval toliko z golo silo kot s svojo karizmo skupnostni obedi in obredi so bili sestanki posvetovalnih teles glavni vzvod discipliniranja pa so bile ceremonije 30
3 goli larsson švedska tomasson danska lampard anglija zidane francija
po golu gutalja je zadel še škaper ki je čez slabih deset minut povedel domače v vodstvo s 3:0
gole zametke v fazi nastajanja ki so samo pomotoma javni
po prvem domačem golu zabil ga je brian mcbride je sodnik razveljavil njihov gol in dvakrat ni pokazal na belo točko ko bi lahko nakar je na drugi strani pape bouba diop v 35. minuti zabil odločilni drugi gol
čeprav so domači rokometaši prvo tekmo izgubili za devet golov je bila dvorana v ormožu premajhna za vse ljubitelje rokometa
drugi gol so potem sicer pripisali traubu kot strelcu avtogola kar niti ni tako pomembno
s 17 ali 18 goli je lavrič sezono končal tudi reprezentančno kot drugi najboljši strelec druge nemš

In [None]:
# Number of sentences currently held in `all_sentences`.
len(all_sentences)

2673