In [80]:
import zipfile
import gensim
import numpy as np
import json
import random
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralClustering

def cos_similarity(x, y):
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

## Loading word2vec skipgram model

In [3]:
repository = "../../"
with zipfile.ZipFile(repository + "/67.zip", "r") as archive:
    stream = archive.open("model.txt")
    model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=False, unicode_errors='replace')

In [4]:
#testing
result = model.similar_by_word("legenda")
most_similar_key, similarity = result[0]  
print(f"{most_similar_key}: {similarity:.4f}")

picokih: 0.6543


## Loading all sentences

In [5]:
with open('../ccGigaFida/results/data.json') as json_file:
    data = json.load(json_file)
with open('../ccGigaFida/results/data_lema.json') as json_file:
    data_lema = json.load(json_file)
with open('../ccGigaFida/results/data_pos.json') as json_file:
    data_len = json.load(json_file)

words = np.load("../ccGigaFida/words.npy")

In [44]:
for keyword in words:
    all_sentences=data[keyword][0:1000]
    all_sentences_lema=data_lema[keyword][0:1000]
    all_embeddings=np.zeros((len(all_sentences),100))
    
    for i in range(len(all_sentences)): #iterate through the sentences for the given keyword
        keyword_position = all_sentences_lema[i].index(keyword)
        #all_sentences[i].pop(keyword_position)
        
        centroid=np.zeros(100)
        words_added=0
        
        for word_index, word in enumerate(all_sentences[i]): #iterate through the words in the sentence
            try:
                if word_index!=keyword_position:
                    centroid+=model[word.lower()] 
                    words_added+=1
            except:
                ...
                
        if words_added==0: #there are wierd sentences where only the keyword is the whole sentence
            continue
            
        centroid/=words_added
        all_embeddings[i,:]=centroid
        
    
    break

In [45]:
distances={}
for i in range(all_embeddings.shape[0]):
    for j in range(i+1,all_embeddings.shape[0],1):
        distances[str(i)+'-'+str(j)] = cos_similarity(all_embeddings[i,:], all_embeddings[j,:])

In [46]:
distances = dict(sorted(distances.items(), key=lambda item: item[1]))

## Analyze worst 50 distances

In [47]:
distances_keys = list(distances.keys())
for key in distances_keys[0:50]:
    first, second = map(int, key.split('-'))
    print(' '.join(all_sentences[first]))
    print(' '.join(all_sentences[second]))
    print('-----')

Ivova podaja Elvisov gol
Gola oglašujeta parfum
-----
Mimo gola
Gola oglašujeta parfum
-----
Golo 35 Škofljica
Gola oglašujeta parfum
-----
Ivova podaja Elvisov gol
Goli poster
-----
Trebnje za gol
Gola oglašujeta parfum
-----
Mimo gola
Goli poster
-----
Trojke in goli
Gola oglašujeta parfum
-----
Ivova podaja Elvisov gol
Fleha Gola muca
-----
Ivova podaja Elvisov gol
Fleha Gola muca
-----
Ivova podaja Elvisov gol
goli kuhar
-----
Mimo gola
goli kuhar
-----
Ivova podaja Elvisov gol
Goli in bosi
-----
Ivova podaja Elvisov gol
Brad Pitt v Fincherjevem Klubu golih pesti
-----
Trije zaporedni porazi ob gol razliki 2:10
Gola oglašujeta parfum
-----
Golo 35 Škofljica
Goli poster
-----
Ivova podaja Elvisov gol
Oropali gostilno očeta golega kuharja
-----
Gola razsipnost
Goli poster
-----
Merc Dušan Golo mesto
Goli poster
-----
Streli na gol 1
Gola oglašujeta parfum
-----
goli podredno zloženi
Gola oglašujeta parfum
-----
Merc Dušan Golo mesto
Gola oglašujeta parfum
-----
Gola razsipnost
Gola o

## Analyze best 50 distances

In [49]:
distances_keys = list(distances.keys())
for key in distances_keys[-50:]:
    first, second = map(int, key.split('-'))
    #print(first, second)
    print(' '.join(all_sentences[first]))
    print(' '.join(all_sentences[second]))
    print('-----')

Freier si je zlomil nos že v 5. minuti tekme ki jo je moštvo z Anfield Roada po dveh golih Djibrila Cisseja in enem Milana Baroša dobilo s 3:0
Zgolj remi zato ker so zebre v Münchnu na Allianz Areni se je zbralo 30 tisoč gledalcev po osmih minutah vodile že z 2:0 Trboveljčan je bil strelec drugega gola ko je izkoristil napako domače obrambe prvega je v 6. minuti iz enajstmetrovke dosegel Maročan Youssef Mokhtari
-----
Ko je ob zanesljivem izvajanju sedemmetrovk Romunke Gubescheve in hitrih protinapadih prek Peterlinove prednost narasla na pet golov 10:5 v 17. minuti pa celo na šest golov 13:7 je bilo že jasno da so Zagorjanke blizu uspeha
Po vodstvo Belorusov s tremi zadetki razlike so se sicer približali na dva gola zaostanka z zadetkom Tičarja ob igralcu manj na ledu toda ko so ob kazni Šelega poskusili brez vratarja so dobili gol na prazno mrežo in vsega je bilo konec
-----
Goli domači 38 gostje 51 skupaj 89 redni del 73 kazenski streli 16 najboljši strelci Horvat 4 Delamea Bojan Li

## Construct similarity distance matrix

In [73]:
#print(all_embeddings.shape)
distance_matrix=(pairwise_distances(all_embeddings,metric="cosine"))
distance_matrix

array([[0.        , 0.13731799, 0.22660908, ..., 0.32688616, 0.24738726,
        0.27633132],
       [0.13731799, 0.        , 0.12635066, ..., 0.28019894, 0.18324466,
        0.17706561],
       [0.22660908, 0.12635066, 0.        , ..., 0.23151193, 0.20322849,
        0.1742427 ],
       ...,
       [0.32688616, 0.28019894, 0.23151193, ..., 0.        , 0.32300266,
        0.32060998],
       [0.24738726, 0.18324466, 0.20322849, ..., 0.32300266, 0.        ,
        0.29758845],
       [0.27633132, 0.17706561, 0.1742427 , ..., 0.32060998, 0.29758845,
        0.        ]])

## Clustering

In [81]:
clusters = AgglomerativeClustering(affinity='precomputed', linkage='complete', n_clusters=2).fit(distance_matrix)
clusters.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [82]:
SpectralClustering(2).fit_predict(distance_matrix)



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,