In [1]:
import zipfile
import gensim
import numpy as np
import json
import random
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import AgglomerativeClustering, SpectralClustering, DBSCAN
from collections import Counter
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
#import classla
#import fasttext.util

In [2]:
#Some utility functions which are used
def cos_similarity(x, y):
    """Return the cosine similarity of two 1-D vectors.

    Parameters
    ----------
    x, y : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(x, y) / (|x| * |y|)``. When either vector has zero norm the
        quotient is undefined (0/0); ``0.0`` is returned in that case instead
        of NaN so that the scores can still be sorted and thresholded.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        # At least one all-zero vector: no direction, so report "no similarity".
        return 0.0
    return np.dot(x, y) / norm_product

def remove_diplicate_sentences(a):
    """Return the sentences of ``a`` with duplicates removed, keeping first-seen order.

    Parameters
    ----------
    a : iterable of sequences
        Sentences, each a sequence of hashable tokens.

    Returns
    -------
    list of list
        One new list per distinct sentence, in order of first appearance.
        Keeping the order makes the result deterministic between runs.
    """
    # Lists are not hashable, so sentences are keyed by their tuple form; a dict
    # (unlike a set) remembers insertion order.
    unique_sentences = dict.fromkeys(map(tuple, a))
    return [list(sentence) for sentence in unique_sentences]

## Loading word2vec skipgram model

In [3]:
repository = "../../"
# Read the text-format vectors straight out of the zip archive. Both the archive
# and the member stream are context-managed so the stream is closed once the
# vectors have been loaded.
with zipfile.ZipFile(repository + "/67.zip", "r") as archive:
    with archive.open("model.txt") as stream:
        model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=False, unicode_errors='replace')

In [4]:
#testing
# Sanity check that the vectors are usable: query the model for words similar to
# "legenda", take the first (key, similarity) entry of the result and print it.
result = model.similar_by_word("legenda")
most_similar_key, similarity = result[0]  
print(f"{most_similar_key}: {similarity:.4f}")

picokih: 0.6543


## Loading all sentences

In [6]:
# The sentences contain non-ASCII characters, so the encoding is stated
# explicitly instead of relying on the platform default.
with open('../ccGigaFida/results/data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
with open('../ccGigaFida/results/data_lema.json', encoding='utf-8') as json_file:
    data_lema = json.load(json_file)
# NOTE(review): the file is data_pos.json but the variable is called data_len;
# the name is kept unchanged because other cells may refer to it - confirm.
with open('../ccGigaFida/results/data_pos.json', encoding='utf-8') as json_file:
    data_len = json.load(json_file)

# Array of the keywords available in the corpus files.
words = np.load("../ccGigaFida/words.npy")
words

array(['leto', 'dan', 'konec', 'svet', 'stran', 'mesto', 'šola', 'ura',
       'beseda', 'pot', 'red', 'zakon', 'zadeva', 'srce', 'tema',
       'resnica', 'moški', 'vloga', 'kraj', 'stanje', 'škoda', 'film',
       'večer', 'vrh', 'jutro', 'kazen', 'oblast', 'račun', 'novica',
       'milijon', 'par', 'krog', 'tip', 'punca', 'sila', 'vir', 'las',
       'akcija', 'meter', 'prst', 'kri', 'stik', 'grad', 'znak', 'lik',
       'direktor', 'vodja', 'raven', 'kolo', 'rob', 'gost', 'duh',
       'praznik', 'vest', 'korist', 'vedenje', 'tek', 'kup', 'otok',
       'razstava', 'bitje', 'motor', 'karta', 'nevarnost', 'hitrost',
       'kos', 'zob', 'stroj', 'kamen', 'župan', 'šef', 'vrtec', 'kot',
       'deček', 'avgust', 'tok', 'jezero', 'klop', 'čelo', 'hip', 'kupec',
       'pojav', 'čaj', 'postava', 'dolg', 'standard', 'jesen', 'rak',
       'grob', 'plus', 'les', 'vez', 'polica', 'minus', 'plan', 'posoda',
       'restavracija', 'jok', 'krilo', 'sol', 'rod', 'stres', 'trditev',
       'f

## Obtain all embeddings

In [7]:
# Upper bound on how many sentences per keyword are taken from the loaded data.
number_of_sentences_to_use = 3000
# Sentences with fewer words than this are skipped when embeddings are built.
min_number_of_words = 8
# NOTE: despite the name this acts as an upper bound - only words strictly closer
# than this many positions to the keyword contribute to a sentence embedding.
min_neighbor_distance = 6

In [255]:
for keyword in ['gol']:
    # Cap the number of sentences considered for this keyword.
    sentence_limit = min(number_of_sentences_to_use, len(data[keyword]))
    all_sentences2 = data[keyword][:sentence_limit]
    all_sentences_lema2 = data_lema[keyword][:sentence_limit]

    # Keep sentences that are long enough and whose lemmatised form has not been
    # seen before. A set of tuples makes the duplicate check O(1) per sentence
    # instead of scanning the list of kept sentences every time.
    all_sentences = []
    all_sentences_lema = []
    seen_lemma_sentences = set()
    for sentence, sentence_lema in zip(all_sentences2, all_sentences_lema2):
        lemma_key = tuple(sentence_lema)
        if len(sentence) >= min_number_of_words and lemma_key not in seen_lemma_sentences:
            seen_lemma_sentences.add(lemma_key)
            all_sentences.append(sentence)
            all_sentences_lema.append(sentence_lema)

    # Take the dimensionality from the model rather than hard-coding it.
    embedding_dim = model.vector_size
    all_embeddings = np.zeros((len(all_sentences), embedding_dim))

    for i in range(len(all_sentences)):  # one embedding per kept sentence
        # The keyword is located in the lemmatised sentence; the same position is
        # then used to index the surface-form sentence.
        keyword_position = all_sentences_lema[i].index(keyword)

        centroid = np.zeros(embedding_dim)
        words_added = 0

        for word_index, word in enumerate(all_sentences[i]):
            # Skip the keyword itself and every word outside the context window.
            if word_index == keyword_position:
                continue
            if abs(word_index - keyword_position) >= min_neighbor_distance:
                continue
            try:
                centroid += model[word.lower()]
            except KeyError:
                # Word is not in the model vocabulary: leave it out of the average.
                continue
            words_added += 1

        if words_added == 0:
            # No context word had a vector; row i stays all-zero so that row
            # indices of all_embeddings keep matching all_sentences.
            continue

        # The sentence embedding is the mean of its context-word vectors.
        centroid /= words_added
        all_embeddings[i, :] = centroid

    break  # only the first keyword is processed
    
#~np.all(all_embeddings == 0, axis=1) #this checks if there are any only-zero rows meaning that we did not have any word embedding for that sentence

In [256]:
# Despite its name, `distances` stores cosine *similarities*: the larger the
# value, the more alike the two sentences. Keys are "i-j" with i < j, the row
# indices of the two embeddings being compared.
distances = {}
n_rows = all_embeddings.shape[0]
for i in range(n_rows):
    row_i = all_embeddings[i]
    for j in range(i + 1, n_rows):
        distances[f"{i}-{j}"] = cos_similarity(row_i, all_embeddings[j])

In [257]:
# Rebuild the mapping ordered by ascending score, so the least similar pairs
# come first and the most similar pairs come last.
distances = dict(sorted(distances.items(), key=lambda pair_and_score: pair_and_score[1]))

## Analyze the worst (least similar) sentence pairs

In [259]:
# Print the sentence pairs at the start of the sorted mapping, i.e. those with
# the lowest scores.
distances_keys = list(distances)
already_were = []  # sentences repeat, and the same combination is not interesting twice
count = 0
for key in distances_keys[:5]:
    first, second = (int(part) for part in key.split('-'))
    # Very short sentences are skipped.
    too_short = len(all_sentences[first]) < 4 or len(all_sentences[second]) < 4
    if too_short:
        continue

    first_sent = ' '.join(all_sentences[first])
    second_sent = ' '.join(all_sentences[second])
    pair_text = first_sent + second_sent
    if pair_text in already_were:
        continue

    already_were.append(pair_text)
    count += 1
    print(distances[key])
    print("1)" + first_sent + '\n2)' + second_sent)
    print('-----')
    if count == 20:
        break

0.21256955178948644
1)Takrat se je namreč srečanje končalo brez zadetkov obe moštvi pa sta se osredotočili predvsem na razbijanje nasprotnikovih napadov in branjenje svojega gola
2)MEDVODE GOLO BRDO 170 m2 na parceli 955 m2 l. 1990 mirno naselje možnost treh stanovanj
-----
0.21750516240049972
1)V DP je odigral 621 tekem dosegel 332 golov in 343 podaj
2)Gole veje brez listja izgledajo kot korenine in Britanci so zanj uporabili izraz upside down od zgoraj navzdol
-----
0.22010979275954332
1)Takrat se je namreč srečanje končalo brez zadetkov obe moštvi pa sta se osredotočili predvsem na razbijanje nasprotnikovih napadov in branjenje svojega gola
2)Tel. 07 497 5021 Prodam motokultivator frezo Gol doni Labin diesel 14 ks s priključki malo rabljen cena po ogledu in do govoru
-----
0.2306674055903341
1)10.20 Zmenkarije 10.50 Komedija zmešnjav 11.20 Odklop 12.40 Spet zaljubljena nan. 13.30 Mladi in nemirni nad. 14.20 Obala ljubezni nad. 15.40 Varuhi luke nan. 16.35 Korak za korakom nan. 17.05

## Analyze the best (most similar) sentence pairs

In [260]:
# Print the sentence pairs at the end of the sorted mapping, i.e. those with the
# highest scores.
distances_keys = list(distances)
already_were = []  # sentences repeat, and the same combination is not interesting twice
count = 0
for key in distances_keys[-3:]:
    first, second = (int(part) for part in key.split('-'))
    first_sent = ' '.join(all_sentences[first])
    second_sent = ' '.join(all_sentences[second])
    pair_text = first_sent + second_sent
    if pair_text not in already_were:
        count += 1
        already_were.append(pair_text)
        print(distances[key])
        print("1)" + first_sent + '\n2)' + second_sent)
        print('-----')
        if count == 20:
            break

0.9843211302518803
1)Edini gol je v 66. minuti dosegel Jugoslovan Predrag Mijatović
2)Edini gol je v 59. minuti dosegel Milan Purović rdeče beli so v 45. minuti zastreljali tudi najstrožjo kazen Koroman
-----
0.9880944053680162
1)Odločilni gol je v 48. minuti dosegel Oleg Saprikin iz navidez nenevarne akcije
2)Odločilni gol je v 72. minuti dosegel Castillo
-----
0.9946202709127231
1)Goli domači 38 gostje 51 skupaj 89 redni del 73 kazenski streli 16 najboljši strelci Horvat 4 Delamea Bojan Likar Mehič Dejan Simič Smiljanič 3 kartoni rumeni 26 rdeča 2 skupaj 28
2)Goli domači 41 gostje 36 skupaj 77 kartoni rumeni 24 rdeči 4 skupaj 28 najboljši strelci Horvat 6 Šnofl 5 Kristan Mihelič Stres 4 Kurtič Boštjan Uršič Vojsk 3
-----


## Construct the distance and similarity matrices

In [166]:
#print(all_embeddings.shape)
# Compute the pairwise cosine distances once and derive the similarities from
# the same matrix instead of running the pairwise computation a second time.
distance_matrix = pairwise_distances(all_embeddings, metric="cosine")
similarity_matrix = 1 - distance_matrix


#density plot
# Every entry of the square matrix is used, so the diagonal and both triangles
# are included.
similarity_matrix_flatten = similarity_matrix.reshape(len(similarity_matrix)**2)
# The bandwidth factor is passed at construction time. Assigning
# `covariance_factor` afterwards does not change the covariance that the
# estimator has already computed.
density = gaussian_kde(similarity_matrix_flatten, bw_method=0.5)
len(similarity_matrix_flatten)
#plt.plot(similarity_matrix_flatten, density(similarity_matrix_flatten))
#plt.show()

5022081

In [211]:
# Collect the indices of every sentence that takes part in at least one pair
# scoring above 0.90.
best_scores_indices = []
for pair_key, score in distances.items():
    if score > 0.90:
        best_scores_indices.extend(pair_key.split('-'))

# The indices are still strings when they are de-duplicated and are converted to
# integers only afterwards.
best_scores_indices = np.unique(best_scores_indices).astype(int)
all_embeddings_selected = all_embeddings[best_scores_indices, :]

distance_matrix_selected = pairwise_distances(all_embeddings_selected, metric="cosine")
len(all_embeddings_selected)

417

In [212]:
# Sentences for the selected rows, in the same order as best_scores_indices.
all_sentences_selected = [all_sentences[i] for i in best_scores_indices]

# Clustering

 ## 1. Agglomerative Clustering

In [242]:
def _fit_precomputed_agglomerative(distance_matrix, linkage, n_clusters):
    """Fit AgglomerativeClustering on a precomputed distance matrix.

    Newer scikit-learn releases take the distance specification as ``metric``
    and no longer accept ``affinity``; older releases only know ``affinity``.
    ``metric`` is tried first, with a fall back to ``affinity`` when the
    constructor rejects the keyword.

    Parameters
    ----------
    distance_matrix : square array of pairwise distances
    linkage : str
        Linkage criterion: 'complete', 'average' or 'single'.
    n_clusters : int
        Number of clusters to produce.

    Returns
    -------
    The fitted estimator.
    """
    try:
        estimator = AgglomerativeClustering(metric='precomputed', linkage=linkage, n_clusters=n_clusters)
    except TypeError:
        estimator = AgglomerativeClustering(affinity='precomputed', linkage=linkage, n_clusters=n_clusters)
    return estimator.fit(distance_matrix)


# Compare cluster sizes for several linkage settings. After the loop `clusters`
# holds the last configuration (complete linkage, 3 clusters).
for linkage, n_clusters in [('single', 2), ('average', 2), ('complete', 3)]:
    clusters = _fit_precomputed_agglomerative(distance_matrix_selected, linkage, n_clusters)
    print(Counter(clusters.labels_))

Counter({0: 415, 1: 2})
Counter({0: 293, 1: 124})
Counter({0: 224, 1: 112, 2: 81})


In [239]:
# List the sentences assigned to cluster 2 by the most recently fitted
# `clusters` model, prefixed with their row index in the selection.
for i, label in enumerate(clusters.labels_):
    if label != 2:
        continue
    print(i, ':', ' '.join(all_sentences_selected[i]))

7 : 9.00 Golf evropska turneja 9.30 Reklamna oddaja 10.00 Confederations Cup 12.00 Reklamna oddaja 12.30 Confederations Cup 14.30 NHL Stanley Cup 15.15 Pago Pago 16.00 Xapatan 16.30 Takeshi's Castle 17.00 Pago Pago 17.45 Košarka 18.00 DSF novinarski center 18.30 DSF reportaža 19.00 Dvoboj DSF kviz 19.40 DSF novinarski center 20.00 U 21 22.00 DSF novinarski center 22.15 Confederations Cup 23.15 Touchdown Gatorade nogometni magazin 23.45 Golf evropska turneja 0.45 Extreme Situations 1.15 Reklamna oddaja 1.45 Reklamna oddaja
17 : 8.30 Golf 9.30 Jadranje 10.00 Nogomet Prijateljska tekma 11.30 Avtomoto šport 12.30 Motociklizem VN Češke Brno 13.00 Motociklizem VN Češke Brno 14.00 Motociklizem VN Češke Brno 15.15 Motociklizem VN Češke Brno 16.30 Tenis WTA turnir Montreal Kanada 18.00 Motociklizem VN Češke Brno 19.00 Tenis WTA turnir Montreal Kanada 22.45 Rally Fia rally za SP Finska 23.00 Novice 23.15 Pikado 1.00 Rally Fia rally za SP Finska 1.15 Novice 1.30 Konec
22 : 8.30 Golf 9.30 Nogomet 

## 2. Spectral Clustering

In [245]:
#spectral = SpectralClustering(2).fit_predict(distance_matrix_selected)
# Cluster the selected embeddings into two groups. The random state is fixed so
# that re-running the notebook reproduces the same labels.
spectral = SpectralClustering(n_clusters=2, random_state=42).fit(all_embeddings_selected)
Counter(spectral.labels_)

Counter({1: 312, 0: 105})

In [246]:
# Show the cluster sizes, then list every sentence of cluster 0 followed by
# every sentence of cluster 1, each prefixed with its row index.
print(Counter(spectral.labels_))
for wanted_label in (0, 1):
    for i, label in enumerate(spectral.labels_):
        if label == wanted_label:
            print(i, ':', ' '.join(all_sentences_selected[i]))

Counter({1: 312, 0: 105})
3 : Volkswagen golf 1.4 16V letnik 1998 metalik srebrne barve reg 1 03 88.000 km oprema servo volan centralno zaklepanje radio 2x električna stekla ABS servisna knjiga meglenke kupljen v SLO cena 1.895.000 SIT
4 : Volkswagen golf 1.9 TDI letnik 1999 00 rdeče barve reg. 12 03 61000 km ser. knjiga oprema avtom klima 2 x airbag 2x elek stekla servo centralno zakl. ABS naslon za roko alu platišča meglenke deljiva klop radio kup v SLO cena 2.565.000 SIT
5 : Orciari Design ponuja tudi možnost predelave serijskih sedežev v anatomsko oblikovane športne sedeže za sedaj za avtomobile Porsche 993 996 Boxster Fiat Stilo Punto I in II BMW Z3 Peugeot 206 Toyota Corolla VW Golf III BMW Mini
6 : Volkswagen Golf 1.9 TDI Highline AM 1 02
9 : GOLF letnik 1989 prodam # 031 641 549
11 : S parkirnega prostora na Jesihovem stradonu je bil ukraden osebni avtomobil znamke VW golf 1.9 TDI srebrne barve z oznako na tablicah LJ 43 66R vreden približno 2,8 milijona tolarjev s Kardeljeve p

## 3. DBSCAN

In [247]:
# eps / min_samples are example values and can be tuned.
dbscan = DBSCAN(metric='cosine', eps=0.4, min_samples=3).fit(all_embeddings_selected)
labels = dbscan.labels_  # one label per row of all_embeddings_selected

# DBSCAN marks noise points with the label -1. Noise is not a cluster, so it is
# excluded from the cluster count and reported separately.
unique_labels = set(labels)
no_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
no_noise = int(np.sum(labels == -1))

print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)

print(Counter(labels))
# Print the (tokenised) sentences that were assigned to cluster 1.
for i, label in enumerate(labels):
    if label == 1:
        print(all_sentences_selected[i])

Estimated no. of clusters: 1
Estimated no. of noise points: 0
Counter({0: 417})
