In [1]:
import zipfile
import gensim
import numpy as np
import json
import random
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import AgglomerativeClustering, SpectralClustering, DBSCAN
from collections import Counter
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
#import classla
#import fasttext.util

In [2]:
#Some utility functions which are used
def cos_similarity(x, y):
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def remove_diplicate_sentences(a):
    b_set = set(map(tuple,a))  #need to convert the inner lists to tuples so they are hashable
    b = list(map(list,b_set)) #Now convert tuples back into lists (maybe unnecessary?)
    return b

## Loading word2vec skipgram model

In [3]:
repository = "../../"
with zipfile.ZipFile(repository + "/67.zip", "r") as archive:
    stream = archive.open("model.txt")
    model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=False, unicode_errors='replace')

In [4]:
#testing
result = model.similar_by_word("legenda")
most_similar_key, similarity = result[0]  
print(f"{most_similar_key}: {similarity:.4f}")

picokih: 0.6543


## Loading all sentences

In [None]:
with open('../ccGigaFida/results/data.json') as json_file:
    data = json.load(json_file)
with open('../ccGigaFida/results/data_lema.json') as json_file:
    data_lema = json.load(json_file)
with open('../ccGigaFida/results/data_pos.json') as json_file:
    data_len = json.load(json_file)

words = np.load("../ccGigaFida/words.npy")
words

In [68]:
for keyword in ['gol']:
    all_sentences=data[keyword][0:1000]
    all_sentences_lema=data_lema[keyword][0:1000]
    
    #Remove duplicate sentences if any
    #all_sentences=remove_diplicate_sentences(all_sentences)
    #all_sentences_lema=remove_diplicate_sentences(all_sentences_lema)
    
    all_embeddings=np.zeros((len(all_sentences),100))
    
    for i in range(len(all_sentences)): #iterate through the sentences for the given keyword
        keyword_position = all_sentences_lema[i].index(keyword)
        #all_sentences[i].pop(keyword_position)
        
        centroid=np.zeros(100)
        words_added=0
        
        for word_index, word in enumerate(all_sentences[i]): #iterate through the words in the sentence
            try:
                if word_index!=keyword_position:
                    centroid+=model[word.lower()] 
                    words_added+=1
            except:
                ...
                
        if words_added==0: #there are wierd sentences where only the keyword is the whole sentence
            continue
            
        centroid/=words_added
        all_embeddings[i,:]=centroid
        
    
    break #this break means that we terminate on the first word
    
#~np.all(all_embeddings == 0, axis=1) this checks if there are any only-zero rows meaning that we did not have any word embedding for that sentence

In [69]:
distances={}
for i in range(all_embeddings.shape[0]):
    for j in range(i+1,all_embeddings.shape[0],1):
        distances[str(i)+'-'+str(j)] = cos_similarity(all_embeddings[i,:], all_embeddings[j,:])

## Analyze worst 50 distances

In [None]:
distances_keys = list(distances.keys())
already_were = [] #to je zato ker se stavke ponavljajo in ne zanimajo nas iste kombinacije znova
count=0
for key in distances_keys[0:100]:
    first, second = map(int, key.split('-'))
    first_sent, second_sent = ' '.join(all_sentences[first]), ' '.join(all_sentences[second])
    
    if first_sent+second_sent in already_were:
        continue
    
    count+=1
    already_were.append(first_sent+second_sent)
    print(first_sent+'\n'+second_sent)
    print('-----')
    if count==10:
        break

## Analyze best 50 distances

In [None]:
distances_keys = list(distances.keys())
already_were = [] #to je zato ker se stavke ponavljajo in ne zanimajo nas iste kombinacije znova
count=0
for key in distances_keys[-1000:]:
    first, second = map(int, key.split('-'))
    first_sent, second_sent = ' '.join(all_sentences[first]), ' '.join(all_sentences[second])
    if first_sent+second_sent in already_were:
        continue
    
    count+=1
    already_were.append(first_sent+second_sent)
    print(first_sent+'\n'+second_sent)
    print('-----')
    if count==10:
        break

## Construct similarity distance matrix

In [56]:
#print(all_embeddings.shape)
distance_matrix=(pairwise_distances(all_embeddings,metric="cosine"))
similarity_matrix=1-(pairwise_distances(all_embeddings,metric="cosine"))


#density plot
similarity_matrix_flatten = similarity_matrix.reshape(len(similarity_matrix)**2)
density = gaussian_kde(similarity_matrix_flatten)
density.covariance_factor = lambda : .5
len(similarity_matrix_flatten)
plt.plot(similarity_matrix_flatten, density(similarity_matrix_flatten))
plt.show()


In [75]:
best_distances_indices=[]

best_scores_indices=[]
for i in list(distances.keys()):
    first, second = i.split('-')
    if distances[i]>0.9:
        best_scores_indices.append(first)
        best_scores_indices.append(second)


best_scores_indices=(np.unique(best_scores_indices))
best_scores_indices = best_scores_indices.astype(int)
all_embeddings_selected = all_embeddings[best_scores_indices, :]

distance_matrix=(pairwise_distances(all_embeddings_selected,metric="cosine"))

In [55]:
all_sentences_selected = []
for i, in best_scores_indices:
    all_sentences_selected.append(all_sentences[i])

## Clustering

In [57]:
clusters = AgglomerativeClustering(affinity='precomputed', linkage='complete', n_clusters=2).fit(distance_matrix)
Counter(clusters.labels_)

Counter({0: 291, 1: 427})

In [63]:
for i,label in enumerate(clusters.labels_):
    if label==0:
        print(i,':', ' '.join(all_sentences_selected[i]))

0 : Naši politični emigranti ki so pribežali v Argentino da so si rešili golo življenje so imeli namreč onstran vse organizacijske dovršenosti onstran in globlje od vseh odlik kvalitetnega javnega dela onstran in globlje pod vsem tem so imeli izvir gibalo in vzrok svojega poslanstva
2 : Slečejo ga do golega in ga od glave do peta premažejo z grafitno mastjo to naj bi zmedlo policijske pse
3 : Po tem nosu sem jo velikokrat skupil še zlasti na Golem otoku zdaj pa najhuje ker ni več simetričen
10 : Antinori pa bo verjetno želel več od golega zagotovila da bo ženska ki nosi v sebi klon zarodek smela splaviti
12 : Z enako potezo se je izkazal še v sodnikovem podaljšku ko je najprej Ipavec zgrešil prazen gol nato pa Mejač iz neposredne bližine žoge ni usmeril mimo najboljšega igralca tekme
13 : Obenem je Stajić tudi strelec edinega gola za Publikum od tistih igralcev ki jih ima Pocrnjič na voljo
14 : Najboljša obramba Angleži so prejeli le en gol proti najboljšemu napadu Brazilci so jih dose

In [27]:
spectral = SpectralClustering(2).fit_predict(distance_matrix)
Counter(spectral)



Counter({0: 998, 1: 2})

In [31]:
dbscan = DBSCAN(metric='cosine', eps=0.4, min_samples=3).fit(all_embeddings)  # you can change these parameters, given just for example 
labels = dbscan.labels_ # where X - is your matrix, where each row corresponds to one document (line) from the docs, you need to cluster 
#cluster_labels
no_clusters = len(np.unique(labels) )
no_noise = np.sum(np.array(labels) == -1, axis=0)

print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)

print(Counter(labels))
#all_sentences[np.where(labels == 1)]
for i,label in enumerate(labels):
    if label==1:
        print(all_sentences[i])

Estimated no. of clusters: 2
Estimated no. of noise points: 3
Counter({0: 997, -1: 3})
