# Word2Vec approach
## Model will be trained on whole docs text, plus some 'reinforced' docs containing only entities; prediction phase will be tested on entities-only documents

In [50]:
# needed libraries
import json
import random
#import numpy as np
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Word2Vec
import gensim
from collections import Counter
from sklearn.cluster import DBSCAN
import numpy as np

In [51]:
## Load documents from json
filename = 'clean_dataset.json'
# JSON text is UTF-8 by specification; state the encoding explicitly so the accented
# Italian text decodes identically on every platform instead of depending on the
# locale's default encoding.
with open(filename, 'r', encoding='utf-8') as out:
    docs = json.load(out)

print("File loaded correctly, type: ", type(docs), len(docs))
# we need a single string instead of a list in result_entities
# (disabled experiment, kept for reference only; `word_sum` would have to be reset
#  to '' for every doc before this could be re-enabled)
# for doc in docs:
#     if isinstance(doc['result_entities'], list):
#         for word in doc['result_entities']:
#             word_sum = word_sum + word + ' '
#         doc['result_entities'] = word_sum
print(docs[0])

File loaded correctly, type:  <class 'list'> 1569
{'fonte_dati': ['trend_analisys'], 'id': 'https://www.punto-informatico.it/fujitsu-si-separa-da-pc-e-mobile/', 'ta_id': [5], 'title': 'Fujitsu si separa da PC e mobile', 'abstract': '   Roma – Per guadagnare in efficienza e tentare di rincorrere una posizione più appetibile sul mercato mobile e sul mercato del PC, per affrontare anni di profondi cambiamenti per entrambi i settori, Fujitsu  ha annunciato  lo spinoff delle due divisioni dedicate l’una a notebook e PC e l’altra agli smartphone. \n Le due aziende, che nasceranno ufficialmente nel mese di febbraio del prossimo anno, consentiranno all’azienda “di chiarire le responsabilità nella gestione, di agevolare decisioni più rapide della dirigenza e di ottenere una maggiore efficienza”: aspetti fondamentali nel momento in cui la diffusione sempre più di massa e sempre più ubiqua di PC e smartphone “ha reso progressivamente sempre più difficile differenziarsi e ha reso sempre più serrat

# Create and train model
## experiment: try to insert some entities-only docs in training corpus

In [59]:
import random
def lower_case_list(list_):
    """Lower-case every string of ``list_`` in place and return that same list.

    No copy is made: the list object passed in is the one that gets modified
    and returned, so the caller's data ends up lower-cased as well.
    """
    for position in range(len(list_)):
        list_[position] = list_[position].lower()
    return list_
# the effect I want to create by adding entities only docs is to 'pull' vectors towards meaningful words 
# in a doc, without losing the standard context they appear into

# lower case training corpus too, so we don't have differences between this and entities
train_corpus = [gensim.utils.simple_preprocess(doc['title'].lower() + doc['abstract'].lower()) for doc in docs]
# no need to pre-process entities, just make-sure they're lower-cased
        
train_corpus = train_corpus + [lower_case_list(doc['result_entities']) for doc in docs]

random.shuffle(train_corpus)


import multiprocessing

cores = multiprocessing.cpu_count()


# sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
# negative (int, optional) – If > 0, negative sampling will be used, 
# the int for negative specifies how many “noise words” should be drawn (usually between 5-20).
# every now and then we select a word and we ignore it by treating it as noise

epochs = 30
vec_size = 100
entities_alpha = 0.10  
abstract_alpha = 0.05 # here we have much more data
MODEL_NAME = 'TestModels/w2v_entities+abstract_model.model'

# let's introduce a higher min_count here, since we have a sufficient number of data

# Skip-gram
model = Word2Vec(size=vec_size, negative=5, hs=0, min_count=5, sample=0, 
        iter=epochs, workers=cores, sg = 1)


# build our vocabulary of words (all the unique words encountered inside our corpus, needed for training)
model.build_vocab(train_corpus)
print("Vocabulary created, number of known words: ", len(model.wv.vocab))

# train the models on the given data!

print("Training %s" % model)
%time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.iter)
model.save(MODEL_NAME)

print("Model Saved.")


Vocabulary created, number of known words:  10797
Training Word2Vec(vocab=10797, size=100, alpha=0.025)


  """Entry point for launching an IPython kernel.


CPU times: user 4min 16s, sys: 1.73 s, total: 4min 18s
Wall time: 2min 27s
Model Saved.


In [60]:
# load model 
MODEL_NAME = 'TestModels/w2v_entities+abstract_model.model'
model = Word2Vec.load(MODEL_NAME)
# simple model testing: nearest neighbours of two well-known words.
# Similarity queries go through `model.wv`; calling most_similar on the model object
# itself is deprecated in gensim 3.x and no longer available in gensim 4.
print(model.wv.most_similar('terra', topn=13))
print(model.wv.most_similar('roma', topn = 13))

[('marte', 0.646645188331604), ('extraterrestre', 0.6415097713470459), ('sistema solare', 0.6046188473701477), ('pianeta', 0.5946462154388428), ('iss', 0.5901428461074829), ('flora', 0.5829756855964661), ('sonda spaziale', 0.5825291872024536), ('marte (astronomia)', 0.580169677734375), ('fiume', 0.579222559928894), ('meteorite', 0.5746831297874451), ('venere', 0.568903386592865), ('rosso', 0.5658887028694153), ('luna', 0.5651825666427612)]
[('torino', 0.6039671897888184), ('emilia', 0.5589956045150757), ('sapienza', 0.5537577867507935), ('reggio', 0.5534363389015198), ('fiumicino', 0.5531430244445801), ('fiera', 0.534198522567749), ('padova', 0.5295882225036621), ('verona', 0.5231437683105469), ('milano', 0.5223691463470459), ('bologna', 0.5136139392852783), ('lazio', 0.5101860761642456), ('piaggio', 0.5095441341400146), ('sicilia', 0.508764386177063)]


  """
  if np.issubdtype(vec.dtype, np.int):
  


# Function to represent a doc given its entities
Averaging the entity vectors seems to perform better, especially because we can compute the distance between 
two documents defined by the same entities (also, the result does not depend on the order of the entities).

In [19]:
def mean_of_vectors(vectors):
    """Return the plain element-wise mean of a non-empty list of vectors.

    vectors: list of vectors; the first one fixes the shape of the zero
    accumulator the others are added to, so an empty list raises IndexError.
    """
    accumulator = np.zeros(np.shape(vectors[0]))
    # the built-in sum adds the vectors one after the other on top of the accumulator
    return sum(vectors, accumulator) / len(vectors)

def infer_vector(entities, model):
    """Given a list of entities, returns the vector representing the document from which the
    entities were extracted, wrt a given W2V model.

    The document vector is the mean (see mean_of_vectors) of the word vectors of all the
    entities the model knows; unknown entities are skipped.

    entities: list of entities, our way of representing a document.
    model: w2v model; anything exposing its vectors through `.wv`, or a mapping
           word -> vector that raises KeyError for unknown words.

    Raises IndexError (from mean_of_vectors) when none of the entities is known to the model.
    """
    # Word2Vec models expose their vectors through `.wv` (indexing the model object itself is
    # deprecated in gensim 3.x); fall back to `model` so a plain word -> vector mapping still works.
    word_vectors = getattr(model, 'wv', model)

    # get word vector of each entity; ignores word if the model does not know it
    entities_vecs = []
    for e in entities:
        try:
            entities_vecs.append(word_vectors[e])
        except KeyError:
            # only "unknown word" is expected here; any other error should surface
            continue

    return mean_of_vectors(entities_vecs)
    

In [102]:
# usage test: cosine similarity between document vectors built from entity lists
import sklearn.metrics.pairwise as sk

space_doc = ['terra', 'sole', 'spazio']
b = infer_vector(space_doc, model)
#print(model.similarity('milano', 'roma'))
# first an unrelated document, then a related (space-themed) one, both compared with `b`
for other_doc in (['roma', 'blockchain', 'finanza', 'politica'],
                  ['marte', 'stella', 'spazio', 'meteora']):
    a = infer_vector(other_doc, model)
    print(sk.cosine_similarity([a], [b]))

# same entities in a different order
a = infer_vector(['sole', 'spazio', 'terra'], model)
b = infer_vector(space_doc, model)
print(sk.cosine_similarity([a], [b]))

[[0.35007365]]
[[0.81275087]]
[[1.]]


  


# Pairwise distance approach
Instead of defining a way to represent a doc, we define a distance between docs (seen as sets of word vectors),
very much like it is done in Hierarchical Clustering with Group Averaging.
Finding the eps value will be easier, but computing this matrix might be costly for a high number of vectors.

In [93]:
# how to initialize a matrix: start from an all-zero 2x2 array, then set one entry
a = np.zeros(shape=(2, 2))
a[1][1] = 1
print(a)

import numpy as np
import sklearn.metrics.pairwise as sk # for cosine_distance
# TODO: add possibility of passing metric to use as parameters
def get_pairwise_distances_matrix(docs, model, verbose = False):
    """
        docs: list of documents, each represented as a list of entities.
        model: w2v model used to fetch the representation of entities as word vectors.
        verbose: print out operations.

        Returns the pairwise distances matrix between documents, as an
        (len(docs), len(docs)) numpy array.
        Distance between 2 docs will be computed by averaging the distances between all words
        composing the 2 documents.
        Metric used is the one used in group_averaging_distance.
    """
    # initialize distance matrix
    n = len(docs)
    distances_m = np.zeros((n, n))

    # un-wrap each set of entities (doc) into its word vectors ONCE, instead of looking
    # both documents up again for every (i, j) pair
    docs_vectors = [entities_vector(doc, model) for doc in docs]

    # compute the distance between them all
    # this is all but efficient at the moment, okay for a debug version.
    for i, doc1 in enumerate(docs):
        if verbose: print("##Calculating distances from ", doc1)
        for j, doc2 in enumerate(docs):
            distance = group_averaging_distance(docs_vectors[i], docs_vectors[j])
            # reduce the result to a plain scalar explicitly: this works whether the helper
            # hands back a number or a single-element array, without relying on numpy's
            # implicit array-to-scalar conversion on assignment
            distances_m[i, j] = np.asarray(distance).item()
            if verbose: print("Distance between %s and %s: %s"%(doc1, doc2, distances_m[i, j]))
    return distances_m
    
def _unit_rows(vectors):
    """Stack `vectors` into a 2-D float array and scale every row to unit L2 norm."""
    matrix = np.asarray(vectors, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # all-zero rows are left untouched instead of being divided by zero
    norms[norms == 0.0] = 1.0
    return matrix / norms


def group_averaging_distance(doc1, doc2):
    """
        Computes and returns the distance between 2 'sets'/lists of vectors,
        by computing the distance between each vector in doc1 and every vector in doc2,
        and averaging all these distances.
        Metric used to compute distance is the cosine distance (1 - cosine similarity,
        clipped to [0, 2]).

        doc1, doc2: non-empty lists of vectors, all of the same length.

        Returns a plain Python float.
        Raises ZeroDivisionError when either document has no vectors: there is no pair
        to average over.
    """
    if len(doc1) == 0 or len(doc2) == 0:
        raise ZeroDivisionError(
            "group_averaging_distance() needs at least one vector in each document")

    # one matrix product yields the cosine similarity of every (vec1, vec2) pair at once
    similarities = np.dot(_unit_rows(doc1), _unit_rows(doc2).T)
    distances = np.clip(1.0 - similarities, 0.0, 2.0)
    # mean over the len(doc1) * len(doc2) pairs
    return float(distances.mean())

def entities_vector(doc, model):
    """
    doc: document, represented as a list of entities.
    model: w2v model used to fetch the representation of each entity; anything exposing
           its vectors through `.wv`, or a mapping word -> vector that raises KeyError
           for unknown words.

    Given these two arguments, returns a list of vectors, each vector representing
    an entity word.
    In case the model does NOT know a word in the list, the word is reported on
    standard output and ignored.
    Might return an empty list.
    """
    # Word2Vec models expose their vectors through `.wv` (indexing the model object itself is
    # deprecated in gensim 3.x); fall back to `model` so a plain word -> vector mapping still works.
    word_vectors = getattr(model, 'wv', model)

    list_ = []
    for word in doc:
        try:
            list_.append(word_vectors[word])
        except KeyError:
            # only "unknown word" is expected here; any other error should surface
            print("Unknown word found!", word)
    return list_
    

[[0. 0.]
 [0. 1.]]


In [99]:
# test 
# compare the space-themed document first with an unrelated one, then with a related one
space_doc = ['terra', 'sole', 'spazio']
for other_doc in (['roma', 'blockchain', 'finanza', 'politica'],
                  ['marte', 'stella', 'spazio', 'meteora']):
    a = get_pairwise_distances_matrix([space_doc, other_doc], model, verbose=True)
    print(a)
#print(group_averaging_distance([model['roma'], model['milano']], [model['roma'], model['milano']]))

##Calculating distances from  ['terra', 'sole', 'spazio']
Distance between ['terra', 'sole', 'spazio'] and ['terra', 'sole', 'spazio']: 0.41432106494903564
Distance between ['terra', 'sole', 'spazio'] and ['roma', 'blockchain', 'finanza', 'politica']: 0.8169827461242676
##Calculating distances from  ['roma', 'blockchain', 'finanza', 'politica']
Distance between ['roma', 'blockchain', 'finanza', 'politica'] and ['terra', 'sole', 'spazio']: 0.8169827461242676
Distance between ['roma', 'blockchain', 'finanza', 'politica'] and ['roma', 'blockchain', 'finanza', 'politica']: 0.5215639472007751
[[0.41432106 0.81698275]
 [0.81698275 0.52156395]]
##Calculating distances from  ['terra', 'sole', 'spazio']
Distance between ['terra', 'sole', 'spazio'] and ['terra', 'sole', 'spazio']: 0.41432106494903564
Distance between ['terra', 'sole', 'spazio'] and ['marte', 'stella', 'spazio', 'meteora']: 0.5227131247520447
##Calculating distances from  ['marte', 'stella', 'spazio', 'meteora']
Distance between 

