# Word2Vec approach
## The model is trained on the full text of the docs, plus some 'reinforcement' docs containing only entities; the prediction phase is tested on entities-only documents

In [1]:
# needed libraries
import json
import random
#import numpy as np
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Word2Vec
import  gensim
from collections import Counter
from sklearn.cluster import DBSCAN

In [13]:
## Load documents from json
filename = 'clean_dataset.json'
# The dataset holds non-ASCII (Italian) text, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open(filename, 'r', encoding='utf-8') as dataset_file:
    docs = json.load(dataset_file)

print("File loaded correctly, type: ", type(docs), len(docs))
# NOTE: an earlier experiment joined each doc['result_entities'] list into a single
# string; it was disabled, so the field is left exactly as it was loaded.
print(docs[0])

File loaded correctly, type:  <class 'list'> 1569
{'fonte_dati': ['trend_analisys'], 'id': 'https://www.punto-informatico.it/fujitsu-si-separa-da-pc-e-mobile/', 'ta_id': [5], 'title': 'Fujitsu si separa da PC e mobile', 'abstract': '   Roma – Per guadagnare in efficienza e tentare di rincorrere una posizione più appetibile sul mercato mobile e sul mercato del PC, per affrontare anni di profondi cambiamenti per entrambi i settori, Fujitsu  ha annunciato  lo spinoff delle due divisioni dedicate l’una a notebook e PC e l’altra agli smartphone. \n Le due aziende, che nasceranno ufficialmente nel mese di febbraio del prossimo anno, consentiranno all’azienda “di chiarire le responsabilità nella gestione, di agevolare decisioni più rapide della dirigenza e di ottenere una maggiore efficienza”: aspetti fondamentali nel momento in cui la diffusione sempre più di massa e sempre più ubiqua di PC e smartphone “ha reso progressivamente sempre più difficile differenziarsi e ha reso sempre più serrat

# Create and train model
## Experiment: add some entities-only docs to the training corpus

In [22]:
import random

# the effect I want to create by adding entities only docs is to 'pull' vectors towards meaningful words 
# in a doc, without losing the standard context they appear into

train_corpus = [gensim.utils.simple_preprocess(doc['title'] + doc['abstract']) for doc in docs]
# no need to pre-process entities, just make-sure they're lower-cased
        
train_corpus = train_corpus + [doc['result_entities'] for doc in docs]

random.shuffle(train_corpus)


import multiprocessing

cores = multiprocessing.cpu_count()


# sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
# negative (int, optional) – If > 0, negative sampling will be used, 
# the int for negative specifies how many “noise words” should be drawn (usually between 5-20).
# every now and then we select a word and we ignore it by treating it as noise

epochs = 30
vec_size = 100
entities_alpha = 0.10  
abstract_alpha = 0.05 # here we have much more data
MODEL_NAME = 'TestModels/w2v_entities+abstract_model.model'

# let's introduce a higher min_count here, since we have a sufficient number of data

# Skip-gram
model = Word2Vec(size=vec_size, negative=5, hs=0, min_count=5, sample=0, 
        iter=epochs, workers=cores, sg = 1)


# build our vocabulary of words (all the unique words encountered inside our corpus, needed for training)
model.build_vocab(train_corpus)
print("Vocabulary created, number of known words: ", len(model.wv.vocab))

# train the models on the given data!

print("Training %s" % model)
%time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.iter)
model.save(MODEL_NAME)

print("Model Saved.")


Vocabulary created, number of known words:  11155
Training Word2Vec(vocab=11155, size=100, alpha=0.025)


  """Entry point for launching an IPython kernel.


CPU times: user 4min 10s, sys: 292 ms, total: 4min 11s
Wall time: 1min 7s
Model Saved.


In [2]:
# load model 
MODEL_NAME = 'TestModels/w2v_entities+abstract_model.model'
model = Word2Vec.load(MODEL_NAME)

# Print the model summary to confirm it loaded; the original `print(model.)` was a
# syntax error that prevented the cell from running at all.
print(model)

# Function to represent a doc given its entities

In [None]:
def infer_vector():
    """Given a list of entities, return the vector representing the document from which the
    entities were extracted, with respect to a given W2V model.

    TODO(review): the signature takes no parameters although this description refers to
    a list of entities and a model; confirm the intended arguments.
    """
    
    