In [1]:
        ### imports
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Doc2Vec imports
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
import json # to open our data file


## JSON data import

In [4]:

DATA_FILENAME = "dump_solr.json"
DATA_FILENAME2 = "trend_analisys.json"

# Read both dumps with an explicit encoding. The articles contain accented
# characters, and without `encoding=` open() falls back to the platform's
# locale encoding, which is not UTF-8 everywhere.
# NOTE(review): assumes both files are UTF-8 encoded (the JSON default) - confirm.
with open(DATA_FILENAME, "r", encoding="utf-8") as json_file:
    json_data = json.load(json_file)

# 'old' json file
with open(DATA_FILENAME2, "r", encoding="utf-8") as json_file:
    json_data_old = json.load(json_file)

# The new dump is a JSON object, the old one is a JSON array; fail early with a
# readable message if either file has an unexpected top-level shape.
assert isinstance(json_data, dict), "expected a JSON object in " + DATA_FILENAME
assert isinstance(json_data_old, list), "expected a JSON array in " + DATA_FILENAME2

In [5]:
## The documents of the new dump live under response{} ---> docs[]
docs = json_data['response']['docs']
print("Number of documents in new json: ",len(docs))
print("Number of documents in old json: ",len(json_data_old))

# Use both dumps: concatenate them into a single (new) list.
docs = docs + json_data_old
print(len(docs))

# Docs from ilpost.it seem to have a strange title format, so they are left out.
# Rather than deleting in place, rebuild the list keeping only the valid items.
docs = [entry for entry in docs if 'ilpost' not in entry['url'][0]]

print("New length: ", len(docs))

# Sanity check: nothing should be printed here, every ilpost entry is gone.
for leftover in (entry for entry in docs if 'ilpost' in entry['url'][0]):
    print(leftover)

Number of documents in new json:  1377
Number of documents in old json:  293
1670
New length:  1655


## Adjust data format
Some fields (`title`, `abstract`, `flattened_entities`) may be stored as single-item lists; convert them to plain strings.

In [6]:
# Some documents store these fields as a list; replace such a list with its
# first element so that every document holds a plain value for them.
for doc in docs:
    for field in ('title', 'abstract', 'flattened_entities'):
        value = doc[field]
        if isinstance(value, list):
            doc[field] = value[0]

print(docs[:10])

[{'fonte_dati': ['trend_analisys'], 'id': 'https://www.punto-informatico.it/fujitsu-si-separa-da-pc-e-mobile/', 'ta_id': [5], 'title': 'Fujitsu si separa da PC e mobile', 'abstract': '   Roma – Per guadagnare in efficienza e tentare di rincorrere una posizione più appetibile sul mercato mobile e sul mercato del PC, per affrontare anni di profondi cambiamenti per entrambi i settori, Fujitsu  ha annunciato  lo spinoff delle due divisioni dedicate l’una a notebook e PC e l’altra agli smartphone. \n Le due aziende, che nasceranno ufficialmente nel mese di febbraio del prossimo anno, consentiranno all’azienda “di chiarire le responsabilità nella gestione, di agevolare decisioni più rapide della dirigenza e di ottenere una maggiore efficienza”: aspetti fondamentali nel momento in cui la diffusione sempre più di massa e sempre più ubiqua di PC e smartphone “ha reso progressivamente sempre più difficile differenziarsi e ha reso sempre più serrata la competizione con i nuovi produttori globali”

## Delete duplicates

In [13]:
# Drop exact duplicates, keeping the first occurrence of every document and
# preserving the original order.
#
# The previous version deleted items from `docs` while iterating over it and
# removed only the *first* later copy of each document, so anything occurring
# three or more times still left duplicates behind. Building a fresh list
# avoids both problems. Documents are dicts (unhashable), hence the `in` test
# on a list, which relies on the same `==` comparison as before.
unique_docs = []
for dictionary in docs:
    if dictionary not in unique_docs:
        unique_docs.append(dictionary)

counter = len(docs) - len(unique_docs)  # how many entries were dropped
docs = unique_docs
print("Number of duplicates: ", counter)
print("New length: ", len(docs))

Number of duplicates:  85
New length:  1570


## Build Train corpus for both Models
using entire dataset

In [14]:
FLATTENED_ENTITIES_FIELD_NAME = 'flattened_entities'
TITLE_FIELD_NAME = 'title'
ABSTRACT_FIELD_NAME = 'abstract'

# Go through the entire data-set and create one TaggedDocument per article from
# its title + abstract, tokenised with gensim's simple_preprocess. The tag is
# the article's position in `docs`.
# Title and abstract are joined with a space: plain concatenation would glue the
# last word of the title to the first word of the abstract whenever the abstract
# does not start with whitespace, producing a bogus token.
train_corpus_abstract = [
    gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(d[TITLE_FIELD_NAME] + " " + d[ABSTRACT_FIELD_NAME]),
        [i])
    for i, d in enumerate(docs)]

# Same thing, but built from the flattened entities field.
train_corpus_entities = [
    gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(d[FLATTENED_ENTITIES_FIELD_NAME]),
        [i])
    for i, d in enumerate(docs)]

print("Abstract example: \n",train_corpus_abstract[:1])
print("F_Entities example: \n",train_corpus_entities[:1])

Abstract example: 
 [TaggedDocument(words=['fujitsu', 'si', 'separa', 'da', 'pc', 'mobile', 'roma', 'per', 'guadagnare', 'in', 'efficienza', 'tentare', 'di', 'rincorrere', 'una', 'posizione', 'più', 'appetibile', 'sul', 'mercato', 'mobile', 'sul', 'mercato', 'del', 'pc', 'per', 'affrontare', 'anni', 'di', 'profondi', 'cambiamenti', 'per', 'entrambi', 'settori', 'fujitsu', 'ha', 'annunciato', 'lo', 'spinoff', 'delle', 'due', 'divisioni', 'dedicate', 'una', 'notebook', 'pc', 'altra', 'agli', 'smartphone', 'le', 'due', 'aziende', 'che', 'nasceranno', 'ufficialmente', 'nel', 'mese', 'di', 'febbraio', 'del', 'prossimo', 'anno', 'consentiranno', 'all', 'azienda', 'di', 'chiarire', 'le', 'responsabilità', 'nella', 'gestione', 'di', 'agevolare', 'decisioni', 'più', 'rapide', 'della', 'dirigenza', 'di', 'ottenere', 'una', 'maggiore', 'efficienza', 'aspetti', 'fondamentali', 'nel', 'momento', 'in', 'cui', 'la', 'diffusione', 'sempre', 'più', 'di', 'massa', 'sempre', 'più', 'ubiqua', 'di', 'pc', 

## Model Training with more data
Here we train four models on the merged, de-duplicated data set:
    2 models for the title+abstract approach (PV-DM and PV-DBOW)
    2 models for the flattened_entities approach (PV-DM and PV-DBOW)
credits(gensim tutorial): https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [18]:
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"


# let's try training two models at once: Paragraph Vector - Distributed Memory (PV-DM), just like CBOW to W2V
# and Paragraph Vector - Distributed Bag of Words (PV-DBOW), analogous to W2V Skip-gram
epochs = 40
vec_size = 100
entities_alpha = 0.10  
abstract_alpha = 0.05 # here we have much more data
ABSTRACT_MODEL_NAME = "TestModels/d2v_TA_abstract&title"
ENTITIES_MODEL_NAME = "TestModels/d2v_TA_flattened_entities"

abstract_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=2, sample=0, 
            epochs=epochs, workers=cores, comment='abstract-model'),
    # PV-DM w/ default averaging
    Doc2Vec(dm=1, vector_size= vec_size, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs= epochs, workers=cores, alpha= abstract_alpha, comment='alpha=0.05, abstract-model'),
]

entities_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=2, sample=0, 
            epochs=epochs, workers=cores, comment = 'entities-model'),
    # PV-DM w/ default averaging
    Doc2Vec(dm=1, vector_size= vec_size, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs= epochs, workers=cores, alpha= entities_alpha, comment='alpha=0.1, entities-model'),
]

# build our vocabulary of words (all the unique words encountered inside our corpus, needed for training)
for a_model in abstract_models:
    a_model.build_vocab(train_corpus_abstract)
for e_model in entities_models:
    e_model.build_vocab(train_corpus_entities)
print("Vocabulary created!")

# train the models on the given data!
counter = 0
# Train Abstract Models
for a_model in abstract_models:
    print("Training %s" % a_model)
    %time a_model.train(train_corpus_abstract, total_examples=len(train_corpus_abstract), epochs=a_model.epochs)
    a_model.save(ABSTRACT_MODEL_NAME+str(counter)+'.model')
    counter = counter + 1
# Train Flattened_Entities Model
counter = 0
for e_model in entities_models:
    print("Training %s" % e_model)
    %time e_model.train(train_corpus_entities, total_examples=len(train_corpus_entities), epochs=e_model.epochs)
    e_model.save(ENTITIES_MODEL_NAME+str(counter)+'.model')
    counter = counter + 1
print("Models Saved")


Vocabulary created!
Training Doc2Vec("abstract-model",dbow,d100,n5,mc2,t4)
CPU times: user 1min 5s, sys: 552 ms, total: 1min 6s
Wall time: 18.8 s
Training Doc2Vec("alpha=0.05, abstract-model",dm/m,d100,n5,w10,mc2,t4)
CPU times: user 1min 45s, sys: 648 ms, total: 1min 46s
Wall time: 29.2 s
Training Doc2Vec("entities-model",dbow,d100,n5,mc2,t4)
CPU times: user 3.47 s, sys: 180 ms, total: 3.65 s
Wall time: 2.03 s
Training Doc2Vec("alpha=0.1, entities-model",dm/m,d100,n5,w10,mc2,t4)
CPU times: user 5.11 s, sys: 304 ms, total: 5.41 s
Wall time: 2.73 s
Models Saved


# Visualizing Results

## PCA 2D Reduction

In [19]:
def pcaReduction(model = None):
    """Project every document vector of a trained Doc2Vec model onto 2 dimensions.

    The document vectors are standardised (zero mean, unit variance per feature)
    and then reduced with a 2-component PCA.

    Parameters
    ----------
    model : Doc2Vec
        Trained model whose `docvecs` are read. Required, despite the `None`
        default kept for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        One row per document vector, in `docvecs` index order, with columns
        'principal component 1' and 'principal component 2'.

    Raises
    ------
    ValueError
        If `model` is None.
    """
    if model is None:
        raise ValueError("pcaReduction needs a trained Doc2Vec model, got None")

    # Collect all document vectors; each one has size == model.vector_size.
    doc_vectors = np.array([model.docvecs[i] for i in range(len(model.docvecs))])

    # PCA is affected by scale, so the features are standardised first.
    scaled_vectors = StandardScaler().fit_transform(doc_vectors)

    # The two new components are the two main directions of variation.
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(scaled_vectors)

    return pd.DataFrame(data=principal_components,
                        columns=['principal component 1', 'principal component 2'])

## Interactive Graph with Plotly

In [21]:
import os

import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

# Credentials must never be hard-coded in a notebook: read them from the
# environment instead (a KeyError here means the variables are not set).
# The API key that used to be committed in this cell should be treated as
# compromised and regenerated.
tls.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                         api_key=os.environ['PLOTLY_API_KEY'])

## Choose model to show by loading it first

In [30]:
ABSTRACT_MODEL_NAME = "TestModels/d2v_TA_abstract&title"
ENTITIES_MODEL_NAME = "TestModels/d2v_TA_flattened_entities"

# Name under which the graph is stored on plotly; change it for every new graph.
GRAPH_NAME = "TestModel_Abtract"

# Load the first abstract model saved on disk (file suffix 0).
model_name = "{0}{1}.model".format(ABSTRACT_MODEL_NAME, 0)
model = Doc2Vec.load(model_name)
print('Model correctly loaded')

# 2D PCA projection of the document vectors, as a pandas DataFrame.
finalDf = pcaReduction(model)

Model correctly loaded


## Visualize Data

In [31]:
# we'll draw a scatter graph with labels
traces = []
# let's get the labels
titles = [dictionary['title'] for dictionary in docs]

# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned

#we can't plot all points, so we'll select a subsample of them
for i in range(600, len(finalDf)):
    x , y = finalDf.iat[i, 0], finalDf.iat[i, 1]
    # let's highlight some articles we think should be closer
    color = 'blue'
    text = train_corpus_abstract[i].words
    for social in ['facebook', 'instagram', 'linkedn', 'social_network']:
        if social in text:
            color = 'green'
    trace0 = go.Scatter(
        x = [x],
        y = [y],
        mode = 'markers',
        #name = 'blue markers',
        marker = dict(
            size = 7,
            color = color,
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

data = traces 
layout = dict(title = 'PCA Representantion of Abstract Model',
        hovermode= 'closest',
        xaxis= dict(
            title= 'first component',
            ticklen= 5,
            gridwidth= 2,
        ),
        yaxis=dict(
            title= 'second component',
            ticklen= 5,
            gridwidth= 2,
        ),
        showlegend = False
    )
# Plot and embed in ipython notebook!
fig = dict(data = data, layout = layout)
py.iplot(fig, filename=GRAPH_NAME)

In [28]:
ABSTRACT_MODEL_NAME = "TestModels/d2v_TA_abstract&title"
ENTITIES_MODEL_NAME = "TestModels/d2v_TA_flattened_entities"

# Name under which the graph is stored on plotly; change it for every new graph.
GRAPH_NAME = "TestModel_Entities"

# Load the second entities model saved on disk (file suffix 1).
model_name = "{0}{1}.model".format(ENTITIES_MODEL_NAME, 1)
model = Doc2Vec.load(model_name)
print('Model correctly loaded')

# 2D PCA projection of the document vectors, as a pandas DataFrame.
finalDf = pcaReduction(model)

Model correctly loaded


# Model Testing

In [34]:
import random

# The test documents come from the title+abstract corpus, so the model must be
# an abstract model too. Load it explicitly instead of relying on whichever
# `model` the last executed "load" cell happened to leave behind (running the
# notebook top to bottom would otherwise leave the entities model here).
train_corpus = train_corpus_abstract
model = Doc2Vec.load(ABSTRACT_MODEL_NAME+'0'+'.model')

# Pick a random document from the train corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)

inferred_vector = model.infer_vector(train_corpus[doc_id].words)
similar_docs = model.docvecs.most_similar([inferred_vector], topn=3)

# Show the picked document, then the titles of its 3 most similar documents.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
for doc_tag, similarity in similar_docs:
    print("\nSimilar Doc-->(doctag:{0},score:{1}):<<{2}>>".format(doc_tag, similarity, titles[doc_tag]))

Test Document (23): «teamviewer punto informatico gestione file computer utilità assistenza da remoto in tutta sicurezza tutti capita di avere un parente spesso in avanti con gli anni che ha continuamente bisogno di una mano col proprio computer che chiede aiuto dicendo pensaci tu tanto sai come funziona ci metti un attimo magari la soluzione al problema davvero questione di un attimo ma ovviamente si perde un sacco di tempo anche solo per raggiungere il computer la cosa più comoda sarebbe riparare il problema distanza con un sistema di desktop remoto questi sistemi però possono essere relativamente complicati da utilizzare per chi non pratico eccezione rappresentata da teamviewer il programma pensato proprio per dare assistenza remota nel modo più semplice sicuro possibile utilizzo del programma gratuito per uso personale mentre per le aziende richiesto acquisto di una licenza dal punto di vista di chi ha bisogno di aiuto teamviewer piuttosto semplice appare una finestra con uno useri


Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.



# Model assessment over previously seen data
This is not a real test of the model: it only measures how well the model recognises documents it has already seen during training
(comparable to a training error rate).

In [36]:
number_correct = 0
n_examples = 200

# The examples come from the title+abstract corpus, so the matching abstract
# model is loaded explicitly rather than reusing whichever `model` an earlier
# cell left behind.
train_corpus = train_corpus_abstract
model = Doc2Vec.load(ABSTRACT_MODEL_NAME+'0'+'.model')

# Take n_examples random training documents and check, for each of them,
# whether the vector inferred from its words is closest to its own trained
# document vector.
for _ in range(n_examples):
    doc_id = random.randint(0, len(train_corpus) - 1)

    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    similar_docs = model.docvecs.most_similar([inferred_vector], topn=1)
    # similar_docs holds (doc_tag, similarity) pairs; with topn=1 only the best one
    if similar_docs and similar_docs[0][0] == doc_id:
        number_correct += 1

correct_rate = (100*number_correct/n_examples)
error_rate = 100 - correct_rate
print("Error rate over {0} random examples from training set: {1}%".format(n_examples, error_rate))
print("Correct prediction over {0} random examples from training set: {1}%".format(n_examples, correct_rate))



Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.



Error rate over 200 random examples from training set: 4.5%
Correct prediction over 200 random examples from training set: 95.5%
