In [30]:
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [31]:
# Doc2Vec imports
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim

import json # to open our data file
DATA_FILENAME = "trend_analisys.json"
# open json file
with open(DATA_FILENAME, "r") as json_file:
    json_data = json.load(json_file)
# we're expecting a list now, since our json file is a json array
assert type(json_data) is list

## Remove duplicates from JSON

In [32]:
counter = 0
for i, dictionary in enumerate(json_data):
    try:
        index = json_data.index(dictionary, i+1, len(json_data))
        #print("Found a duplicate with index {0} from index {1}".format(index, i))
        del(json_data[index])
        counter = counter + 1
    except ValueError:
        None
print("Number of duplicates: ", counter)
print("New length: ", len(json_data))

Number of duplicates:  85
New length:  208


## Version 1 - Training Model with Abstract field (whole text)

In [33]:
# we have our json data now, let's go ahead and divide into training and test set
n_examples =  len(json_data)
# how much of the data we're going to be using for training and for testing
# default values: 80% train, 20% test
TRAIN_DATA_LENGTH = 9 * n_examples // 10
TEST_DATA_LENGTH = n_examples - TRAIN_DATA_LENGTH
ABSTRACT_FIELD_NAME = 'abstract'
TITLE_FIELD_NAME = 'title'

print("Total examples: {0}, number of train examples: {1}, number of test examples: {2}".format(n_examples,TRAIN_DATA_LENGTH, TEST_DATA_LENGTH))

# TODO: Randomize selection of examples, don't just take the first ones
# build training corpus: take the needed abstract, preprocess them (tokenize, delete spaces..)
# and create the TaggedDocument needed for training
# also added title to it 
train_corpus = [gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(
    d[TITLE_FIELD_NAME]+d[ABSTRACT_FIELD_NAME]), [i]) for i, d in enumerate(json_data) if i<TRAIN_DATA_LENGTH]

test_corpus = [gensim.utils.simple_preprocess(
    d[TITLE_FIELD_NAME]+d[ABSTRACT_FIELD_NAME]) for i, d in enumerate(json_data) if i>TRAIN_DATA_LENGTH]
assert len(train_corpus)==TRAIN_DATA_LENGTH
#print(train_corpus[:1])

Total examples: 208, number of train examples: 187, number of test examples: 21


## Create and train model using Skip-Gram training

In [97]:
# create the doc2vec model
# TODO: tune this parameters (personally, I think we could use a bigger vec_size, like 50)
max_epochs = 40
vec_size = 50
alpha = 0.030
MODEL_NAME = "Models/d2v_TA_abstract&title.model"

model = Doc2Vec(vector_size=vec_size,
                min_count=2, # words that appear less than twice in the corpus are ignored
                dm=1) # Skip-Gram
                
# build our vocabulary of words (all the unique words encountered inside our corpus, needed for training)
model.build_vocab(train_corpus)

# train the model on the given data!
%time model.train(train_corpus, total_examples = model.corpus_count, epochs = model.epochs)

model.save(MODEL_NAME)
print("Model Saved")


CPU times: user 780 ms, sys: 8 ms, total: 788 ms
Wall time: 315 ms
Model Saved


## Visualizing Data
credits: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

In [98]:
# let's try to visualize all document_vectors
# get all vectors of documents we created from model training
docs_vecs = []
# docvecs (list of Doc2VecKeyedVectors) 
# – Vector representations of the documents in the corpus. Each vector has size == vector_size
for doc in iter(range(0, len(model.docvecs))):
    docs_vecs.append(model.docvecs[doc])

# loading dataset into Pandas DataFrame
df = pd.DataFrame.from_records(docs_vecs)
#df.head()

#df[['target']]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.119387,-0.346534,-0.273014,-1.118393,-0.581692,0.611214,0.127722,-0.202269,0.153298,-1.062154,...,-0.108225,-0.207567,0.727272,0.585349,0.222956,0.8754,0.151211,0.468302,-1.184027,-0.635066
1,0.157088,-0.485685,-0.351238,-1.503304,-0.765628,0.824385,0.154027,-0.27114,0.219725,-1.413674,...,-0.137261,-0.2716,0.996338,0.800607,0.303293,1.17223,0.214158,0.628052,-1.598117,-0.848087
2,0.065053,-0.219481,-0.163035,-0.659115,-0.348154,0.363078,0.066665,-0.11728,0.105474,-0.621904,...,-0.074848,-0.132332,0.43738,0.348238,0.134807,0.510593,0.081707,0.281237,-0.704783,-0.378905
3,0.053392,-0.160366,-0.137524,-0.514732,-0.276063,0.295708,0.057035,-0.082042,0.073855,-0.485232,...,-0.053983,-0.096164,0.35096,0.277355,0.106803,0.41725,0.065572,0.212934,-0.555022,-0.293616
4,0.067006,-0.206694,-0.15804,-0.636199,-0.326443,0.336898,0.075776,-0.118137,0.083223,-0.589575,...,-0.061092,-0.12488,0.408338,0.342582,0.135183,0.483082,0.088697,0.266961,-0.668985,-0.364236


In [99]:
# PCA is effected by scale so you need to scale the features in your data before applying PCA. 
features = [i for i in range(vec_size)]

x = df.loc[:, features].values # get features values
#print(x)
# we don't have target here y = df.loc[:,['target']].values # get target values (guess kind of flower/Iris)

# standardize data
x = StandardScaler().fit_transform(x) # scale data (especially in case different measures are used)
# pd.DataFrame(data = x, columns = features).head() # show first data


[[ 0.11938673 -0.34653431 -0.2730135  ...  0.46830156 -1.18402719
  -0.63506573]
 [ 0.15708816 -0.48568472 -0.35123813 ...  0.62805247 -1.59811747
  -0.84808689]
 [ 0.06505304 -0.21948071 -0.1630352  ...  0.28123662 -0.7047835
  -0.37890479]
 ...
 [ 0.06400771 -0.20057003 -0.15090545 ...  0.26194432 -0.66003615
  -0.34150797]
 [ 0.07232197 -0.17000496 -0.12539493 ...  0.22952157 -0.57459134
  -0.30948418]
 [ 0.10433161 -0.26795733 -0.21095505 ...  0.36031994 -0.9214952
  -0.48799506]]


## 2D Projection with PCA

In [100]:
# build PCA model in 2D
pca = PCA(n_components=2) # The new components are just the two main dimensions of variation.

principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
principalDf.head()
# these components drawn don't hold a lot of information 'per-se', they're just the result 
# of dimension-reduction

finalDf = principalDf 

## Visualizing Data interactively with Plotly

In [89]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')

In [101]:
# we'll draw a scatter graph with labels
traces = []
# let's get the labels
titles = [dictionary[TITLE_FIELD_NAME] for i, dictionary in enumerate(json_data) if i<TRAIN_DATA_LENGTH]
# double check to be sure we got labels just right
#i = 50
#print("TITLE: {0}\n <<{1}>>".format(titles[i], train_corpus[i][:1]))

# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned
for i in range(len(finalDf)):
    trace0 = go.Scatter(
        x = finalDf.loc[i:i, "principal component 1"],
        y = finalDf.loc[i:i, "principal component 2"],
        mode = 'markers',
        #name = 'blue markers',
        marker = dict(
            size = 7,
            color = 'rgba(0, 0, 110, .8)',
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

data = traces 
layout = dict(title = 'PCA Representantion of DocVectors',
        hovermode= 'closest',
        xaxis= dict(
            title= 'first component',
            ticklen= 5,
            gridwidth= 2,
        ),
        yaxis=dict(
            title= 'second component',
            ticklen= 5,
            gridwidth= 2,
        ),
        showlegend = False
    )
# Plot and embed in ipython notebook!
fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TA_model-scatter')

## Some (basic) testing

In [109]:
# let's check if the model is at least decent,
# which means: is it able to at least recognize news/documents
# it has seen in training?
import random

# Pick a random document from the train corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)

inferred_vector = model.infer_vector(train_corpus[doc_id].words)
similar_docs = model.docvecs.most_similar([inferred_vector], topn=3)

# show the 3 most similar document titles
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
for doc_tag, similarity in similar_docs:
    print("\nSimilar Doc-->(doctag:{0},score:{1}):<<{2}>>".format(doc_tag, similarity, titles[doc_tag]))

Test Document (181): «iphone milano tutti in fila per nuovi modelli tlc ansa itda questa mattina anche in italia possibile acquistare nuovi iphone xs xs max per occasione lo store apple di piazza liberty ha aperto le porte al pubblico già partire dalle primi clienti hanno così potuto vedere per la prima volta dal vivo nuovi modelli presentati dall azienda di cupertino anche se il sistema di pre ordine online ha evitato che si creassero lunghe code all esterno chi come vitaly giovane studente russo all università bocconi di milano che si posizionato di fronte all ingresso già ieri pomeriggio assicurandosi così di essere il primo tra clienti senza prenotazione ad accedere al negozio una lunga attesa mitigata dal caffè caldo servito tutti clienti che pazientemente attendono il loro turno»


Similar Doc-->(doctag:6,score:0.9995049238204956):<<Cinque cose da fare per gestire bene la sicurezza degli account social aziendali - Il Sole 24 ORE>>

Similar Doc-->(doctag:179,score:0.99944150447845


Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.



## Some more basic testing on unseen data

In [110]:
# take the first example in the test set,
# and see which document is the most similar

inferred_vector = model.infer_vector(test_corpus[5])
similar_docs = model.docvecs.most_similar([inferred_vector], topn=3)

# show the 3 most similar document titles
print('Test Document : «{}»\n'.format( ' '.join(test_corpus[5])))
for doc_tag, similarity in similar_docs:
    print("\nSimilar Doc-->(doctag:{0},score:{1}):<<{2}>>".format(doc_tag, similarity, titles[doc_tag]))

Test Document : «le auto guida autonoma migliorano il flusso del trafficol introduzione di autoveicoli guida autonoma nel traffico può contribuire migliorarne il flusso diminuire il consumo di carburante questo secondo una ricerca della rutgers university camden recentemente presentata washington una ricerca la national science foundation un agenzia governativa statunitense che si occupa della promozione del progresso scientifico dell avanzamento della salute prosperità welfare all interno del paese ha invitato gli scienziati discutere del proprio lavoro con leader dell industria automobilistica gli ufficiali governativi in occasione del washington auto show tenutosi gennaio un team di ricercatori esperti in teoria del flusso di traffico teoria del controllo robotica sistemi cyber fisici ingegneria dei trasporti hanno mostrato ai rappresentanti politici presenti all evento come le automobili guida autonoma possano aiutare prevenire gli ingorghi stradali farli scomparire una volta forma


Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.

