In [32]:
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [33]:
# Doc2Vec imports
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim

import json # to open our data file
DATA_FILENAME = "trend_analisys.json"
# open json file
with open(DATA_FILENAME, "r") as json_file:
    json_data = json.load(json_file)
# we're expecting a list now, since our json file is a json array
assert type(json_data) is list

## Remove duplicates from JSON

In [34]:
counter = 0
for i, dictionary in enumerate(json_data):
    try:
        index = json_data.index(dictionary, i+1, len(json_data))
        #print("Found a duplicate with index {0} from index {1}".format(index, i))
        del(json_data[index])
        counter = counter + 1
    except ValueError:
        None
print("Number of duplicates: ", counter)
print("New length: ", len(json_data))

Number of duplicates:  85
New length:  208


## Version 1 - Training Model with Abstract field (whole text)

In [35]:
# we have our json data now, let's go ahead and divide into training and test set
n_examples =  len(json_data)
# how much of the data we're going to be using for training and for testing
# default values: 80% train, 20% test
#TRAIN_DATA_LENGTH = 9 * n_examples // 10
## UPDATE: let's train on the whole dataset
TRAIN_DATA_LENGTH = n_examples
TEST_DATA_LENGTH = n_examples - TRAIN_DATA_LENGTH
ABSTRACT_FIELD_NAME = 'abstract'
TITLE_FIELD_NAME = 'title'

print("Total examples: {0}, number of train examples: {1}, number of test examples: {2}".format(n_examples,TRAIN_DATA_LENGTH, TEST_DATA_LENGTH))

# TODO: Randomize selection of examples, don't just take the first ones
# build training corpus: take the needed abstract, preprocess them (tokenize, delete spaces..)
# and create the TaggedDocument needed for training
# also added title to it 
train_corpus = [gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(
    d[TITLE_FIELD_NAME]+d[ABSTRACT_FIELD_NAME]), [i]) for i, d in enumerate(json_data) if i<TRAIN_DATA_LENGTH]

#test_corpus = [gensim.utils.simple_preprocess(
#    d[TITLE_FIELD_NAME]+d[ABSTRACT_FIELD_NAME]) for i, d in enumerate(json_data) if i>TRAIN_DATA_LENGTH]
assert len(train_corpus)==TRAIN_DATA_LENGTH
#print(train_corpus[:1])

## let's also shuffle train set
from random import shuffle
#train_corpus = shuffle(train_corpus[:])

Total examples: 208, number of train examples: 208, number of test examples: 0


## Create and train model using Skip-Gram training
credits for model improvements: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [36]:
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"


# let's try training two models at once: Paragraph Vector - Distributed Memory (PV-DM), just like CBOW to W2V
# and Paragraph Vector - Distributed Bag of Words (PV-DBOW), analogous to W2V Skip-gram
epochs = 40
vec_size = 100
alpha = 0.10  # default= 0.030
MODEL_NAME = "Models/d2v_TA_abstract&title"

models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=2, sample=0, 
            epochs=epochs, workers=cores),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, vector_size= vec_size, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs= epochs, workers=cores, alpha= alpha, comment='alpha=0.1'),
]

# build our vocabulary of words (all the unique words encountered inside our corpus, needed for training)
for model in models:
    print(model)
    model.build_vocab(train_corpus)
print("Vocabulary created!")

# train the models on the given data!
counter = 0
for model in models:
    print("Training %s" % model)
    %time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)
    model.save(MODEL_NAME+str(counter)+'.model')
    counter = counter + 1
print("Models Saved")


Doc2Vec(dbow,d100,n5,mc2,t4)
Doc2Vec("alpha=0.1",dm/m,d100,n5,w10,mc2,t4)
Vocabulary created!
Training Doc2Vec(dbow,d100,n5,mc2,t4)
CPU times: user 7.31 s, sys: 80 ms, total: 7.39 s
Wall time: 2.56 s
Training Doc2Vec("alpha=0.1",dm/m,d100,n5,w10,mc2,t4)
CPU times: user 12 s, sys: 104 ms, total: 12.1 s
Wall time: 4.37 s
Models Saved


## Visualizing Data
credits: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

In [61]:
# let's try to visualize all document_vectors
# get all vectors of documents we created from model training
docs_vecs = []
model = models[0]
# docvecs (list of Doc2VecKeyedVectors) 
# – Vector representations of the documents in the corpus. Each vector has size == vector_size
for doc in iter(range(0, len(model.docvecs))):
    docs_vecs.append(model.docvecs[doc])

# loading dataset into Pandas DataFrame
df = pd.DataFrame.from_records(docs_vecs)
#df.head()

#df[['target']]

In [62]:
# PCA is effected by scale so you need to scale the features in your data before applying PCA. 
features = [i for i in range(vec_size)]

x = df.loc[:, features].values # get features values
#print(x)
# we don't have target here y = df.loc[:,['target']].values # get target values (guess kind of flower/Iris)

# standardize data
x = StandardScaler().fit_transform(x) # scale data (especially in case different measures are used)
# pd.DataFrame(data = x, columns = features).head() # show first data


## 2D Projection with PCA

In [63]:
# build PCA model in 2D
pca = PCA(n_components=2) # The new components are just the two main dimensions of variation.

principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
principalDf.head()
# these components drawn don't hold a lot of information 'per-se', they're just the result 
# of dimension-reduction

finalDf = principalDf 

## Visualizing Data interactively with Plotly

In [64]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')

In [65]:
# we'll draw a scatter graph with labels
traces = []
# let's get the labels
titles = [dictionary[TITLE_FIELD_NAME] for i, dictionary in enumerate(json_data) if i<TRAIN_DATA_LENGTH]
# double check to be sure we got labels just right
#i = 50
#print("TITLE: {0}\n <<{1}>>".format(titles[i], train_corpus[i][:1]))

# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned
for i in range(len(finalDf)):
    trace0 = go.Scatter(
        x = finalDf.loc[i:i, "principal component 1"],
        y = finalDf.loc[i:i, "principal component 2"],
        mode = 'markers',
        #name = 'blue markers',
        marker = dict(
            size = 7,
            color = 'rgba(0, 0, 110, .8)',
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

data = traces 
layout = dict(title = 'PCA Representantion of DocVectors',
        hovermode= 'closest',
        xaxis= dict(
            title= 'first component',
            ticklen= 5,
            gridwidth= 2,
        ),
        yaxis=dict(
            title= 'second component',
            ticklen= 5,
            gridwidth= 2,
        ),
        showlegend = False
    )
# Plot and embed in ipython notebook!
fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TA_model_title&abstract_NOKMEANS')

## K-Means clustering on PCA reduced data

In [66]:
# let's try another way for clustering data: K-Mean, an even more popular algorithm,
# which I know from the introductory course on AI, so it might be better 
# to utilize algorithms which I know and can talk about in the presentation
from sklearn.cluster import KMeans

# build k-means model
kmeans = KMeans(n_clusters = 5, max_iter=600, algorithm = 'auto', verbose=0,
               init='k-means++', n_init=15) 

kmeans.fit(principalComponents) # data, as vectors of documents (in 2D)



KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=600,
    n_clusters=5, n_init=15, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

## Visualizing K-Means results

In [67]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go
from scipy.spatial import distance

tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"
centroids = kmeans.cluster_centers_

# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned
for i in range(len(finalDf)):
    # assign a color to each point belonging to a specific cluster
    # computing distance from centroid
    x = finalDf.loc[i:i, "principal component 1"]
    y = finalDf.loc[i:i, "principal component 2"]
    color = 'rgba(0, 0, 180, 0.8)'
    min_d = 10000
    closest_centroid = np.array([])
    
    for centroid in centroids:
        #dist = np.linalg.norm(centroid-np.array(x, y))
        dist = distance.euclidean(np.array(x,y), centroid)
        #print("Distance from centroid: ",dist)
        if dist<min_d:
            min_d = dist
            closest_centroid = centroid
    #print("Prediction: ",closest_centroid)
    if np.array_equal(closest_centroid, centroids[0]):
        color = 'blue'
    elif np.array_equal(closest_centroid, centroids[1]):
        color = 'pink'
    elif np.array_equal(closest_centroid, centroids[2]):
        color = 'yellow'
    elif np.array_equal(closest_centroid, centroids[3]):
        color = 'green'
    else:
        color = 'black'
        
    trace0 = go.Scatter(
        x = x, 
        y = y,
        mode = 'markers',
            #name = 'blue markers',
        marker = dict(
            size = 7,
            color = color,
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

# draw centroids
c_trace = go.Scatter(
    x = centroids[:, 0],
    y = centroids[:, 1],
    mode = 'markers',
    marker = dict(
        size = 9,
        color = 'red',
    ),
    text = 'centroid'
)
traces.append(c_trace)

data = traces 
layout = dict(title = 'PCA Representantion of DocVectors',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!
    
fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TA_model_title&abstract_kmeans_model2')

In [58]:
print(centroids)

[[-1.17454237 -0.70935933]
 [-0.56029901 -3.30869754]
 [ 3.49757356 -0.83890783]
 [ 1.59265357  2.13376192]
 [-2.57751057  1.48418982]]


## Some (basic) testing

In [59]:
# let's check if the model is at least decent,
# which means: is it able to at least recognize news/documents
# it has seen in training?
import random

# Pick a random document from the train corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)

inferred_vector = model.infer_vector(train_corpus[doc_id].words)
similar_docs = model.docvecs.most_similar([inferred_vector], topn=3)

# show the 3 most similar document titles
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
for doc_tag, similarity in similar_docs:
    print("\nSimilar Doc-->(doctag:{0},score:{1}):<<{2}>>".format(doc_tag, similarity, titles[doc_tag]))

Test Document (3): «tempesta su titano le immagini di cassini repubblica itsembra il deserto del sahara nel bel mezzo di una tempesta di sabbia invece siamo su titano nubi di polvere spazzano equatore della più grande luna di saturno nelle immagini catturate dalla sonda cassini che più di un anno dalla sua uscita di scena continua regalare indimenticabili cartoline dallo spazio secondo lo studio pubblicato nature geoscience firmato dal team dell università paris diderot coordinato da sebastien rodriguez la somiglianza con la terra non solo in apparenza visto che dai dati raccolti titano risulta geologicamente attivo le sue dune sono simili alle nostre quelle di marte oltre essere una luna molto attiva spiega rodriguez titano risulta simile marte anche per il ciclo della polvere cumuli spostati dal vento su grandi distanze fanno sì che si formino dune che restituiscono un panorama già visto sulle terre più aride comprese quelle del pianeta rosso grazie alle immagini infrarossi scattate 


Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.



## Some more basic testing on unseen data

In [60]:
# take the first example in the test set,
# and see which document is the most similar

inferred_vector = model.infer_vector(test_corpus[5])
similar_docs = model.docvecs.most_similar([inferred_vector], topn=3)

# show the 3 most similar document titles
print('Test Document : «{}»\n'.format( ' '.join(test_corpus[5])))
for doc_tag, similarity in similar_docs:
    print("\nSimilar Doc-->(doctag:{0},score:{1}):<<{2}>>".format(doc_tag, similarity, titles[doc_tag]))

NameError: name 'test_corpus' is not defined