## Version 2: Model trained on text entities

In [1]:
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline


In [8]:
# Doc2Vec imports
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim

import json # to open our data file
DATA_FILENAME = "trend_analisys.json"
# open json file
with open(DATA_FILENAME, "r") as json_file:
    json_data = json.load(json_file)
# we're expecting a list now, since our json file is a json array
assert type(json_data) is list

## Sanitize data from duplicates

In [9]:
unique_json = json_data
counter = 0
for i, dictionary in enumerate(unique_json):
    try:
        index = json_data.index(dictionary, i+1, len(json_data))
        #print("Found a duplicate with index {0} from index {1}".format(index, i))
        del(unique_json[index])
        counter = counter + 1
    except ValueError:
        None
print("Number of duplicates: ", counter)
print("New length: ", len(unique_json))
#print(unique_json)
json_data = unique_json


Number of duplicates:  85
New length:  208


## Version 2-Training model with flattened_entities

In [11]:
# we'll use every data at our disposal for training (we don't have that many)
n_examples =  len(json_data)
TRAIN_DATA_LENGTH = n_examples
FLATTENED_ENTITIES_FIELD = 'flattened_entities'

# TODO: Randomize selection of examples, don't just take the first ones
# build training corpus: take the flattened_entities, preprocess them (tokenize, delete spaces..)
# and create the TaggedDocument needed for training
train_corpus = [gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(
    d[FLATTENED_ENTITIES_FIELD]), [i]) for i, d in enumerate(json_data)]

print(train_corpus[10])
print(len(json_data))

TaggedDocument(['antico_egitto', 'archeologia', 'capitale_città', 'edificio', 'egitto', 'guerra', 'il_cairo', 'impero_romano', 'legge', 'menfi_egitto', 'politica', 'rito', 'sarcofago', 'storia_antica', 'sud', 'terme_romane', 'turismo'], [10])
208


## Train 2 models at once - PV-DBOW and PV-DM

In [12]:
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"


# let's try training two models at once: Paragraph Vector - Distributed Memory (PV-DM), just like CBOW to W2V
# and Paragraph Vector - Distributed Bag of Words (PV-DBOW), analogous to W2V Skip-gram
epochs = 40
vec_size = 100
alpha = 0.05  # TODO: TRY HIGHER ALPHA
MODEL_NAME = "Models/d2v_TA_f_entities"

models = [
    # PV-DBOW plain (with default alpha)
    Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=2, sample=0, 
            epochs=epochs, workers=cores),
    # PV-DM w/ default averaging; w/ higher alpha
    Doc2Vec(dm=1, vector_size = vec_size, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs = epochs, workers=cores, alpha = alpha, comment='alpha=0.05'),
]

# build our vocabulary of words (all the unique words encountered inside our corpus, needed for training)
for model in models:
    print(model)
    model.build_vocab(train_corpus)
print("Vocabulary created!")

# train the models on the given data!
counter = 0
for model in models:
    print("Training %s" % model)
    %time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)
    model.save(MODEL_NAME+str(counter)+'.model')
    counter = counter + 1
print("Models Saved")


Doc2Vec(dbow,d100,n5,mc2,t4)
Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t4)
Vocabulary created!
Training Doc2Vec(dbow,d100,n5,mc2,t4)
CPU times: user 260 ms, sys: 0 ns, total: 260 ms
Wall time: 354 ms
Training Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t4)
CPU times: user 412 ms, sys: 4 ms, total: 416 ms
Wall time: 485 ms
Models Saved


## Data Visualization (using PCA and Plotly libs)
all credits in the other file d2v_abstract+title

In [18]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"

# load model to visualize
model_number = 1
model = Doc2Vec.load(MODEL_NAME+str(model_number)+'.model')

docs_vecs = []
# docvecs (list of Doc2VecKeyedVectors) 
# – Vector representations of the documents in the corpus. Each vector has size == vector_size
for doc in iter(range(0, len(model.docvecs))):
    docs_vecs.append(model.docvecs[doc])
    
# loading dataset into Pandas DataFrame
df = pd.DataFrame.from_records(docs_vecs)    

    ## PCA dimensionality-reduction ##
# PCA is effected by scale so you need to scale the features in your data before applying PCA. 
features = [i for i in range(vec_size)]
x = df.loc[:, features].values # get features values
#print(x)
# standardize data
x = StandardScaler().fit_transform(x) # scale data (especially in case different measures are used)
    
# build PCA model in 2D
pca = PCA(n_components=2) # The new components are just the two main dimensions of variation.
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, 
                           columns = [COMPONENT_ONE, COMPONENT_TWO])
    
# we'll draw a scatter graph with labels
traces = []
# let's get the labels
titles = [dictionary['title'] for dictionary in json_data]
#print(print(titles))
finalDf = principalDf

# double check to be sure we got labels just right
#sample_title = training_set[10]['title']
#sample_text = training_set[10]['flattened_entities']
#inferred_vector = model.infer_vector(gensim.utils.simple_preprocess(sample_text))
#print(gensim.utils.simple_preprocess(sample_text))
# pca sample
#x = np.array(inferred_vector)
#pca = PCA(n_components=2)
#pca_result = pca.fit_transform(x)
#trace_sample = go.Scatter(
#        x = pca_result[0],
#        y = pca_result[1],
#        mode = 'markers',
#            #name = 'blue markers',
#        marker = dict(
#            size = 7,
#            color = 'green',
#        ),
#        text = str(sample_title)
#    )
#traces.append(trace_sample)


    # each trace will represent a point (squeezed vector from higher dimensions),
    # and each point will have the title of the news assigned
for i in range(len(finalDf)):
    color = 'rgba(0, 0, 110, .8)'
    if 'Apple' in titles[i]:
            color = 'red'
    elif 'Amazon' in titles[i]:
        color = 'yellow'
    elif 'Facebook' in titles[i] or 'Instagram' in titles[i]:
        color = 'green'
    elif 'spazio' in json_data[i][FLATTENED_ENTITIES_FIELD]:
        color = 'black'
    
    trace0 = go.Scatter(
        x = finalDf.loc[i:i, "principal component 1"],
        y = finalDf.loc[i:i, "principal component 2"],
        mode = 'markers',
            #name = 'blue markers',
        marker = dict(
            size = 7,
            color = color,
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

data = traces 
layout = dict(title = 'PCA Representantion of D2V on Flattened Entities',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!
    
fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TA_model_flattened_entities-2')

## K-Means on PCA-reduced data

In [7]:
# let's try another way for clustering data: K-Mean, an even more popular algorithm,
# which I know from the introductory course on AI, so it might be better 
# to utilize algorithms which I know and can talk about in the presentation
from sklearn.cluster import KMeans

# build k-means model
kmeans = KMeans(n_clusters = 5, max_iter=600, algorithm = 'auto', verbose=0,
               init='k-means++', n_init=10) 

kmeans.fit(principalComponents) # data, as vectors of documents (in 2D)



KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=600,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

## Visualize centroids 

In [8]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"
centroids = kmeans.cluster_centers_

# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned
for i in range(len(finalDf)):
    # assign a color to each point belonging to a specific cluster
    # computing distance from centroid
    x = finalDf.loc[i:i, "principal component 1"]
    y = finalDf.loc[i:i, "principal component 2"]
    color = 'rgba(0, 0, 180, 0.8)'
    min_d = 10000
    closest_centroid = []
    for centroid in centroids:
        dist = np.linalg.norm(centroid-np.array(x, y))
        if dist<min_d:
            min_d = dist
            closest_centroid = centroid
    #print("Prediction: ",closest_centroid)
    if np.array_equal(closest_centroid, centroids[0]):
        color = 'blue'
    elif np.array_equal(closest_centroid, centroids[1]):
        color = 'pink'
    elif np.array_equal(closest_centroid, centroids[2]):
        color = 'yellow'
    elif np.array_equal(closest_centroid, centroids[3]):
        color = 'green'
    else:
        color = 'black'
        
    trace0 = go.Scatter(
        x = x, 
        y = y,
        mode = 'markers',
            #name = 'blue markers',
        marker = dict(
            size = 7,
            color = color,
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

# draw centroids
c_trace = go.Scatter(
    x = centroids[:, 0],
    y = centroids[:, 1],
    mode = 'markers',
    marker = dict(
        size = 9,
        color = 'red',
    ),
    text = 'centroid'
)
traces.append(c_trace)

data = traces 
layout = dict(title = 'PCA Representantion of DocVectors',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!
    
fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TA_model_entities_kmeans')