## Version 2: Model trained on text entities

In [1]:
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline


In [2]:
# Doc2Vec imports
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim

import json # to open our data file
DATA_FILENAME = "trend_analisys.json"
# Load the whole corpus into memory.
# The encoding is stated explicitly: JSON text is UTF-8, whereas open() would
# otherwise use the platform's locale encoding and could mis-decode (or fail
# on) non-ASCII characters in the documents.
with open(DATA_FILENAME, "r", encoding="utf-8") as json_file:
    json_data = json.load(json_file)
# The following cells iterate over records, so the top-level JSON value has
# to be an array (decoded by json.load as a Python list).
assert isinstance(json_data, list), "expected a top-level JSON array in " + DATA_FILENAME

## Sanitize data from duplicates

In [3]:
# Remove duplicated records, keeping the first occurrence of each one.
#
# A new list is built instead of deleting from `json_data` in place. The
# previous loop bound `unique_json` to the very same list object as
# `json_data` and deleted from it while enumerating it, which
#   * removed at most one later copy per visited record, so a record that was
#     present three times was still present twice afterwards, and
#   * modified `json_data` itself, so re-running the cell gave different
#     counts each time.

# Diagnostic only: unique titles minus total records (printed as a negative
# number, i.e. how many records share their title with another record).
unique_titles = {obj['title'] for obj in json_data}
print(len(unique_titles)-len(json_data))

# Records are JSON objects (dicts), which are not hashable, so membership is
# tested by equality against the records kept so far. This is quadratic in the
# number of records, like the loop it replaces.
unique_json = []
for record in json_data:
    if record not in unique_json:
        unique_json.append(record)

# Number of records dropped because an identical record was already kept.
counter = len(json_data) - len(unique_json)
print("Number of duplicates: ", counter)
print("New length: ", len(unique_json))


-92
Number of duplicates:  85
New length:  208


## K-fold Cross Validation Training

In [5]:
# The corpus is small, so instead of a single train/test split the models are
# trained with k-fold cross validation, using 5 folds
# (a few other values of K are worth trying to see how the models behave).
from sklearn.model_selection import KFold
import os
import time

KFOLD_VALUE = 5
# shuffle / random_state are passed by keyword: scikit-learn declares them
# keyword-only, so the positional form KFold(5, True, 1) is rejected by
# current releases. The fixed seed keeps the folds identical between runs.
kfold = KFold(n_splits=KFOLD_VALUE, shuffle=True, random_state=1)
ENTITIES_FIELD_NAME = "flattened_entities"
model_number = 0
vec_size = 50
MODEL_BASENAME = "Models"+os.sep+"d2v_TA_entities_model"
# Make sure the output directory exists before the first model is saved into it.
os.makedirs(os.path.dirname(MODEL_BASENAME), exist_ok=True)
# An array (rather than a list) so that it can be indexed with the index
# arrays produced by kfold.split().
unique_json = np.array(unique_json)

# One (trained model, training documents) pair per fold.
models = []

for train, test in kfold.split(unique_json):
    # Build the training corpus: every document is tokenised and tagged with
    # its position inside this fold's training subset.
    # NOTE(review): an earlier comment here said simple_preprocess also removes
    # English stop-words. As far as the gensim docs describe it, it only
    # lowercases, tokenises and drops tokens outside a length range - confirm
    # before relying on stop-word removal.
    train_corpus = [
        gensim.models.doc2vec.TaggedDocument(
            gensim.utils.simple_preprocess(d[ENTITIES_FIELD_NAME]), [i])
        for i, d in enumerate(unique_json[train])
    ]
    # Pre-process the held-out documents the same way (tokens only, no tags).
    # They are not evaluated inside this loop yet (see the "Test model" note below).
    test_corpus = [gensim.utils.simple_preprocess(d[ENTITIES_FIELD_NAME]) for d in unique_json[test]]

    ## create model ##
    # Parameters experimented with so far: vector_size, epochs, default
    # learning rate, and training in a single call instead of an epoch loop.
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vec_size,
                                          min_count=2,
                                          epochs=40,
                                          dm = 0) # distributed bag of words (PV-DBOW) is employed
    model.build_vocab(train_corpus)

    ## Train model ##
    print("Model %s training time: " %(model_number))

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement meant for measuring elapsed intervals.
    time_start = time.perf_counter()
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    time_elapsed = (time.perf_counter() - time_start)
    print(str(time_elapsed))

    model.save(MODEL_BASENAME + str(model_number) + '.model')
    print("Model %s Saved" %(model_number))
    model_number = model_number + 1

    ## Test model in order to choose best one ##
    # TODO: not implemented yet - test_corpus is prepared above for this step.

    # Keep the model together with the documents it was trained on, in
    # training order, so document vectors can be matched back to documents.
    models.append((model, unique_json[train]))
    

Model 0 training time: 
0.178477
Model 0 Saved
Model 1 training time: 
0.19014599999999948
Model 1 Saved
Model 2 training time: 
0.1763439999999994
Model 2 Saved
Model 3 training time: 
0.18684499999999993
Model 3 Saved
Model 4 training time: 
0.2258849999999999
Model 4 Saved


In [None]:
## Test given model, over given test set
# TODO: put some meaningful checks here (data type, raise errors and such..)
def test_model (model, test_set):
    # we're expecting flattened_entities here,
    # which are contained in the form of a string separated by spaces by default
    for document in test_set:
        # infer_vector() does not take a string, but rather a list of string tokens
        # The str.split() method without an argument splits on whitespace
        inferred_vector = model.infer_vector(document.split())
        

## Data Visualization

In [6]:
import os

# NOTE(review): plotly.plotly was moved to the separate chart_studio package
# in plotly 4 - pin plotly<4 or migrate the imports when upgrading.
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

# Credentials come from the environment instead of being written in the
# notebook. The API key that used to be hard-coded here has been committed
# and should be considered leaked: revoke it and generate a new one.
tls.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                         api_key=os.environ['PLOTLY_API_KEY'])

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"


def _marker_color(title, entities):
    """Return the marker colour for a document, chosen by keyword.

    The title is checked first (Apple, Amazon, Facebook/Instagram, in that
    order), then the entity text for 'spazio'; anything else gets the default
    dark blue.
    """
    if 'Apple' in title:
        return 'rgba(120, 0, 0, .9)'
    if 'Amazon' in title:
        return 'yellow'
    if 'Facebook' in title or 'Instagram' in title:
        return 'green'
    if 'spazio' in entities:
        return 'black'
    return 'rgba(0, 0, 110, .8)'


# Model (and the documents it was trained on) of the third fold.
model, training_set = models[2]
# model.docvecs holds the vector learned for each training document; the
# documents were tagged with their integer position, so they are read back by
# position. Each vector has size == vector_size.
docs_vecs = [model.docvecs[doc] for doc in range(len(model.docvecs))]
# one row per document, one column per vector component
df = pd.DataFrame.from_records(docs_vecs)

## PCA dimensionality-reduction ##
# PCA is affected by scale, so the features are standardised before applying it.
features = list(range(vec_size))
x = df.loc[:, features].values # get features values
x = StandardScaler().fit_transform(x)

# build PCA model in 2D: the new components are the two main directions of variation
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents,
                           columns = [COMPONENT_ONE, COMPONENT_TWO])

# Labels for the scatter plot: the title of each training document, in the
# same order as the document vectors.
titles = [dictionary['title'] for dictionary in training_set]
finalDf = principalDf
assert len(titles) == len(finalDf), "document vectors and training documents are out of step"

# Every point is a document vector squeezed down to 2-D, labelled with the
# title of the news item and coloured by keyword. All points go into a single
# trace with per-point colour and hover text.
colors = [_marker_color(dictionary['title'], dictionary[ENTITIES_FIELD_NAME])
          for dictionary in training_set]
points_trace = go.Scatter(
    x = finalDf[COMPONENT_ONE],
    y = finalDf[COMPONENT_TWO],
    mode = 'markers',
    marker = dict(
        size = 7,
        color = colors,
    ),
    text = [str(title) for title in titles]
)
traces = [points_trace]

data = traces
layout = dict(title = 'PCA Representantion of DocVectors',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!

fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TA_model_entities-scatter')

## K-Means on PCA-reduced data

In [12]:
# A second way of clustering the data: K-Means, a very popular algorithm and
# one covered in the introductory AI course, so it is easy to explain in the
# presentation.
from sklearn.cluster import KMeans

# build k-means model
# * `algorithm` is left at its default: the value 'auto' that used to be
#   passed here was the default anyway and is no longer accepted by recent
#   scikit-learn releases.
# * `random_state` fixes the centroid initialisation, so re-running the
#   notebook yields the same clusters (and the same colours in the plot).
kmeans = KMeans(n_clusters = 5, max_iter=600, verbose=0,
               init='k-means++', n_init=10, random_state=1)

kmeans.fit(principalComponents) # data, as vectors of documents (in 2D)



KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=600,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

## Visualize centroids 

In [30]:
import os

import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

# Credentials come from the environment instead of being written in the
# notebook. The API key that used to be hard-coded here has been committed
# and should be considered leaked: revoke it and generate a new one.
tls.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                         api_key=os.environ['PLOTLY_API_KEY'])

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"
centroids = kmeans.cluster_centers_

# Colour per cluster index; any cluster beyond the listed ones is drawn black.
cluster_colors = ['blue', 'red', 'yellow', 'green']
fallback_color = 'black'

# Assign every plotted point to its closest centroid.
# The previous hand-written search computed np.array(x, y), which passes `y`
# as the *dtype* argument rather than building the point (x, y); the distance
# therefore ignored the second coordinate and points could be coloured with
# the wrong cluster. kmeans.predict() does the nearest-centroid lookup on the
# real 2-D coordinates.
points = finalDf[[COMPONENT_ONE, COMPONENT_TWO]].values
cluster_ids = kmeans.predict(points)
colors = [cluster_colors[c] if c < len(cluster_colors) else fallback_color
          for c in cluster_ids]

# A fresh list of traces is started here; appending to the list left over from
# the keyword-coloured scatter plot drew those points again underneath.
# Every point is a document vector squeezed down to 2-D, labelled with the
# title of the news item and coloured by the cluster it belongs to.
points_trace = go.Scatter(
    x = finalDf[COMPONENT_ONE],
    y = finalDf[COMPONENT_TWO],
    mode = 'markers',
    marker = dict(
        size = 7,
        color = colors,
    ),
    text = [str(title) for title in titles]
)

# draw centroids
c_trace = go.Scatter(
    x = centroids[:, 0],
    y = centroids[:, 1],
    mode = 'markers',
    marker = dict(
        size = 9,
        color = 'red',
    ),
    text = 'centroid'
)
traces = [points_trace, c_trace]

data = traces
layout = dict(title = 'PCA Representantion of DocVectors',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!

fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TA_model_entities_kmeans')