## Version 2: Model trained on text entities

In [1]:
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline


In [5]:
# Doc2Vec imports
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
import json


DATA_FILENAME = "dump_solr.json"
DATA_FILENAME2 = "trend_analisys.json"

# Both dumps contain Italian text with accented characters. The encoding is
# pinned to UTF-8 so that decoding does not depend on the platform's locale
# default (which is not UTF-8 on every system).
# open json file
with open(DATA_FILENAME, "r", encoding="utf-8") as json_file:
    json_data = json.load(json_file)

# open 'old' json file
with open(DATA_FILENAME2, "r", encoding="utf-8") as json_file:
    json_data_old = json.load(json_file)

# we're expecting a dictionary now, since our json file is a json object
assert type(json_data) is dict

# we're expecting a list this time, for the way it is formatted 
assert type(json_data_old) is list

In [6]:
## The documents of the new dump live under response{} ---> docs[]
new_dump_docs = json_data['response']['docs']
print("Number of documents in new json: ",len(new_dump_docs))
print("Number of documents in old json: ",len(json_data_old))

# Both dumps are used together: concatenate them into a single list
docs = new_dump_docs + json_data_old

print("New length: ", len(docs))
print(docs[:1])

# Some fields are stored as a one-item list; replace such a list with its first
# element so that the field holds the value itself. The dictionaries are
# updated in place.
FIELDS_TO_FLATTEN = ['title', 'abstract', 'flattened_entities']
for document in docs:
    for field in FIELDS_TO_FLATTEN:
        value = document[field]
        if isinstance(value, list):
            document[field] = value[0]

Number of documents in new json:  1377
Number of documents in old json:  293
New length:  1670
[{'fonte_dati': ['trend_analisys'], 'id': 'https://www.ilpost.it/internet/page/3/', 'ta_id': [1, 2, 3], 'title': ['Internet - Pagina 3 di 94 - Il Post'], 'abstract': ["  Soprattutto in Cina, alcuni impiegati dell'azienda avrebbero preso soldi dai venditori per fornire dati sugli utenti, eliminare recensioni negative e avvantaggiarli nei risultati    Un balletto da fare per strada, vicino a una macchina e su una canzone di Drake è diventato virale nelle ultime settimane, costringendo la polizia a prendere precauzioni in diversi paesi    Per errore un registro interno all'azienda salvava le password senza nasonderle: non sono state trovate prove di violazioni, ma Twitter consiglia ugualmente di intervenire  "], 'url': ['https://www.ilpost.it/internet/page/3/'], 'website': ['ilpost.it'], 'timestamp': [1528360812000], 'publication_date': ['2018-06-07T08:40:12Z'], 'flattened_entities': ['azienda c

## Extract test corpus from the whole data-set

In [7]:
import random

# Shuffle first so that the held-out documents are a random subset.
# NOTE: no seed is set, so the split differs on every run; re-running this cell
# also removes a further `test_corpus_length` documents from `docs`.
random.shuffle(docs)
test_corpus_length = 400
# take first examples
test_corpus = docs[:test_corpus_length]
docs = docs[test_corpus_length:]

# This name must match the one opened by the cell that loads the test corpus
# for DBSCAN; the previous spelling ("entitites") did not.
filename = 'TOWL_f_entities_test_corpus.json'
# save test file to json
with open(filename, 'w') as file:
    json.dump(test_corpus,file)
print("New data length: {0}, test set data length: {1}".format(len(docs), len(test_corpus)))

New data length: 1270, test set data length: 400


## Don't remove duplicates in this model

In [9]:
# Disabled duplicate removal. The whole cell is wrapped in a triple-quoted
# string, so it is evaluated as a plain string expression and none of the
# statements inside it are executed.
# NOTE(review): if this is ever re-enabled, `unique_json = json_data` binds the
# same list object (no copy is made), so `del(unique_json[index])` removes items
# from the very list that `enumerate` is iterating over.
"""unique_json = json_data
counter = 0
for i, dictionary in enumerate(unique_json):
    try:
        index = json_data.index(dictionary, i+1, len(json_data))
        #print("Found a duplicate with index {0} from index {1}".format(index, i))
        del(unique_json[index])
        counter = counter + 1
    except ValueError:
        None
print("Number of duplicates: ", counter)
print("New length: ", len(unique_json))
#print(unique_json)
json_data = unique_json"""


Number of duplicates:  85
New length:  208


## Instead, reinforce the entities' 'short document type' by duplicating all data

In [8]:
import random

# Repeat every training document four times, then mix the copies so that
# identical documents are not adjacent in the list.
docs = docs * 4
random.shuffle(docs)
print("New length: ", len(docs))

New length:  5080


## Version 2 - Training the model with flattened_entities

In [9]:
# we'll use every data at our disposal for training
# The training documents are in `docs`; `json_data` is the top-level JSON object
# of the new dump, so its length is the number of its keys, not of documents.
n_examples = len(docs)
TRAIN_DATA_LENGTH = n_examples
FLATTENED_ENTITIES_FIELD = 'flattened_entities'

# build training corpus: take the flattened_entities, preprocess them (tokenize, delete spaces..)
# and create the TaggedDocument needed for training.
# Each document is tagged with its position in `docs`, so the vector learned
# for tag i belongs to docs[i].
train_corpus = [
    gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(d[FLATTENED_ENTITIES_FIELD]), [i])
    for i, d in enumerate(docs)
]

print(train_corpus[10])


TaggedDocument(['adecco', 'automobile', 'blockchain', 'energia', 'fastweb', 'italia', 'microsoft', 'milano', 'nespresso', 'ossimoro', 'sky_italia', 'sole', 'tecnologia', 'wired'], [10])


## Train 2 models at once - PV-DBOW and PV-DM

In [10]:
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"


# let's try training two models at once: Paragraph Vector - Distributed Memory (PV-DM), just like CBOW to W2V
# and Paragraph Vector - Distributed Bag of Words (PV-DBOW), analogous to W2V Skip-gram
epochs = 50
vec_size = 100
alpha = 0.2 # TODO: TRY HIGHER ALPHA
MODEL_NAME = "Models/d2v_TA_f_entities"

models = [
    # min_count = 1; don't discard any word at all, we're using entites, meaningful words by default
    # PV-DBOW plain (with default alpha)
    Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=1, sample=0, 
            epochs=epochs, workers=cores),
    # PV-DM w/ higher alpha
    Doc2Vec(dm=1, vector_size = vec_size, window=10, negative=5, hs=0, min_count=1, sample=0, 
            epochs = epochs, workers=cores, alpha = alpha, comment='alpha=0.2'),
]

# build our vocabulary of words (all the unique words encountered inside our corpus, needed for training)
for model in models:
    print(model)
    model.build_vocab(train_corpus)
print("Vocabulary created!")

# train the models on the given data!
counter = 0
for model in models:
    print("Training %s" % model)
    %time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)
    model.save(MODEL_NAME+str(counter)+'.model')
    counter = counter + 1
print("Models Saved")


Doc2Vec(dbow,d100,n5,t4)
Doc2Vec("alpha=0.2",dm/m,d100,n5,w10,t4)
Vocabulary created!
Training Doc2Vec(dbow,d100,n5,t4)
CPU times: user 17.8 s, sys: 1.09 s, total: 18.8 s
Wall time: 10.6 s
Training Doc2Vec("alpha=0.2",dm/m,d100,n5,w10,t4)
CPU times: user 23.9 s, sys: 2.62 s, total: 26.5 s
Wall time: 14.9 s
Models Saved


## Data Visualization (using PCA and Plotly libs)
All credits are listed in the companion notebook `d2v_abstract+title`.

In [18]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

import os

# Plotly credentials are read from the environment so that no secret is stored
# in the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before running this
# cell; a missing variable raises KeyError.
# NOTE(review): the API key that used to be hard-coded here has been exposed
# and should be revoked.
tls.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                         api_key=os.environ['PLOTLY_API_KEY'])

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"

# load model to visualize
model_number = 1
model = Doc2Vec.load(MODEL_NAME+str(model_number)+'.model')

# docvecs (list of Doc2VecKeyedVectors) 
# – Vector representations of the documents in the corpus. Each vector has size == vector_size
docs_vecs = [model.docvecs[doc] for doc in range(len(model.docvecs))]

# loading dataset into Pandas DataFrame
df = pd.DataFrame.from_records(docs_vecs)

## PCA dimensionality-reduction ##
# PCA is affected by scale, so the features are standardized before applying it.
features = list(range(vec_size))
x = df.loc[:, features].values # get features values
#print(x)
# standardize data
x = StandardScaler().fit_transform(x) # scale data (especially in case different measures are used)

# build PCA model in 2D
pca = PCA(n_components=2) # The new components are just the two main dimensions of variation.
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, 
                           columns = [COMPONENT_ONE, COMPONENT_TWO])

# we'll draw a scatter graph with labels
traces = []
# let's get the labels
# The training corpus tagged each document with its index in `docs`, so the
# label of vector i is the title of docs[i].
titles = [dictionary['title'] for dictionary in docs]
#print(print(titles))
finalDf = principalDf
assert len(titles) == len(finalDf), \
    "number of document vectors does not match the number of training documents"

# double check to be sure we got labels just right
#sample_title = training_set[10]['title']
#sample_text = training_set[10]['flattened_entities']
#inferred_vector = model.infer_vector(gensim.utils.simple_preprocess(sample_text))
#print(gensim.utils.simple_preprocess(sample_text))
# pca sample
#x = np.array(inferred_vector)
#pca = PCA(n_components=2)
#pca_result = pca.fit_transform(x)
#trace_sample = go.Scatter(
#        x = pca_result[0],
#        y = pca_result[1],
#        mode = 'markers',
#            #name = 'blue markers',
#        marker = dict(
#            size = 7,
#            color = 'green',
#        ),
#        text = str(sample_title)
#    )
#traces.append(trace_sample)


# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned
for i in range(len(finalDf)):
    # colour the point by topic: keywords in the title first, then the entities
    color = 'rgba(0, 0, 110, .8)'
    if 'Apple' in titles[i]:
        color = 'red'
    elif 'Amazon' in titles[i]:
        color = 'yellow'
    elif 'Facebook' in titles[i] or 'Instagram' in titles[i]:
        color = 'green'
    elif 'spazio' in docs[i][FLATTENED_ENTITIES_FIELD]:
        color = 'black'

    trace0 = go.Scatter(
        x = finalDf.loc[i:i, COMPONENT_ONE],
        y = finalDf.loc[i:i, COMPONENT_TWO],
        mode = 'markers',
            #name = 'blue markers',
        marker = dict(
            size = 7,
            color = color,
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

data = traces 
layout = dict(title = 'PCA Representantion of D2V on Flattened Entities',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!

fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TA_model_flattened_entities-2')

## K-Means on PCA-reduced data

In [7]:
# let's try another way for clustering data: K-Mean, an even more popular algorithm,
# which I know from the introductory course on AI, so it might be better 
# to utilize algorithms which I know and can talk about in the presentation
from sklearn.cluster import KMeans

# build k-means model
# `algorithm` is left at its default: the explicit value 'auto' is deprecated
# and later rejected by newer scikit-learn releases, while on older releases
# 'auto' is the default anyway.
# NOTE: random_state is not set, so cluster numbering may change between runs.
kmeans = KMeans(n_clusters = 5, max_iter=600, verbose=0,
               init='k-means++', n_init=10) 

kmeans.fit(principalComponents) # data, as vectors of documents (in 2D)



KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=600,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

## Visualize centroids 

In [8]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

import os

# Plotly credentials are read from the environment so that no secret is stored
# in the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before running this
# cell; a missing variable raises KeyError.
# NOTE(review): the API key that used to be hard-coded here has been exposed
# and should be revoked.
tls.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                         api_key=os.environ['PLOTLY_API_KEY'])

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"
centroids = kmeans.cluster_centers_

# Colour of the points of each cluster, by centroid index; any further cluster is black.
CLUSTER_COLORS = ['blue', 'pink', 'yellow', 'green']
DEFAULT_CLUSTER_COLOR = 'black'

# Start from an empty list so this figure only contains the points drawn here
# (the list is otherwise still filled with the traces of the previous figure).
traces = []

# For every point, the index of the closest centroid (euclidean distance).
# Rows are points, columns are centroids; ties go to the lowest centroid index.
points = finalDf[[COMPONENT_ONE, COMPONENT_TWO]].values
distances = np.linalg.norm(points[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2)
closest_centroid_index = distances.argmin(axis=1)

# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned
for i in range(len(finalDf)):
    # assign a color to each point according to the cluster it belongs to
    cluster_index = closest_centroid_index[i]
    if cluster_index < len(CLUSTER_COLORS):
        color = CLUSTER_COLORS[cluster_index]
    else:
        color = DEFAULT_CLUSTER_COLOR

    trace0 = go.Scatter(
        x = finalDf.loc[i:i, COMPONENT_ONE],
        y = finalDf.loc[i:i, COMPONENT_TWO],
        mode = 'markers',
            #name = 'blue markers',
        marker = dict(
            size = 7,
            color = color,
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

# draw centroids
c_trace = go.Scatter(
    x = centroids[:, 0],
    y = centroids[:, 1],
    mode = 'markers',
    marker = dict(
        size = 9,
        color = 'red',
    ),
    text = 'centroid'
)
traces.append(c_trace)

data = traces 
layout = dict(title = 'PCA Representantion of DocVectors',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!

fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TA_model_entities_kmeans')

# DBSCAN

## Load model

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import  gensim

# Full paths of the two saved models: file 0 is the first model of the training
# cell (PV-DBOW), file 1 the second one (PV-DM).
# The first path is named MODEL_ONE (the name the commented-out load below
# already uses) so that MODEL_NAME, the path prefix used when saving and when
# loading for the PCA plot, is not overwritten.
MODEL_ONE = 'Models/d2v_TA_f_entities0.model'
MODEL_TWO = 'Models/d2v_TA_f_entities1.model'
#model = Doc2Vec.load(MODEL_ONE)
model = Doc2Vec.load(MODEL_TWO)
inferred_vectors = []
# print out dimension of the vocabulary 
print(len(model.wv.vocab))
#print(model.most_similar(positive=['re', 'donna'], negative=['uomo']))

4610


  del sys.path[0]


KeyError: "word 're' not in vocabulary"

In [11]:
from sklearn.cluster import DBSCAN

def perform_dbscan(eps = 0.4, min_samples = 4, metric = 'euclidean', algorithm = 'auto', data = None,
                   verbose = True, titles = None):
    """Run DBSCAN over `data` and return the fitted DBSCAN object.

    Parameters
    ----------
    eps, min_samples, metric, algorithm
        Passed unchanged to the DBSCAN constructor.
    data
        The samples to cluster, passed to DBSCAN.fit. Required.
    verbose : bool
        When true, print the members of every cluster and the noise samples.
    titles : sequence, optional
        titles[i] is the display name of sample i. When omitted, the sample
        index is printed instead.

    Returns
    -------
    The fitted DBSCAN object. Its `labels_` attribute holds the cluster number
    of each sample, or -1 for samples considered noise.

    Raises
    ------
    ValueError
        If `data` is None.
    """
    if data is None:
        raise ValueError("perform_dbscan needs the samples to cluster in `data`")

    db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm).fit(data)

    if verbose:
        # fall back to the sample index when no display names were given
        names = titles if titles is not None else list(range(len(db.labels_)))

        print("##Clusters##")
        # Members are grouped by their actual label; a positional list would
        # mix clusters whenever labels are first met out of numerical order.
        clusters = {}
        noise = []
        for i, label in enumerate(db.labels_):
            if label == -1:
                # -1 marks a sample that does not belong to any cluster
                noise.append(names[i])
            else:
                clusters.setdefault(label, []).append(names[i])
        for label in sorted(clusters):
            print("Cluster:", clusters[label])
        print("Noise: ", noise)

        print("DBSCAN finished.\n")
    return db

In [16]:
## load test-corpus
import json
import gensim

# Read the held-out documents back from disk.
with open('TOWL_f_entities_test_corpus.json', 'r') as json_file:
    json_data = json.load(json_file)
# the file is expected to contain a list of documents
assert isinstance(json_data, list)

# For every document keep its title (used as a label) and the tokens obtained
# by preprocessing its flattened entities.
titles = []
test_corpus = []
for document in json_data:
    titles.append(document['title'])
    test_corpus.append(gensim.utils.simple_preprocess(document['flattened_entities']))

print("Number of documents: ", len(test_corpus))
print(test_corpus[:2])

# Infer one vector per test document with the loaded model.
inferred_vectors = []
for tokens in test_corpus:
    inferred_vectors.append(model.infer_vector(tokens))

Number of documents:  400
[['africa', 'classe_sociale', 'economia', 'ibm', 'multinazionale', 'politica', 'tempo', 'wi', 'fi'], ['playstation', 'sport']]


In [17]:
# Cluster every inferred test vector using the cosine metric; only the labels
# are printed, the per-cluster listing is switched off.
dbscan_settings = dict(eps = 0.48, min_samples = 2, metric = 'cosine', algorithm = 'auto')
db = perform_dbscan(data = inferred_vectors, titles = titles, verbose = False, **dbscan_settings)
print(db.labels_)

[ 0  0  0  0 -1  0  0  0  0  0 -1  0 -1  0  0  0  0  0  0  0  0  0  1  0
  0 -1 -1  0  0  0  0  0 -1 -1 -1  0  0  2 -1  0 -1  0 -1  0  0  0 -1  0
  0  0  0  0 -1  0 -1 -1  0  0 -1 -1  0 -1  0 -1 -1 -1  0 -1  0  0 -1  0
  3  0  0  0 -1  0 -1  0  0  1  0 -1 -1 -1  4  0  0 -1  0 -1 -1  0  0  0
  0 -1  0  0 -1  0  0  0  0  0  5  0  5  0  0 -1 -1  0  0  0  0  0 -1  0
  0 -1  0 -1  0  0 -1  0 -1 -1  0  0 -1 -1  0  0  0 -1  0  0  0 -1  0  0
  0  0 -1  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1 -1  0  0  0 -1 -1 -1  2
 -1  0 -1  0  0  0  0  0  0  0  0 -1  0 -1 -1 -1  0  0 -1  0  0  0  0 -1
  0  0  0 -1  0  0  6 -1  0  0  0 -1  0  0  0 -1 -1 -1  0  0 -1  0 -1 -1
  0  0  0 -1  0  0  0 -1  0  0  0  0  0 -1  0  0  0  0  0 -1  0  0  0  0
 -1 -1  0 -1 -1 -1  0  0  0 -1 -1  0  0  0 -1 -1  0  0 -1 -1 -1  0  0 -1
  0 -1 -1  0  0  3 -1 -1 -1 -1 -1 -1  0  0  0 -1 -1  0  0  0  0  0  0  0
  0  0 -1 -1  0  0  0  0 -1  0  0  0  0  0 -1  0  0  0  0 -1  0 -1 -1 -1
  0 -1  6  0 -1  0  0  0  0  0  0  0  0  0  0 -1 -1

In [25]:
# let's take a few documents, randomly chosen from the inferred vectors
import random
subsample_size = 10

# Draw distinct indices: picking with replacement could select the same document
# twice, and two identical vectors would form a cluster on their own when
# min_samples is 2. The size is capped so that a short corpus does not fail.
sampled_indices = random.sample(range(len(inferred_vectors)),
                                min(subsample_size, len(inferred_vectors)))
subsamples = [inferred_vectors[index] for index in sampled_indices]
titles2 = [titles[index] for index in sampled_indices]

db = perform_dbscan(eps = 0.4, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = subsamples, verbose = True, titles = titles2)

##Clusters##
Cluster: ['IoT: Notizie del giorno - News online su Punto Informatico', 'TMux | Punto Informatico', 'Il coding senza età: una donna ha creato un’app a 81 anni - Corriere.it', 'Facebook: ex moderatrice fa causa, traumatizzata da immagini - Hi-tech - ANSA.it', 'Tecnologia - Pagina 3 di 73 - Il Post']
Noise:  ['Tech - Pagina 6 - Wired', 'Redstone 5 sarà Windows 10 October 2018 Update', 'Ecco cosa scatena il sonno incontrollabile nei pazienti narcolettici - Repubblica.it', 'Machine learning - Wired', 'Futuri robot con pelle che sente il vento - Hi-tech - ANSA.it']
DBSCAN finished.

