# Model Testing

## Example of how testing is going to work
Take 3 documents: 2 similar ones and 1 quite different; check whether the model recognizes this difference,
which, in the live version, should result in the three articles not being grouped in the same region.

In [5]:
# Three sample documents (Italian news articles) used as a sanity check:
# the first two both report on a wildfire ("incendio"), the third is a
# stock-market report, so the first pair is expected to be the similar one.
data = ["Un incendio è scoppiato, nel tardo pomeriggio di ieri, nella zona delle cave a Bozzano, nel padule di Massarosa. Sul posto sono intervenuti i vigili del fuoco per domare l’incendio. Un odore di bruciato si è levato nella serata dalla zona delle fiamme fino ad invadere Viareggio, tra le proteste dei cittadini.",
       "Spento l'incendio nel Compitese, ora la bonifica Lucca: sotto controllo gli ultimi focolai nella zona di San Leonardo in Treponzio. L'ipotesi più accreditata sull'origine rimane quella del fulmine.  La situazione, nella mattinata di martedì 2 ottobre, ha visto l'incendio praticamente domato, a parte alcuni focolai che comunque non destano preoccupazione. E' così potuta partire l'operazione di bonifica, grazie a due squadre dell'antincendio boschivo. Per quanto riguarda le cause del rogo, partito dalla zona di Pieve di Compito e poi arrivato fino a San Leonardo in treponzio, al momento una delle più accreditate resta quella del fulmine caduto durante il temporale di lunedì notte. Le forze dell'ordine, comunque, non escludono alcuna possibilità e per questo stanno cercando la presenza di eventuali inneschi.",
        "Borse deboli, continua effetto Powell su titoli di Stato. Spread stabile a 280. Le Borse europee aprono la seduta tutte in flessione mentre proseguono le vendite sui titoli di Stato europei in scia a quanto avvenuto ieri dopo le parole del presidente della Fed Powell sulla possibile accelerazione nella stretta monetaria Usa. Piazza Affari perde mezzo punto percentuale nel FTSE MIB mentre gli altri indici continentali mostrano cali lievemente più contenuti (seguine qui l'andamento). In fondo al listino milanese StMicroelectronics, Saipem e Leonardo tra i titoli industriali ma anche il comparto bancario (visto il rendimento del decennale sempre in area 3,35%) con Banco Bpm. Intesa Sanpaolo e Ubi giù di un punto percentuale. Brillano Moncler (+2,2%) e Eni (+0,9%). Ieri intanto il Governo Conte ha rilasciato il Documento di Economia e di Finanza prevedendo una crescita dell'economia italiana all'1,5% nel 2019 e all'1,6% nell'anno successivo. "
       ]

In [6]:
# Doc2Vec imports
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim

# sanitize data: turn every raw article (a string) into a list of tokens.
# `data` is overwritten in place because later cells read it under that name,
# so only entries that are still raw strings are tokenized; this keeps the
# cell idempotent when it is re-run without re-running the cell above.
data = [gensim.utils.simple_preprocess(doc) if isinstance(doc, str) else doc
        for doc in data]
print(data)

[['un', 'incendio', 'scoppiato', 'nel', 'tardo', 'pomeriggio', 'di', 'ieri', 'nella', 'zona', 'delle', 'cave', 'bozzano', 'nel', 'padule', 'di', 'massarosa', 'sul', 'posto', 'sono', 'intervenuti', 'vigili', 'del', 'fuoco', 'per', 'domare', 'incendio', 'un', 'odore', 'di', 'bruciato', 'si', 'levato', 'nella', 'serata', 'dalla', 'zona', 'delle', 'fiamme', 'fino', 'ad', 'invadere', 'viareggio', 'tra', 'le', 'proteste', 'dei', 'cittadini'], ['spento', 'incendio', 'nel', 'compitese', 'ora', 'la', 'bonifica', 'lucca', 'sotto', 'controllo', 'gli', 'ultimi', 'focolai', 'nella', 'zona', 'di', 'san', 'leonardo', 'in', 'treponzio', 'ipotesi', 'più', 'accreditata', 'sull', 'origine', 'rimane', 'quella', 'del', 'fulmine', 'la', 'situazione', 'nella', 'mattinata', 'di', 'martedì', 'ottobre', 'ha', 'visto', 'incendio', 'praticamente', 'domato', 'parte', 'alcuni', 'focolai', 'che', 'comunque', 'non', 'destano', 'preoccupazione', 'così', 'potuta', 'partire', 'operazione', 'di', 'bonifica', 'grazie', 'd

## Load model and infer vectors from data

In [64]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import  gensim

# Locations of the two saved Doc2Vec models that can be evaluated.
MODEL_NAME, MODEL_TWO = (
    'TestModels/d2v_TA_abstract&title0.model',
    'Models/d2v_TA_abstract&title0.model',
)

# The test model is the active one; load MODEL_TWO instead to evaluate the other.
model = Doc2Vec.load(MODEL_NAME)

# Container for the document vectors inferred in the following cells.
inferred_vectors = list()


In [None]:
# Infer one vector per sample document (infer_vector takes a list of tokens).
# The list is rebuilt from scratch so that re-running this cell does not keep
# appending to the result of a previous run.
inferred_vectors = [model.infer_vector(doc) for doc in data]

# cosine similarity (scikit-learn); each call returns a 1x1 matrix
print(cosine_similarity([inferred_vectors[0]], [inferred_vectors[1]])) # the similar documents
print(cosine_similarity([inferred_vectors[0]], [inferred_vectors[2]]))
print(cosine_similarity([inferred_vectors[1]], [inferred_vectors[2]]))
# Cross-check of the similar pair computed directly with numpy.
# NOTE(review): this replaces a call to `gensim.models.cosine_similarity`,
# which gensim.models does not appear to provide - confirm against the
# installed gensim version.
first_vec, second_vec = inferred_vectors[0], inferred_vectors[1]
print(np.dot(first_vec, second_vec) / (np.linalg.norm(first_vec) * np.linalg.norm(second_vec)))
# euclidean distance
print(np.linalg.norm(inferred_vectors[0]-inferred_vectors[1])) # the similar documents
print(np.linalg.norm(inferred_vectors[0]-inferred_vectors[2]))
print(np.linalg.norm(inferred_vectors[1]-inferred_vectors[2]))

## Load Wikipedia articles test-set
Another approach: run DBSCAN on the vectors inferred from the Wikipedia articles.

In [8]:
import json

# Read the Wikipedia test set; the dump is expected to be a JSON list.
with open('wikipedia_dump.json', 'r') as json_file:
    json_data = json.load(json_file)
assert isinstance(json_data, list)
print("We have {} articles".format(len(json_data)))

# Two articles are reported as similar at or above this cosine similarity.
similarity_threshold = 0.5

# One inferred vector per article abstract, with the title at the same index.
inferred_vectors = [
    model.infer_vector(gensim.utils.simple_preprocess(article['abstract']))
    for article in json_data
]
titles = [article['title'] for article in json_data]
assert len(titles)==len(inferred_vectors)

# For now, just print every ordered pair of distinct articles whose vectors
# reach the threshold (so each similar pair shows up in both directions).
n_articles = len(inferred_vectors)
for i in range(n_articles):
    for j in range(n_articles):
        if i == j:
            continue
        sim = cosine_similarity([inferred_vectors[i]], [inferred_vectors[j]])
        if sim >= similarity_threshold:
            print("{0} is similar to {1} with a score of: {2}\n".format(titles[i], titles[j], sim))

We have 27 articles
Lampone is similar to Ciliegia with a score of: [[0.531212]]

Italia is similar to Berlino with a score of: [[0.57929134]]

Italia is similar to Colosseo with a score of: [[0.538211]]

Italia is similar to Parigi with a score of: [[0.57991326]]

Italia is similar to Bologna with a score of: [[0.5998037]]

Italia is similar to Roma with a score of: [[0.6075661]]

Italia is similar to Milano with a score of: [[0.57625943]]

Berlino is similar to Italia with a score of: [[0.57929134]]

Berlino is similar to Parigi with a score of: [[0.6772896]]

Berlino is similar to Londra with a score of: [[0.58199894]]

Berlino is similar to Bologna with a score of: [[0.5809386]]

Berlino is similar to Roma with a score of: [[0.6819624]]

Berlino is similar to Milano with a score of: [[0.5700284]]

Colosseo is similar to Italia with a score of: [[0.538211]]

Colosseo is similar to New York with a score of: [[0.52936727]]

Colosseo is similar to Parigi with a score of: [[0.5569618]]


# DBSCAN

In [53]:
from sklearn.cluster import DBSCAN

def perform_dbscan(eps = 0.4, min_samples = 4, metric = 'euclidean', algorithm = 'auto', data = None, verbose = True, names = None):
    """Run DBSCAN over the given data and optionally print the clusters found.

    Parameters
    ----------
    eps, min_samples, metric, algorithm
        Passed unchanged to ``sklearn.cluster.DBSCAN``.
    data
        The samples to cluster (in this notebook: a list of inferred document
        vectors). Required despite the ``None`` default.
    verbose
        When True, print the names grouped by cluster, followed by the names
        of the samples labelled as noise.
    names
        One display name per sample, in the same order as ``data``. When
        omitted, the notebook-level ``titles`` list is used. Only read when
        ``verbose`` is True.

    Returns
    -------
    The fitted DBSCAN object.

    Raises
    ------
    ValueError
        If ``data`` is None, or if ``verbose`` is True and the number of names
        differs from the number of clustered samples.
    """
    if data is None:
        raise ValueError("perform_dbscan needs `data`: the samples to cluster")

    db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm).fit(data)

    if verbose:
        sample_names = titles if names is None else names
        if len(sample_names) != len(db.labels_):
            raise ValueError(
                "got {0} names for {1} samples; names must line up with data".format(
                    len(sample_names), len(db.labels_)))

        # db.labels_ holds the cluster number of each sample, or -1 when the
        # sample is considered noise (not belonging to any cluster).
        # Grouping by label in a dict keeps every name under its own label
        # regardless of the order in which labels are first encountered.
        clusters = {}
        noise = []
        for name, label in zip(sample_names, db.labels_):
            if label == -1:
                noise.append(name)
            else:
                clusters.setdefault(int(label), []).append(name)

        print("##Clusters##")
        for label in sorted(clusters):
            print("Cluster:", clusters[label])
        print("Noise: ", noise)

        print("DBSCAN finished.\n")
    return db

In [29]:
# Try out different eps values with the cosine metric on the inferred vectors.
# (eps 0.2 and 0.3, with both the euclidean and the cosine metric, were
# explored earlier and are no longer run.)
cosine_options = dict(min_samples=2, metric='cosine', algorithm='auto', data=inferred_vectors)
perform_dbscan(eps=0.5, **cosine_options)
perform_dbscan(eps=0.45, **cosine_options)
perform_dbscan(eps=0.55, **cosine_options)

##Clusters##
Cluster: ['Lampone', 'Ciliegia', 'Mela', 'Melone', 'Banana']
Cluster: ['Italia', 'Berlino', 'Colosseo', 'New York', 'Parigi', 'Londra', 'Bologna', 'Roma', 'Pizza', 'Milano', 'Quirinale']
Noise:  ['Microsoft', 'Casa Bianca', 'Amazon', 'Instagram', 'Pasta', 'Facebook', 'USA', 'Pera', 'Google', 'Apple', 'Hot dog']
DBSCAN finished.

##Clusters##
Cluster: ['Italia', 'Berlino', 'Colosseo', 'New York', 'Parigi', 'Londra', 'Bologna', 'Roma', 'Milano']
Noise:  ['Microsoft', 'Lampone', 'Casa Bianca', 'Amazon', 'Instagram', 'Ciliegia', 'Mela', 'Pasta', 'Melone', 'Banana', 'Facebook', 'USA', 'Pizza', 'Quirinale', 'Pera', 'Google', 'Apple', 'Hot dog']
DBSCAN finished.

##Clusters##
Cluster: ['Microsoft', 'Amazon', 'Apple']
Cluster: ['Lampone', 'Casa Bianca', 'Italia', 'Berlino', 'Colosseo', 'New York', 'Ciliegia', 'Mela', 'Parigi', 'Londra', 'Bologna', 'Roma', 'Melone', 'Banana', 'USA', 'Pizza', 'Milano', 'Quirinale', 'Hot dog']
Cluster: ['Instagram', 'Facebook']
Noise:  ['Pasta', 'Per

[12, 23, 24]

# Wikipedia 3-documents-test
Test our model with 2 similar documents, and one chosen randomly

In [43]:
# Cosine-similarity cut-off: two documents count as similar when their score
# is above this value.
similarity_threshold = 0.5

# Read the test file; it is expected to hold a JSON list of document groups.
import json
with open('wikipedia_3docs_dump.json', 'r') as json_file:
    json_data = json.load(json_file)
assert isinstance(json_data, list)
print("We have {} docs tuples".format(len(json_data)))

# inferred_vectors is a list of lists: one inner list of vectors per group.
inferred_vectors = [
    [model.infer_vector(gensim.utils.simple_preprocess(doc['abstract'])) for doc in doc_group]
    for doc_group in json_data
]
# titles stays flat, following the same group-by-group order as the vectors.
titles = [doc['title'] for doc_group in json_data for doc in doc_group]

print(titles)

We have 18 docs tuples
['Banana', 'Ciliegia', 'Riccardo Chiarini', 'Acqua', 'Ghiaccio', 'Let There Be Love', 'Fuoco', 'Fiamme', 'Gerónimo de Aguilar', 'Juventus', 'Real Madrid', 'Le tre scimmie', 'NASA', 'Marte', 'Sindaci di Brindisi', 'Vento', 'Uragano', 'Au4', 'Roma', 'Berlino', 'Jean de Locquenghien', 'iPad', 'iPhone', 'Chibly Langlois', 'Apple', 'Microsoft', 'Chiesa di Santo Stefano (Miglieglia)', 'Google', 'Amazon', 'Hauntology', 'Mela', 'Pera', 'Sin gisaeng dyeon', 'Lampone', 'Melone', 'Harrington (Washington)', 'Instagram', 'Facebook', 'Kevin Schmidt', 'Parigi', 'Londra', 'Schiava (vitigno)', 'Milano', 'Bologna', 'Decreto di attuazione degli statuti', 'Marte', 'Venere', 'Scotty 2 Hotty', 'Mark Zuckerberg', 'Facebook', 'Festival da Canção', 'Cometa', 'Stella', 'Miglioramento paretiano']


## Test similarity 

In [44]:
# The model gives a correct answer for a group when it rates the two linked
# documents as similar (score >= threshold) and the third document as not
# similar (score <= threshold) to either of them.
correct = 0
for i, linked_docs in enumerate(inferred_vectors):
    # `titles` is a flat list while `inferred_vectors` is grouped, so group i
    # owns titles[3*i : 3*i + 3]; that only lines up for groups of exactly 3.
    if len(linked_docs) != 3:
        raise ValueError("group {0} has {1} documents, expected 3".format(i, len(linked_docs)))
    first_title, second_title, other_title = titles[3 * i:3 * i + 3]
    print("('{0}', '{1}')---{2}".format(first_title, second_title, other_title))

    # cosine_similarity returns a 1x1 matrix for a single pair; [0][0] unwraps
    # it so that comparisons and printed scores use a plain number.
    cosine_s = cosine_similarity([linked_docs[0]], [linked_docs[1]])[0][0]
    if cosine_s < similarity_threshold:
        print("Not similar ", cosine_s)
        continue
    cosine_s = cosine_similarity([linked_docs[0]], [linked_docs[2]])[0][0]
    if cosine_s > similarity_threshold:
        print("Similar 0-2", cosine_s)
        continue
    cosine_s = cosine_similarity([linked_docs[1]], [linked_docs[2]])[0][0]
    if cosine_s > similarity_threshold:
        print("Similar 1-2", cosine_s)
        continue
    correct = correct + 1
    print("Guessed right!")

print("Correct guesses: {0} over {1} examples".format(correct, len(inferred_vectors)))

('Banana, 'Ciliegia')---Riccardo Chiarini
Similar 1-2 [[0.5008852]]
('Acqua, 'Ghiaccio')---Let There Be Love
Guessed right!
('Fuoco, 'Fiamme')---Gerónimo de Aguilar
Similar 0-2 [[0.5075566]]
('Juventus, 'Real Madrid')---Le tre scimmie
Guessed right!
('NASA, 'Marte')---Sindaci di Brindisi
Not similar  [[0.44681236]]
('Vento, 'Uragano')---Au4
Not similar  [[0.4374466]]
('Roma, 'Berlino')---Jean de Locquenghien
Guessed right!
('iPad, 'iPhone')---Chibly Langlois
Guessed right!
('Apple, 'Microsoft')---Chiesa di Santo Stefano (Miglieglia)
Guessed right!
('Google, 'Amazon')---Hauntology
Not similar  [[0.3489512]]
('Mela, 'Pera')---Sin gisaeng dyeon
Guessed right!
('Lampone, 'Melone')---Harrington (Washington)
Not similar  [[0.42072335]]
('Instagram, 'Facebook')---Kevin Schmidt
Not similar  [[0.42628554]]
('Parigi, 'Londra')---Schiava (vitigno)
Guessed right!
('Milano, 'Bologna')---Decreto di attuazione degli statuti
Guessed right!
('Marte, 'Venere')---Scotty 2 Hotty
Guessed right!
('Mark Zuck

# Hold-out test corpus Clustering and Visualization
A kind of live simulation.

In [65]:
## load test-corpus
import json
import gensim

# Read the hold-out corpus; it is expected to be a JSON list of dicts that
# each carry a 'title' and an 'abstract'.
with open('TOWL_test_corpus.json', 'r') as json_file:
    json_data = json.load(json_file)
# we're expecting a list
assert isinstance(json_data, list)
titles = [dictionary['title'] for dictionary in json_data]
# Title and abstract are joined with a space: concatenating them directly
# would fuse the last word of the title with the first word of the abstract
# into a single token.
test_corpus = [gensim.utils.simple_preprocess(d['title'] + ' ' + d['abstract']) for d in json_data]
print("Number of documents: ", len(test_corpus))

Number of documents:  200


In [72]:
# Infer one vector per tokenized document of the test corpus ...
inferred_vectors = []
for test_doc in test_corpus:
    inferred_vectors.append(model.infer_vector(test_doc))

# ... then cluster the vectors with DBSCAN (cosine metric), without printing.
db = perform_dbscan(data = inferred_vectors, eps = 0.25, min_samples = 2,
                    metric = 'cosine', algorithm = 'auto', verbose = False)

## Data visualizing using PCA

In [73]:
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# loading dataset into Pandas DataFrame
df = pd.DataFrame.from_records(inferred_vectors)
# PCA is effected by scale so you need to scale the features in your data before applying PCA. 
vec_size = 100
features = [i for i in range(vec_size)]

x = df.loc[:, features].values # get features values
# standardize data
x = StandardScaler().fit_transform(x) # scale data (especially in case different measures are used)
# build PCA model in 2D
pca = PCA(n_components=2) # The new components are just the two main dimensions of variation.

principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
principalDf.head()
# these components drawn don't hold a lot of information 'per-se', they're just the result 
# of dimension-reduction

finalDf = principalDf 


In [74]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go
from scipy.spatial import distance

import os

# Credentials come from the environment so that no secret is stored in the
# notebook. NOTE(review): a key was previously committed in this cell; it
# should be treated as compromised and rotated.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if not plotly_username or not plotly_api_key:
    raise RuntimeError("Set the PLOTLY_USERNAME and PLOTLY_API_KEY environment variables before plotting")
tls.set_credentials_file(username=plotly_username, api_key=plotly_api_key)

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"

# Marker colour per DBSCAN label (-1 is noise); any other label falls back to
# OTHER_CLUSTER_COLOR.
CLUSTER_COLORS = {
    0: 'red',
    1: 'pink',
    2: 'yellow',
    3: 'blue',
    4: 'violet',
    -1: 'green',
}
OTHER_CLUSTER_COLOR = 'black'

titles = [dictionary['title'] for dictionary in json_data]
traces = []
clusters_indices = db.labels_

# Points, cluster labels and titles are matched by position.
assert len(finalDf)==len(clusters_indices)
assert len(finalDf)==len(titles)
# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned
for i in range(len(finalDf)):
    x , y = finalDf.iat[i, 0], finalDf.iat[i, 1]
    # colour the point according to its cluster
    color = CLUSTER_COLORS.get(int(clusters_indices[i]), OTHER_CLUSTER_COLOR)

    trace0 = go.Scatter(
        x = [x],
        y = [y],
        mode = 'markers',
        marker = dict(
            size = 7,
            color = color,
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

layout = dict(title = 'PCA Representantion of Test Data with DBSCAN',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!
# The traces are passed directly instead of through a `data` variable, which
# would overwrite the sample documents defined at the top of the notebook.
fig = dict(data = traces, layout = layout)
py.iplot(fig, filename='TOWL_model_testing')