# Model Testing

## Example of how testing is going to work
Take 3 documents: 2 similar, and a 1 quite different; check if model recognizes this difference,
which results, in the live version, to not group the three articles in the same region

## Load model and infer vectors from data

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import  gensim

MODEL_NAME = 'TestModels/d2v_TA_abstract&title0.model'
MODEL_TWO = 'Models/d2v_TA_abstract&title0.model'
#model = Doc2Vec.load(MODEL_NAME)
model = Doc2Vec.load(MODEL_TWO)
inferred_vectors = []
# print out dimension of the vocabulary 
print(len(model.wv.vocab))
#print(model.most_similar(positive=['re', 'donna'], negative=['uomo']))

5923


## Load Wikipedia articles test-set
other approach: DBSCAN the vectors inferred from wikipedia

In [3]:
import json

with open('wikipedia_dump.json', 'r') as json_file:
    json_data = json.load(json_file)
# we're expecting a list
assert isinstance(json_data, list)
print("We have {} articles".format(len(json_data)))
titles = []
inferred_vectors = []
similarity_threshold = 0.5

for dictionary in json_data:
    inferred_vectors.append(model.infer_vector(gensim.utils.simple_preprocess(dictionary['abstract'])))
    titles.append(dictionary['title'])
assert len(titles)==len(inferred_vectors)

# for now, let's just print out documents similiraties among them
for i, docvec in enumerate(inferred_vectors):
    for j, docv in enumerate(inferred_vectors):
        # let's just write out the most similar vectors
        sim = cosine_similarity([docvec], [docv])
        if sim>=similarity_threshold and i != j:
            #print("{0} is similar to {1} with a score of: {2}\n".format(titles[i], titles[j], sim))

We have 27 articles
Italia is similar to Berlino with a score of: [[0.52789956]]

Italia is similar to Parigi with a score of: [[0.55656326]]

Italia is similar to Bologna with a score of: [[0.5959468]]

Italia is similar to Roma with a score of: [[0.5449377]]

Italia is similar to Milano with a score of: [[0.5863396]]

Berlino is similar to Italia with a score of: [[0.52789956]]

Berlino is similar to Parigi with a score of: [[0.6624329]]

Berlino is similar to Londra with a score of: [[0.54691124]]

Berlino is similar to Bologna with a score of: [[0.5238744]]

Berlino is similar to Roma with a score of: [[0.5870053]]

Berlino is similar to USA with a score of: [[0.51547414]]

Berlino is similar to Milano with a score of: [[0.57719976]]

Berlino is similar to Quirinale with a score of: [[0.5933477]]

Colosseo is similar to New York with a score of: [[0.59265566]]

Colosseo is similar to Parigi with a score of: [[0.5562386]]

Colosseo is similar to Londra with a score of: [[0.5239558]]

# DBSCAN

In [2]:
# my function for performing dbscan and printing out cluster results
def perform_dbscan(eps = 0.4, min_samples = 4, metric = 'euclidean', algorithm = 'auto', data = None, verbose = True
                  , titles = None, print_noise = True):
    """perform DBSCAN over given data, using given parametrs. Returns dbscan object and clusters dictionary."""
    
    db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm).fit(data)

    # labels will print out the number of the cluster each example belongs to;
    # -1 if the vector is considered noise (not belonging to any cluster)
    #print("Labels: ", db.labels_)

    # create data structure containing clusters
    clusters_to_ret = {label:[] for label in db.labels_ if label!=-1}
    
    for i, label in enumerate(db.labels_):
        if label != -1: #ignore noise points
            clusters_to_ret[label].append(urls[i])
        
    
    
    # only do this if you need to print out the result (messy for large number of docs)
    if verbose:
        print("##Clusters##")
        clusters = {label: [] for label in db.labels_ if label!=-1}
        noise = []
        for i, label in enumerate(db.labels_):
            if label != -1: 
                clusters[label].append(titles[i])
            else: # save noise points
                noise.append(titles[i])
                
        for label, list_ in clusters.items():
            print("Cluster {0}: {1}".format(label, list_))
        if print_noise:
            print("Noise: ", noise)

        print("DBSCAN finished.\n")
    return db, clusters_to_ret

# DBSCAN incremental version

In [11]:
# DBSCAN the mini-set of docs from wikipedia
eps = 0.4
eps_increment = .15
db = perform_dbscan(eps = eps, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = inferred_vectors, verbose = True, titles = titles)
noise_data = [inferred_vectors[i] for i, label in enumerate(db.labels_) if label==-1]
noise_titles = [titles[i] for i, label in enumerate(db.labels_) if label==-1]
db = perform_dbscan(eps = eps + eps_increment, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = noise_data, verbose = True, titles = noise_titles)
print('Done')
# let's try and find other clusters in the noise data, with higher eps
noise_data = [inferred_vectors[i] for i, label in enumerate(db.labels_) if label==-1]
noise_titles = [titles[i] for i, label in enumerate(db.labels_) if label==-1]
db = perform_dbscan(eps = eps + eps_increment, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = noise_data, verbose = True, titles = noise_titles)

##Clusters##
Cluster: ['Berlino', 'Parigi']
Cluster: ['Bologna', 'Roma', 'Milano']
Noise:  ['Microsoft', 'Lampone', 'Casa Bianca', 'Italia', 'Amazon', 'Colosseo', 'New York', 'Instagram', 'Ciliegia', 'Mela', 'Pasta', 'Londra', 'Melone', 'Banana', 'Facebook', 'USA', 'Pizza', 'Quirinale', 'Pera', 'Google', 'Apple', 'Hot dog']
DBSCAN finished.

##Clusters##
Cluster: ['Microsoft', 'Apple']
Cluster: ['Lampone', 'Ciliegia', 'Mela', 'Melone', 'Banana', 'Pera']
Cluster: ['Casa Bianca', 'Italia', 'Colosseo', 'New York', 'Londra', 'USA', 'Pizza', 'Quirinale']
Cluster: ['Instagram', 'Facebook', 'Google']
Noise:  ['Amazon', 'Pasta', 'Hot dog']
DBSCAN finished.



In [4]:

# let's try out different options

#perform_dbscan(eps = 0.3, min_samples = 2, metric = 'euclidean', algorithm = 'auto', data = inferred_vectors)
#perform_dbscan(eps = 0.3, min_samples = 2, metric = 'cosine', algorithm = 'auto', data = inferred_vectors)
perform_dbscan(eps = 0.5, min_samples = 2, metric = 'cosine', algorithm = 'auto', data = inferred_vectors)
#perform_dbscan(eps = 0.2, min_samples = 2, metric = 'euclidean', algorithm = 'auto', data = inferred_vectors)
#perform_dbscan(eps = 0.2, min_samples = 2, metric = 'cosine', algorithm = 'auto', data = inferred_vectors)
perform_dbscan(eps = 0.45, min_samples = 2, metric = 'cosine', algorithm = 'auto', data = inferred_vectors)
perform_dbscan(eps = 0.55, min_samples = 2, metric = 'cosine', algorithm = 'auto', data = inferred_vectors)

##Clusters##


TypeError: 'NoneType' object is not subscriptable

# Wikipedia 3-documents-test
Test our model with 2 similar documents, and one chosen randomly

In [24]:
# 2 documents are similar if they get a score higher than this threshold
# - based on cosine similarity -
similarity_threshold = 0.4

# load file
import json
with open('wikipedia_3docs_dump.json', 'r') as json_file:
    json_data = json.load(json_file)
# we're expecting a list
assert isinstance(json_data, list)
print("We have {} docs tuples".format(len(json_data)))
titles = []
inferred_vectors = [] # list of lists

# infer each document vector
for dic_list in json_data:
    vectors = []
    for dictionary in dic_list:
        vec = model.infer_vector(gensim.utils.simple_preprocess(dictionary['abstract']))
        vectors.append(vec)
        titles.append(dictionary['title'])
    inferred_vectors.append(vectors)
    
#print(inferred_vectors[:1])

We have 81 docs tuples


## Test similarity 

In [25]:
# a model gives a correct answer if it correctly classifies the 2 'linked-document' 
# as similar, and the third one as dissimilar to both
correct = 0
j = 0
for i, linked_docs in enumerate(inferred_vectors):
    print("('{0}, '{1}')---{2}".format(titles[j], titles[j+1], titles[j+2]))
    j = j+3
    cosine_s = cosine_similarity([linked_docs[0]], [linked_docs[1]])
    if cosine_s<similarity_threshold:
        print("Not similar ", cosine_s)
        continue
    cosine_s = cosine_similarity([linked_docs[0]], [linked_docs[2]])
    if cosine_s > similarity_threshold:
        print("Similar 0-2", cosine_s)
        continue
    cosine_s = cosine_similarity([linked_docs[1]], [linked_docs[2]])
    if cosine_s > similarity_threshold:
        print("Similar 1-2", cosine_s)
        continue
    correct = correct + 1
    print("Guessed right!")
    
print("Correct guesses: {0} over {1} examples".format(correct, len(inferred_vectors)))

('Infante, 'Figlio')---Argentina Open 2016 - Doppio
Guessed right!
('Uomo, 'Donna')---Emicrania emiplegica familiare
Not similar  [[0.22392115]]
('Samsung, 'LG')---Piazza Carlo III (Napoli)
Guessed right!
('Milano, 'Bologna')---Maria Elena Vandone
Similar 0-2 [[0.4779811]]
('Re, 'Principe')---Candia (unità periferica)
Guessed right!
('Roma, 'Berlino')---Qawmī Tarāna
Similar 0-2 [[0.47933146]]
('Armi, 'Pistola')---Otov
Not similar  [[0.37634563]]
('Brodo, 'Zuppa')---Babette von Bülow
Guessed right!
('Incendio, 'Bruciare')---Criptomorfismo
Not similar  [[0.14342284]]
('Questione, 'Domanda')---Calzino
Not similar  [[0.29680198]]
('iPad, 'iPhone')---Leggi sulla sodomia negli Stati Uniti d'America
Guessed right!
('Chino, 'Curvo')---Battaglia di Thouars
Not similar  [[0.13092655]]
('Regina, 'Principessa')---Aeroporto di Rževka
Not similar  [[0.33889288]]
('Ghepardo, 'Tigre')---Christian Georg Schmorl
Similar 1-2 [[0.42288837]]
('Cadere, 'Cascare')---Edward Fletcher
Similar 0-2 [[0.4400044]]


# Hold-out test corpus Clustering and Visualization
kind of a live-simulation

In [1]:
## load test-corpus
import json
import gensim

with open('TOWL_test_corpus.json', 'r') as json_file:
    json_data = json.load(json_file)
# we're expecting a list
assert isinstance(json_data, list)
titles = [dictionary['title'] for dictionary in json_data]
test_corpus = [gensim.utils.simple_preprocess(d['title']+d['abstract']) for d in json_data]
#test_corpus = [gensim.utils.simple_preprocess(d['flattened_entities']) for d in json_data]
print("Number of documents: ", len(test_corpus))
#print(test_corpus)

Number of documents:  200


In [7]:
# for each document in the test corpus, infer a vector
inferred_vectors = [model.infer_vector(test_doc) for test_doc in test_corpus]
# and perform db scan
db = perform_dbscan(eps = 0.48, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = inferred_vectors, verbose = False, titles = titles)
print(db.labels_)

[-1 -1  0 -1 -1 -1 -1 -1 -1  1  2 -1  3 -1  4  5 -1 -1 -1  6 -1  7 -1 -1
  4  4 -1 -1  0 -1 -1 -1  5 -1 -1 -1 -1 -1 -1  5  7 -1 -1  2  8 -1 -1 -1
 -1  1 -1 -1  5  5 -1  5 -1 -1 -1 -1  9 -1 -1 -1 -1 -1 10 11 -1 -1 -1 -1
  5 -1 -1  7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 -1 -1  5 -1 -1 -1 -1
 -1 -1 -1 12 -1 13  3 -1  4 -1 14 -1 -1  4 -1 -1  5 -1 -1 -1 -1 -1  7 -1
 12 -1 -1 -1 -1  4 -1 -1 -1 -1 -1  5  8 12  5 -1 13  5 14 -1 11 -1 -1 -1
  8 -1 -1 -1 -1 -1 -1 -1 -1  5 13 -1 -1 -1 -1 -1 -1 -1 -1  3 -1 -1 13  4
 15  9 -1 15 -1  4 -1  5 -1 -1 -1 -1 -1 -1 -1 -1  2 -1 -1 -1  5  4  3 -1
 -1  5  6 -1 -1 -1 13 -1]


In [43]:
# we have a lot of noise points; let's try to dbscan those points again, with a higher eps
#print(cosine_similarity([inferred_vectors[0]], [inferred_vectors[3]]))
# create a list of noise inferred vectors
noise = [inferred_vectors[i] for i, index in enumerate(db.labels_) if index==-1]
db = perform_dbscan(eps = 0.52, min_samples = 3, metric = 'cosine', algorithm = 'brute',
                    data = inferred_vectors, verbose = False)
#print(db.labels_)

[-1 -1  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0 -1  0 -1  0  0  0 -1
  0  0  0  1 -1  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0
  0  0  0  0  0  0 -1  0  0 -1  0  0  0  0  0  0  0 -1  0 -1  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0 -1  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  1  0  0  0  0  0
  0  0 -1 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0
  1  0  0 -1  0  0 -1  0  0  0 -1  0  0  0  0 -1  0  0  0  0  0  0  0 -1
  0  0  0  0  0  0  0  0]


In [32]:
from sklearn.metrics.pairwise import pairwise_distances

db = perform_dbscan(eps = 0.25, min_samples = 2, metric = 'cosine', algorithm = 'brute',
                    data = pairwise_distances(inferred_vectors, metric='manhattan'), verbose = False)

## Data visualizing using PCA

In [48]:
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# loading dataset into Pandas DataFrame
df = pd.DataFrame.from_records(inferred_vectors)
# PCA is effected by scale so you need to scale the features in your data before applying PCA. 
vec_size = 100
features = [i for i in range(vec_size)]

x = df.loc[:, features].values # get features values
# standardize data
x = StandardScaler().fit_transform(x) # scale data (especially in case different measures are used)
# build PCA model in 2D
pca = PCA(n_components=2) # The new components are just the two main dimensions of variation.

principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
principalDf.head()
# these components drawn don't hold a lot of information 'per-se', they're just the result 
# of dimension-reduction

finalDf = principalDf 


In [50]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go
from scipy.spatial import distance

tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"

traces = []
clusters_indices = db.labels_

assert len(finalDf)==len(clusters_indices)
# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned
for i in range(len(finalDf)):
    x , y = finalDf.iat[i, 0], finalDf.iat[i, 1]
    color = 'rgba(0, 0, 180, 0.8)'
    """
    # print colors according to cluster
    if clusters_indices[i]==-1:
        color = 'black'
    else:
        red = 10 * clusters_indices[i]
        color = 'rgba({}, 0, 120, .9)'.format(red)
    """
    if clusters_indices[i]==0:
        color = 'red'
    elif clusters_indices[i]==1:
        color = 'pink'
    elif clusters_indices[i]==2:
        color = 'yellow'
    elif clusters_indices[i]==3:
        color = 'blue'
    elif clusters_indices[i]==4:
        color = 'violet'
    elif clusters_indices[i]==5:
        color = 'rgba(100, 100, 100, 1)'
    elif clusters_indices[i]==-1:
        color = 'green'
    else:
        color = 'black'
    
    trace0 = go.Scatter(
        x = [x], 
        y = [y],
        mode = 'markers',
            #name = 'blue markers',
        marker = dict(
            size = 7,
            color = color,
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

data = traces 
layout = dict(title = 'PCA Representantion of Test Data with DBSCAN',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!
    
fig = dict(data = data, layout = layout)
py.iplot(fig, filename='TOWL_model_testing')

## Clustering of small test sub-samples

In [8]:
# let's take a few documents, randomly chosen from the inferred vectors
import random
subsample_size = 25
subsamples = []
titles2 = []
for i in range(subsample_size):
    index = random.randint(0, len(inferred_vectors)-1)
    subsamples.append(inferred_vectors[index])
    titles2.append(titles[index])
    
db = perform_dbscan(eps = 0.48, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = subsamples, verbose = True, titles = titles2)

##Clusters##
Cluster: ['3 Italia, la rete dati non funziona. Ecco cosa sta succedendo - Wired', '3 Italia, la rete dati non funziona. Ecco cosa sta succedendo - Wired']
Cluster: ['Realtà virtuale: il mercato a un punto di svolta', 'Realtà virtuale: il mercato a un punto di svolta']
Noise:  ['I capelli hanno fiuto, molecole odorose li aiutano a crescere - Repubblica.it', 'Ecco perché WhatsApp non funzionava (e se fosse colpa della Juve?) - Corriere.it', 'Intrattenimento', 'Un bug sta paralizzando le poste francesi - Wired', 'Tech: tutti i contenuti per gallery - Pagina 5 - Wired', 'Marchesan, un\'italiana tra gli scienziati emergenti nel mondo: "La sfida più grande? Le differenze di genere" - Repubblica.it', 'Asimo va in pensione, quale sarà il futuro dei robot umanoidi? - Wired', 'La navigazione in incognito è meno sicura di quanto si creda - Repubblica.it', 'Your Phone: dallo smartphone a Windows 10', "Fb, 'war room' in vista voto Usa e Ue - Internet e Social - ANSA.it", 'Microsoft an