In [92]:
import time
import pickle
import random
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

from gensim.models import doc2vec

from crashsimilarity import other_distances as od
from crashsimilarity import tmp
from crashsimilarity.generator import Generator
from crashsimilarity.distance import DistanceCalculator

from sklearn import metrics
from sklearn.metrics import pairwise

In [2]:
vocab = pickle.load(open('data/compressed_vocab.pickle', 'rb'))
pos2vocab = dict([(i[1], i[0]) for i in vocab.items()])
len(vocab)

222895

In [3]:
model = doc2vec.Doc2Vec.load('data/new-model/model_dm_200_30_days.model')
model

INFO:gensim.utils:loading Doc2Vec object from data/new-model/model_dm_200_30_days.model
INFO:gensim.utils:loading wv recursively from data/new-model/model_dm_200_30_days.model.wv.* with mmap=None
INFO:gensim.utils:loading syn0 from data/new-model/model_dm_200_30_days.model.wv.syn0.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:loading docvecs recursively from data/new-model/model_dm_200_30_days.model.docvecs.* with mmap=None
INFO:gensim.utils:loading doctag_syn0 from data/new-model/model_dm_200_30_days.model.docvecs.doctag_syn0.npy with mmap=None
INFO:gensim.utils:loading syn1neg from data/new-model/model_dm_200_30_days.model.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded data/new-model/model_dm_200_30_days.model


<gensim.models.doc2vec.Doc2Vec at 0x7f8c339b3630>

In [4]:
multiple_sig = pickle.load(open('data/multiple_sig_last_year.pickle', 'rb'))
prepared = [sig['prepared'] for sig in multiple_sig]
len(multiple_sig)

261

In [5]:
corpus = []
for p in prepared:
    for t in p:
        corpus.append(t)
candidates = []
for c in corpus:
    if any(['@0x' in t for t in c]):
        continue
    qq = set(c)
    if len(qq) > len(c) / 2.0 and len(c) > 10 and c not in candidates:
        candidates.append(c)
len(candidates)

221

In [6]:
random.shuffle(candidates)

In [7]:
centers = 50
generator = Generator(model, vocab, pos2vocab)
clusters = []
for i in range(centers):
    cnt = random.randint(4, 7)
    clusters.append(generator.generate(candidates[i], cnt))

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


In [84]:
groups = []
corpus = []
true_labels = []
for p in clusters:
    group = []
    for t in p:
        corpus.append(t)
        group.append(len(corpus)-1)
        true_labels.append(len(groups))
    groups.append(group)
len(groups), len(corpus)

(50, 266)

In [11]:
compressed_corpus = od.compressed_tagged_corpus(corpus, vocab)

In [15]:
dist, w2pos =  DistanceCalculator.words_distance(compressed_corpus, model)
distance_calculator = DistanceCalculator(model, w2pos, dist)
pos2w = dict([(i[1], i[0]) for i in w2pos.items()])
pos2vocab = dict([(i[1], i[0]) for i in vocab.items()])

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


len(words) = 895


In [16]:
wmd = od.distance_matrix(compressed_corpus, distance_calculator.fast_wmd)

9.998581359058022%, 11.079784154891968 s.
19.997162718116044%, 22.78269910812378 s.
29.99574407717407%, 34.30571269989014 s.
39.99432543623209%, 49.44223165512085 s.
49.99290679529011%, 59.97441649436951 s.
59.99148815434814%, 68.0391013622284 s.
69.99006951340616%, 81.2457709312439 s.
79.98865087246418%, 91.76011514663696 s.
89.9872322315222%, 103.8180763721466 s.
99.98581359058022%, 115.58069109916687 s.


In [18]:
editd = od.distance_matrix(compressed_corpus, od.edit_distance2)

9.998581359058022%, 2.588167905807495 s.
19.997162718116044%, 5.440074682235718 s.
29.99574407717407%, 8.47719144821167 s.
39.99432543623209%, 12.176542282104492 s.
49.99290679529011%, 14.555184602737427 s.
59.99148815434814%, 16.225221157073975 s.
69.99006951340616%, 19.224300146102905 s.
79.98865087246418%, 21.647841215133667 s.
89.9872322315222%, 24.463589668273926 s.
99.98581359058022%, 27.16572380065918 s.


In [22]:
editd.shape

(266, 266)

In [21]:
paragraph_vec = [model.infer_vector(i.words) for i in compressed_corpus]
paragraphd = pairwise.euclidean_distances(paragraph_vec)
paragraphd.shape

(266, 266)

In [23]:
def semantic_dist(s1, s2):
    return od.edit_distance2(s1, s2, subst_cost=distance_calculator.compressed_words_dist)
eidt_semantic_d = od.distance_matrix(compressed_corpus, semantic_dist)

9.998581359058022%, 15.137603282928467 s.
19.997162718116044%, 32.273330211639404 s.
29.99574407717407%, 50.40892839431763 s.
39.99432543623209%, 72.46771383285522 s.
49.99290679529011%, 86.77981281280518 s.
59.99148815434814%, 96.73048949241638 s.
69.99006951340616%, 114.84463334083557 s.
79.98865087246418%, 129.67566847801208 s.
89.9872322315222%, 146.49566459655762 s.
99.98581359058022%, 162.23114609718323 s.


In [105]:
results = dict()

In [106]:
results['wmd'] =  od.dbscan(wmd,eps=0.4, min_samples=4)
results['wmd'][1]

46

In [107]:
results['editd'] = od.dbscan(editd, eps=11, min_samples=4)
results['editd'][1]

25

In [None]:
# results['edit_struct_d'] = od.

In [108]:
results['paragraphd'] = od.dbscan(paragraphd, eps = 1.4, min_samples=4)
results['paragraphd'][1]

14

In [109]:
results['eidt_semantic_d'] = od.dbscan(eidt_semantic_d, eps=11, min_samples=4)
results['eidt_semantic_d'][1]

44

In [110]:
def predicted_for_group(group, predicted):
    rv = defaultdict(int)
    for g in group:
        for c, points in predicted.items():
            if g in points:
                rv[c] += 1
    return rv

def calc_accuracy(predicted, true):
    total = sum([len(i) for i in predicted.values()])     
    good = sum([max(predicted_for_group(g, predicted).values() or [0]) for g in true])
    return good / float(total)

In [111]:
extended = dict()
for k, v in results.items():
    x = dict()
    x['labels'] = v[0]
    x['noise'] = len([i for i in v[0] if i == -1])
    x['clusters_n'] = v[1]
    x['clusters'] = od.labels_to_clusters(v[0])
    x['accuracy'] = calc_accuracy(x['clusters'], groups)
    x['homogeneity'] = metrics.homogeneity_score(true_labels, v[0])
    x['completeness'] = metrics.completeness_score(true_labels, v[0])
    x['fowlkes_mallows_score'] = metrics.fowlkes_mallows_score(true_labels, v[0])
    extended[k] = x

In [113]:
for k, v in extended.items():
    print(k, v['noise'], v['accuracy'], v['homogeneity'], v['completeness'], v['fowlkes_mallows_score'])

wmd 0 1.0 0.967790555983 1.0 0.892954881868
editd 97 1.0 0.655190461119 0.982837203807 0.318721687051
paragraphd 101 0.9878787878787879 0.418371028308 0.938122116128 0.224627877806
eidt_semantic_d 4 1.0 0.960108677984 1.0 0.873627026459
