In [32]:
import time
import pickle
import random
import math
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

from gensim.models import doc2vec

from crashsimilarity import other_distances as od
from crashsimilarity import tmp
from crashsimilarity.generator import Generator
from crashsimilarity.distance import DistanceCalculator

from sklearn import metrics
from sklearn.metrics import pairwise

In [2]:
vocab = pickle.load(open('data/compressed_vocab.pickle', 'rb'))
pos2vocab = dict([(i[1], i[0]) for i in vocab.items()])
len(vocab)

222895

In [3]:
model = doc2vec.Doc2Vec.load('data/new-model/model_dm_200_30_days.model')
model

INFO:gensim.utils:loading Doc2Vec object from data/new-model/model_dm_200_30_days.model
INFO:gensim.utils:loading wv recursively from data/new-model/model_dm_200_30_days.model.wv.* with mmap=None
INFO:gensim.utils:loading syn0 from data/new-model/model_dm_200_30_days.model.wv.syn0.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:loading docvecs recursively from data/new-model/model_dm_200_30_days.model.docvecs.* with mmap=None
INFO:gensim.utils:loading doctag_syn0 from data/new-model/model_dm_200_30_days.model.docvecs.doctag_syn0.npy with mmap=None
INFO:gensim.utils:loading syn1neg from data/new-model/model_dm_200_30_days.model.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded data/new-model/model_dm_200_30_days.model


<gensim.models.doc2vec.Doc2Vec at 0x7f715f3104a8>

In [4]:
multiple_sig = pickle.load(open('data/multiple_sig_last_year.pickle', 'rb'))
prepared = [sig['prepared'] for sig in multiple_sig]
len(multiple_sig)

261

In [5]:
corpus = []
for p in prepared:
    for t in p:
        corpus.append(t)
candidates = []
for c in corpus:
    if any(['@0x' in t for t in c]):
        continue
    qq = set(c)
    if len(qq) > len(c) / 2.0 and len(c) > 10 and c not in candidates:
        candidates.append(c)
len(candidates)

221

In [7]:
random.shuffle(candidates)

In [8]:
len(candidates)

221

In [9]:
centers = 50
generator = Generator(model, vocab, pos2vocab)
clusters = []
for i in range(centers):
    cnt = random.randint(4, 7)
    clusters.append(generator.generate(candidates[i], cnt))

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


In [10]:
groups = []
corpus = []
true_labels = []
for p in clusters:
    group = []
    for t in p:
        corpus.append(t)
        group.append(len(corpus)-1)
        true_labels.append(len(groups))
    groups.append(group)
len(groups), len(corpus)

(50, 270)

In [11]:
compressed_corpus = od.compressed_tagged_corpus(corpus, vocab)

In [12]:
dist, w2pos =  DistanceCalculator.words_distance(compressed_corpus, model)
distance_calculator = DistanceCalculator(model, w2pos, dist)
pos2w = dict([(i[1], i[0]) for i in w2pos.items()])
pos2vocab = dict([(i[1], i[0]) for i in vocab.items()])

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


len(words) = 955


In [13]:
wmd = od.distance_matrix(compressed_corpus, distance_calculator.fast_wmd)

9.998623158474459%, 16.898571491241455 s.
19.997246316948917%, 31.74764132499695 s.
29.995869475423376%, 45.916672229766846 s.
39.994492633897835%, 56.890480756759644 s.
49.99311579237229%, 74.57357168197632 s.
59.99173895084675%, 90.20590877532959 s.
69.99036210932121%, 102.02522373199463 s.
79.98898526779567%, 114.31541061401367 s.
89.98760842627013%, 125.70089912414551 s.
99.98623158474459%, 135.86778044700623 s.


In [14]:
editd = od.distance_matrix(compressed_corpus, od.edit_distance2)

9.998623158474459%, 3.380789041519165 s.
19.997246316948917%, 6.315523147583008 s.
29.995869475423376%, 9.061351537704468 s.
39.994492633897835%, 11.08815050125122 s.
49.99311579237229%, 14.399194478988647 s.
59.99173895084675%, 17.82077932357788 s.
69.99036210932121%, 20.33037567138672 s.
79.98898526779567%, 23.193928241729736 s.
89.98760842627013%, 25.684646368026733 s.
99.98623158474459%, 27.95629620552063 s.


In [15]:
editd.shape

(270, 270)

In [16]:
paragraph_vec = [model.infer_vector(i.words) for i in compressed_corpus]
paragraphd = pairwise.euclidean_distances(paragraph_vec)
paragraphd.shape

(270, 270)

In [17]:
def semantic_dist(s1, s2):
    return od.edit_distance2(s1, s2, subst_cost=distance_calculator.compressed_words_dist)
eidt_semantic_d = od.distance_matrix(compressed_corpus, semantic_dist)

9.998623158474459%, 13.883564472198486 s.
19.997246316948917%, 25.795589447021484 s.
29.995869475423376%, 36.806708097457886 s.
39.994492633897835%, 45.530301094055176 s.
49.99311579237229%, 59.40377759933472 s.
59.99173895084675%, 73.67054176330566 s.
69.99036210932121%, 83.83214068412781 s.
79.98898526779567%, 96.52167773246765 s.
89.98760842627013%, 106.95459794998169 s.
99.98623158474459%, 116.74296975135803 s.


In [18]:
results = dict()

In [19]:
results['wmd'] =  od.dbscan(wmd,eps=0.4, min_samples=4)
results['wmd'][1]

48

In [20]:
results['editd'] = od.dbscan(editd, eps=11, min_samples=4)
results['editd'][1]

27

In [None]:
# results['edit_struct_d'] = od.

In [41]:
results['paragraphd'] = od.dbscan(paragraphd, eps = 1.4, min_samples=4)
results['paragraphd'][1]

14

In [42]:
results['eidt_semantic_d'] = od.dbscan(eidt_semantic_d, eps=11, min_samples=4)
results['eidt_semantic_d'][1]

44

In [34]:
def true_positive(labels_true, labels_pred):
    ans = set()
    for i, a in enumerate(labels_true):
        for j, b in enumerate(labels_true):
            if j <= i:
                continue
            if a == b and labels_pred[i] == labels_pred[j] and labels_pred[i] != -1:
                ans.add((min(i,j), max(i,j)))
    return len(ans)

def false_positive(labels_true, labels_pred):
    ans = set()
    for i, a in enumerate(labels_true):
        for j, b in enumerate(labels_true):
            if j <= i:
                continue
            if a == b and labels_pred[i] != labels_pred[j] and labels_pred[i] != -1:
                ans.add((min(i,j), max(i,j)))
    return len(ans)
    
def false_negative(labels_true, labels_pred):
    ans = set()
    for i, a in enumerate(labels_true):
        for j, b in enumerate(labels_true):
            if j <= i:
                continue
            if a != b and labels_pred[i] == labels_pred[j] and labels_pred[i] != -1:
                ans.add((min(i,j), max(i,j)))
    return len(ans)

def my_precision(labels_true, labels_pred):
    return float(true_positive(labels_true, labels_pred)) / (true_positive(labels_true, labels_pred) + false_positive(labels_true, labels_pred))

def my_recall(labels_true, labels_pred):
    return float(true_positive(labels_true, labels_pred)) / (true_positive(labels_true, labels_pred) + false_negative(labels_true, labels_pred))

def my_FMI(labels_pred, labels_true):
    tp = true_positive(labels_pred, labels_true)
    fp = false_positive(labels_pred, labels_true)
    fn = false_negative(labels_pred, labels_true)
    return float(tp) / math.sqrt((tp+fp) * (tp+fn))

def purity(labels_pred, labels_true):
    

In [38]:
def predicted_for_group(group, predicted):
    rv = defaultdict(int)
    for g in group:
        for c, points in predicted.items():
            if g in points:
                rv[c] += 1
    return rv

def calc_accuracy(predicted, true):
    total = sum([len(i) for i in predicted.values()])     
    good = sum([max(predicted_for_group(g, predicted).values() or [0]) for g in true])
    return good / float(total)

In [50]:
extended = dict()
for k, v in results.items():
    x = dict()
    x['labels'] = v[0]
    x['noise'] = len([i for i in v[0] if i == -1])
    x['clusters_n'] = v[1]
    x['clusters'] = od.labels_to_clusters(v[0])
    # x['accuracy'] = calc_accuracy(x['clusters'], groups)
    # x['homogeneity'] = metrics.homogeneity_score(true_labels, v[0])
    # x['completeness'] = metrics.completeness_score(true_labels, v[0])
    x['precision'] = my_precision(v[0], true_labels)
    x['recall'] = my_recall(v[0], true_labels)
    x['my_fmi'] = my_FMI(v[0], true_labels)
    x['fowlkes_mallows_score'] = metrics.fowlkes_mallows_score(true_labels, v[0])
    extended[k] = x

In [51]:
for k, v in extended.items():
    print(k, v['noise'], v['precision'], v['recall'], v['fowlkes_mallows_score'])

wmd 0 0.9237536656891495 1.0 0.961121046325
editd 86 0.1330715532286213 0.9682539682539683 0.358952725962
paragraphd 89 0.07867667500695023 0.8984126984126984 0.265864860211
eidt_semantic_d 1 0.7652811735941321 0.9936507936507937 0.872021929488
