In [1]:
import requests
import re
import time
import numpy as np
import pickle
from datetime import timedelta
from crashsimilarity.downloader import SocorroDownloader
from crashsimilarity import utils
from crashsimilarity.distance import DistanceCalculator
from gensim.models import doc2vec
import crashsimilarity.tmp as tmp
import logging

In [15]:
signature1 = 'mozilla::net::nsHttpConnection::CloseConnectionFastOpenTakesTooLongOrError'
signature2 = 'OOM | large | js::AutoEnterOOMUnsafeRegion::crash | js::AutoEnterOOMUnsafeRegion::crash | js::TenuringTracer::moveToTenured'

In [6]:
def download_stack_traces(sigs, traces_num=1):
    from_date = utils.utc_today() - timedelta(days=360)
    if isinstance(sigs, str):
        sigs = [sigs]
    params = {'signature': ['^'+i for i in sigs],
              'date': ['>=' + str(from_date)],
              '_facets': ['proto_signature'],
              '_facets_size': traces_num
             }
    _SUPER_SEARCH_URL = 'https://crash-stats.mozilla.com/api/SuperSearch'
    r = requests.get(_SUPER_SEARCH_URL, params)
    return r.json()['facets']['proto_signature']

In [20]:
# traces1 = download_stack_traces(signature1, traces_num=1000)
traces2 = download_stack_traces(signature2, traces_num=1000)

In [28]:
traces2 = download_stack_traces(signature2, traces_num=1000)

In [29]:
traces2 = set([i['term'] for i in traces2])
corpus = []
for x in traces2:
    words = utils.StackTraceProcessor.preprocess(x)
    if words[0] == '@0x':
        words = words[1:]
    corpus.append(words)

In [31]:
vocab = pickle.load(open('data/compressed_vocab.pickle', 'rb'))
len(vocab)

222895

In [33]:
compressed_corpus = [[str(vocab.get(i, i)) for i in c] for c in corpus]
compressed_corpus = [doc2vec.TaggedDocument(trace, [i]) for i, trace in enumerate(compressed_corpus)]


In [35]:
def edit_distance(s1, s2, dist):
    if len(s1) < len(s2):
        return edit_distance(s2, s1, dist)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            d = [dist(s1[i], s2[j]), #insert
                 dist(s2[j], s2[j-1]), #del
                 dist(c1, c2)] #subst
            d = [2 if i < 0 else i for i in d]
            insertions = previous_row[j + 1] + d[0]
            deletions = current_row[j] + d[1]      
            substitutions = previous_row[j] + (c1 != c2) * d[2]
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

In [38]:
def edit_distance_matrix(corpus, calculator, prog=10):
    dist = np.zeros((len(corpus), len(corpus)), dtype=np.double)
    idx = []
    for i in range(len(corpus)):
        for j in range(i+1, len(corpus)):
            idx.append((i, j))
    say = len(idx) // prog
    t = time.time()
    for s, (i, j) in enumerate(idx):
        if s and s % say == 0:
            print('{}%, {} s.'.format(s / (len(idx) * 0.01), time.time() - t))
        doc1 = corpus[i].words
        doc2 = corpus[j].words
        dist[i, j] = dist[j, i] = calculator(doc1, doc2)
    return dist

In [42]:
# def calculator(s1, s2):
#     return edit_distance(s1, s2, lambda a, b: 1)
# no_coff_edit_dist_mat = edit_distance_matrix(compressed_corpus, calculator)
# pickle.dump(no_coff_edit_dist_mat, open('data/silhouette_one_signature/sig_2_edit_no_coff.pickle', 'wb'))

In [44]:
model = doc2vec.Doc2Vec.load('data/model/dm_d200_all.model')
model

INFO:gensim.utils:loading Doc2Vec object from data/model/dm_d200_all.model
INFO:gensim.utils:loading wv recursively from data/model/dm_d200_all.model.wv.* with mmap=None
INFO:gensim.utils:loading syn0 from data/model/dm_d200_all.model.wv.syn0.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:loading docvecs recursively from data/model/dm_d200_all.model.docvecs.* with mmap=None
INFO:gensim.utils:loading doctag_syn0 from data/model/dm_d200_all.model.docvecs.doctag_syn0.npy with mmap=None
INFO:gensim.utils:loading syn1neg from data/model/dm_d200_all.model.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded data/model/dm_d200_all.model


<gensim.models.doc2vec.Doc2Vec at 0x7efbdcd5fd30>

In [45]:
dist, w2pos =  DistanceCalculator.words_distance(compressed_corpus, model)
distance_calculator = DistanceCalculator(model, w2pos, dist)
pos2w = dict([(i[1], i[0]) for i in w2pos.items()])
pos2vocab = dict([(i[1], i[0]) for i in vocab.items()])

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


len(words) = 681


In [49]:
# def my_edit_distance(s1, s2):
#     return edit_distance(s1, s2, distance_calculator.compressed_words_dist)
# coff_edit_dist_mat = edit_distance_matrix(compressed_corpus, my_edit_distance)
# pickle.dump(coff_edit_dist_mat, open('data/silhouette_one_signature/sig_2_edit_coff.pickle', 'wb'))

In [51]:
coff_edit_dist_mat.shape

(1000, 1000)

In [91]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

db = DBSCAN(eps = 5, min_samples=5, metric='precomputed').fit(coff_edit_dist_mat)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
metrics.silhouette_score(coff_edit_dist_mat, labels, metric='euclidean')

Estimated number of clusters: 15


0.25065416365025067

In [111]:
db2 = DBSCAN(eps = 5, min_samples=3, metric='precomputed').fit(no_coff_edit_dist_mat)
labels2 = db2.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters2_ = len(set(labels2)) - (1 if -1 in labels2 else 0)

print('Estimated number of clusters: %d' % n_clusters2_)
metrics.silhouette_score(no_coff_edit_dist_mat, labels2, metric='euclidean')

Estimated number of clusters: 28


0.12032512438051259

In [92]:
def wmd_mat(corpus, calculator, prog=10):
    wmd_dist = np.zeros((len(corpus), len(corpus)), dtype=np.double)
    idx = []
    for i in range(len(corpus)):
        for j in range(i+1, len(corpus)):
            idx.append((i, j))
    say = len(idx) // prog
    t = time.time()
    for s, (i, j) in enumerate(idx):
        if s and s % say == 0:
            print('{}%, {} s.'.format(s / (len(idx) * 0.01), time.time() - t))
        doc1 = corpus[i].words
        doc2 = corpus[j].words
        wmd_dist[i, j] = wmd_dist[j, i] = calculator.fast_wmd(doc1, doc2)
    return wmd_dist

In [93]:
wmd_dist_mat = wmd_mat(compressed_corpus, distance_calculator)

10.0%, 66.15560245513916 s.
20.0%, 127.54200649261475 s.
30.0%, 198.4199869632721 s.
40.0%, 260.1379313468933 s.
50.0%, 322.2272343635559 s.
60.0%, 381.7083694934845 s.
70.0%, 445.0794062614441 s.
80.0%, 507.2526469230652 s.
90.0%, 574.6762454509735 s.


In [105]:
db3 = DBSCAN(min_samples=5, metric='precomputed').fit(wmd_dist_mat)
labels3 = db3.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters3_ = len(set(labels3)) - (1 if -1 in labels3 else 0)

print('Estimated number of clusters: %d' % n_clusters3_)
metrics.silhouette_score(no_coff_edit_dist_mat, labels3, metric='euclidean')

Estimated number of clusters: 6


-0.026525818282212354

In [110]:
labels_true = [0, 0, 0, 1, 1, 1, 2, 2, 2]
labels_pred = [0, 0, 1, 1, 1, 2, 2, 2, 2]
metrics.fowlkes_mallows_score(labels_true, labels_pred)

0.52704627669472992