In [593]:
import requests
import re
import time
import numpy as np
from scipy.sparse import lil_matrix
from sklearn.metrics import pairwise 
import pickle
from datetime import timedelta
from crashsimilarity.downloader import SocorroDownloader
from crashsimilarity import utils
from gensim.models import doc2vec
from gensim import corpora
import crashsimilarity.tmp as tmp
import logging

In [479]:
from pyemd import emd
import pyximport
pyximport.install()

(None, <pyximport.pyximport.PyxImporter at 0x7f78f7b27160>)

In [22]:
# Endpoint of the Bugzilla REST bug search; query parameters are appended after the '?'.
# NOTE(review): no other cell visible in this notebook references BASE_URL.
BASE_URL = 'https://bugzilla.mozilla.org/rest/bug?'
# url = 'https://bugzilla.mozilla.org/rest/bug?product=Firefox&version=51 branch&limit=1000'

In [49]:
# Bug search query: requests only the id, summary, status, cf_crash_signature and see_also
# fields, for the Core and Firefox products, with the f1/o1 and f2/o2 filter pairs set to
# "cf_crash_signature isnotempty" and "see_also isnotempty".
url1 = 'https://bugzilla.mozilla.org/rest/bug?include_fields=id,summary,status,cf_crash_signature,see_also&f1=cf_crash_signature&f2=see_also&o1=isnotempty&o2=isnotempty&product=Core&product=Firefox'

In [50]:
# Run the search; the bare `r1` shows the response object as the cell output.
# NOTE(review): no timeout is passed and the HTTP status is not checked here.
r1 = requests.get(url1)
r1

<Response [200]>

In [54]:
# Decode the JSON body and keep the list stored under its 'bugs' key.
data = r1.json()
bugs = data['bugs']
# NOTE(review): the recorded count is exactly 10000, which looks like a server-side
# result cap — confirm the result set is complete.
len(bugs)

10000

In [222]:
# Bugs whose see_also list is non-empty and has at least one entry
# containing the text 'bugzilla.mozilla'.
see_also = [
    bug for bug in bugs
    if bug['see_also']
    and any('bugzilla.mozilla' in link for link in bug['see_also'])
]
len(see_also)

69

In [160]:
def clean_signatures(sigs):
    """Split a crash-signature field into its unique, cleaned signatures.

    The field is split on '\\r\\n'. For every piece: surrounding '[', ']' and
    spaces are stripped, everything from the first '(' to the last ')' is
    removed, a leading '@ ' is dropped and remaining whitespace is trimmed.

    Pieces that end up empty (e.g. produced by a trailing line break) are
    discarded so they are not counted as signatures, and duplicates are
    removed while keeping first-seen order, so the result is the same on
    every run.

    :param sigs: raw signature field, e.g. '[@ foo]\\r\\n[@ bar(int)]'
    :return: list of unique non-empty signature strings
    """
    cleaned = []
    seen = set()
    for raw in sigs.split('\r\n'):
        sig = re.sub(r'\(.*\)', '', raw.strip('[] '))
        if sig.startswith('@ '):
            sig = sig[2:]
        sig = sig.strip()
        # Skip blanks and repeats; a blank entry is not a real signature.
        if sig and sig not in seen:
            seen.add(sig)
            cleaned.append(sig)
    return cleaned
# Keep the bugs that carry more than one distinct cleaned signature and store
# that list on the bug dict under 'clean'. Each field is cleaned only once.
multiple_sig = []
for bug in bugs:
    cleaned = clean_signatures(bug['cf_crash_signature'])
    if len(cleaned) > 1:
        bug['clean'] = cleaned
        multiple_sig.append(bug)
len(multiple_sig)

686

In [97]:
# Reorder the bugs in place so those with the most cleaned signatures come first.
multiple_sig.sort(key=lambda x: len(x['clean']), reverse=True)

In [171]:
# For every bug, try to download one stack trace per cleaned signature.
# traces[k] holds the traces found for multiple_sig[k] (possibly an empty list).
traces = []
prev_t = time.time()
for i, bug in enumerate(multiple_sig):
    # Progress report every 100 bugs, with the wall time of the previous batch.
    if i % 100 == 0:
        print('{}, time spent for last 100: {} (s)'.format(i, time.time() - prev_t))
        prev_t = time.time()
    cur = []
    for sig in bug['clean']:
        try:
            found = list(SocorroDownloader().download_stack_traces_for_signature(sig, traces_num=1, period=timedelta(days=350)))
        except Exception as err:
            # Best effort: a failed download skips this signature, but it is
            # reported, and KeyboardInterrupt is no longer swallowed.
            logging.warning('could not download a trace for %r: %s', sig, err)
            continue
        # An empty result simply means no trace for this signature.
        r = found[0] if found else None
        if r:
            cur.append(r)
    traces.append(cur)

0, time spent for last 100: 0.00019550323486328125 (s)
100, time spent for last 100: 318.5518651008606 (s)
200, time spent for last 100: 400.7205352783203 (s)
300, time spent for last 100: 450.31249618530273 (s)
400, time spent for last 100: 383.8052191734314 (s)
500, time spent for last 100: 537.849912405014 (s)
600, time spent for last 100: 615.417822599411 (s)


In [190]:
# Run every downloaded trace through StackTraceProcessor.preprocess,
# keeping the one-list-per-bug grouping of `traces`.
prepared = [
    [utils.StackTraceProcessor.preprocess(trace) for trace in bug_traces]
    for bug_traces in traces
]

In [198]:
# Store each preprocessed group on the bug dict at the same position, under
# 'prepared' (mutates the dicts in multiple_sig in place).
for i, p in enumerate(prepared):
    multiple_sig[i]['prepared'] = p

In [4]:
# Load the pickled vocabulary; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('data/compressed_vocab.pickle', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
len(vocab)

222895

In [2]:
# pickle.dump(multiple_sig, open('data/multiple_sig.pickle', 'wb'))
# Reload the cached bug list instead of repeating the download; the context
# manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('data/multiple_sig.pickle', 'rb') as multiple_sig_file:
    multiple_sig = pickle.load(multiple_sig_file)
prepared = [sig['prepared'] for sig in multiple_sig]
len(multiple_sig)

686

In [244]:
# Flatten the per-bug trace lists into one corpus; groups[k] lists the corpus
# indices of the traces that belong to the k-th bug.
groups, corpus = [], []
for p in prepared:
    first = len(corpus)
    corpus.extend(p)
    group = list(range(first, len(corpus)))
    groups.append(group)
# Map every frame through vocab (frames missing from vocab are kept as-is),
# stringify it, and tag each trace with its corpus index.
compressed_corpus = [
    doc2vec.TaggedDocument([str(vocab.get(frame, frame)) for frame in trace], [tag])
    for tag, trace in enumerate(corpus)
]

In [6]:
compressed_corpus[42]

TaggedDocument(words=['2595', '571', '1510', '348', '349', '3443', '11904', '2787', '348', '572', '573'], tags=[42])

In [7]:
# Load a previously saved Doc2Vec model from disk.
# NOTE(review): the file name suggests a 200-dimensional DM model — confirm.
model = doc2vec.Doc2Vec.load('data/model/dm_d200_all.model')
model

<gensim.models.doc2vec.Doc2Vec at 0x7f7969e84cc0>

In [8]:
# Number of bugs that have more than two prepared traces.
sum(1 for traces_of_bug in prepared if len(traces_of_bug) > 2)

95

In [698]:
def fast_rwmd_distances(model, corpus, dist, w2pos, idx):
    """Score every document in `corpus` against document `idx`.

    For a pair of documents the pairwise word-distance sub-matrix is taken
    from `dist`; the score is the larger of (sum of column minima) and
    (sum of row minima). If either document has no word with a position in
    `w2pos`, the score is infinity.

    :param model: container supporting `word in model`; query words not in
        it are ignored
    :param corpus: sequence of documents exposing a `.words` list
    :param dist: 2-D matrix of word distances indexed by `w2pos` positions
    :param w2pos: mapping word -> row/column position in `dist`
    :param idx: index in `corpus` of the query document
    :return: list of `(document_index, score)` tuples in corpus order
    """
    # Positions of the query words; computed once since it does not depend on the loop.
    idx_poss = [w2pos[w] for w in corpus[idx].words if w in model and w in w2pos]
    dist = np.asarray(dist)  # no copy when dist is already an ndarray
    distances = []
    for ind, doc in enumerate(corpus):
        words_pos = [w2pos[w] for w in doc.words if w in w2pos]
        if words_pos and idx_poss:
            # One vectorised lookup replaces the element-by-element copy;
            # rows follow the document's words, columns the query's words.
            s_dists = dist[np.ix_(words_pos, idx_poss)].astype(np.double)
            rwmd = max(np.sum(np.min(s_dists, axis=0)), np.sum(np.min(s_dists, axis=1)))
        else:
            rwmd = float('inf')
        distances.append((ind, rwmd))
    return distances

In [699]:
def create_distance_matrix(model, dictionary, all_distances):
    """Build the distance matrix for the tokens of `dictionary`.

    `dictionary.items()` yields `(token_id, token)` pairs where each token is
    the string form of an index into `all_distances`. Cell `[a, b]` of the
    result is `all_distances[int(token_a), int(token_b)]`.

    :param model: unused
    :param dictionary: mapping token_id -> token string
    :param all_distances: 2-D array indexed by the integer value of the tokens
    :return: square `np.double` array with one row/column per dictionary entry
    """
    size = len(dictionary)
    distances = np.zeros((size, size), dtype=np.double)

    entries = list(dictionary.items())
    for row_id, row_token in entries:
        row_pos = int(row_token)
        for col_id, col_token in entries:
            distances[row_id, col_id] = all_distances[row_pos, int(col_token)]

    return distances


def fast_wmdistance(model, words1, words2, w2pos, dist):
    """Earth mover's distance between two documents using precomputed word distances.

    Words that are not in `model` or not in `w2pos` are dropped; the rest are
    replaced by the string form of their position in `dist`. Each document
    becomes a normalised bag of words over the joint vocabulary and `emd` is
    evaluated with the matching sub-matrix of `dist`.

    :param model: container supporting `word in model`
    :param words1: list of words of the first document
    :param words2: list of words of the second document
    :param w2pos: mapping word -> row/column position in `dist`
    :param dist: 2-D array of word distances indexed by `w2pos` positions
    :return: `inf` if either document has no usable word, `0.0` if both
        documents consist of one and the same word, otherwise the `emd` result
    """
    words1 = [str(w2pos[w]) for w in words1 if w in model and w in w2pos]
    words2 = [str(w2pos[w]) for w in words2 if w in model and w in w2pos]

    # Guard BOTH documents (the second one used to go unchecked), and do it
    # before any dictionary is built.
    if len(words1) == 0 or len(words2) == 0:
        return np.double('inf')

    # A joint vocabulary of a single word means the documents are identical.
    if len(set(words1) | set(words2)) == 1:
        return 0.0

    dictionary = corpora.Dictionary(documents=[words1, words2])
    vocab_len = len(dictionary)

    # create bag of words from document
    def create_bow(doc):
        norm_bow = np.zeros(vocab_len, dtype=np.double)
        bow = dictionary.doc2bow(doc)

        for idx, count in bow:
            # relative frequency of the token within the document
            norm_bow[idx] = count / float(len(doc))

        return norm_bow

    bow1 = create_bow(words1)
    bow2 = create_bow(words2)

    distances = create_distance_matrix(model, dictionary, dist)

    return emd(bow1, bow2, distances)

In [644]:
def words_distance(corpus, model):
    """Compute pairwise Euclidean distances between the corpus words known to `model`.

    Calls `model.init_sims(replace=True)` first, so the model object passed in
    is modified. Prints the number of words kept.

    :param corpus: iterable of documents exposing a `.words` list
    :param model: trained model providing `init_sims`, `wv.vocab` and
        `wv.syn0norm`, and supporting `word in model`
    :return: `(dist, w2pos)` where `w2pos` maps each kept word to its
        row/column in the square matrix `dist`
    """
    model.init_sims(replace=True)
    seen = {token for trace in corpus for token in trace.words}
    words = [token for token in seen if token in model]
    print ('len(words) = {}'.format(len(words)))
    w2pos = {token: position for position, token in enumerate(words)}
    # Normalised vector of every kept word, in the same order as `words`.
    vector_rows = [model.wv.syn0norm[model.wv.vocab[token].index] for token in words]
    dist = pairwise.euclidean_distances(np.array(vector_rows))
    return dist, w2pos

In [656]:
dist, w2pos = words_distance(compressed_corpus, model)

len(words) = 4103


In [646]:
# Groups with at least five traces (the test is >= 5, despite the 'more5' name).
group_more5 = [g for g in groups if len(g) >= 5]
len(group_more5)

26

In [700]:
model.wmdistance(compressed_corpus[551].words, compressed_corpus[552].words)

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(25 unique tokens: ['973', '3608', '1072', '11832', '11833']...) from 2 documents (total 46 corpus positions)


0.04838442680510924

In [703]:
fast_wmdistance(model, compressed_corpus[551].words, compressed_corpus[552].words, w2pos, dist)

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(25 unique tokens: ['3896', '2295', '915', '1467', '2123']...) from 2 documents (total 46 corpus positions)


0.04838442680510924

In [489]:
# Inverse lookups: position -> word, and vocab value -> vocab key.
pos2w = {position: word for word, position in w2pos.items()}
pos2vocab = {value: key for key, value in vocab.items()}

In [741]:
def group_score(group, dist):
    """Rank-based score of how close the members of `group` are to each other.

    For every member, all documents are ordered by ascending distance from
    that member; the ranks at which the other group members appear are
    collected and summarised.

    The ranking is taken from the `dist` argument (it used to be ignored in
    favour of a global variable).

    :param group: list of document indices that belong together
    :param dist: indexable by document index; `dist[i]` is a list of
        `(document_index, distance)` pairs
    :return: one `(mean_rank, max_rank)` tuple per member, in `group` order
    """
    score = []
    for member in group:
        ranking = [x[0] for x in sorted(dist[member], key=lambda x: x[1])]
        others = [x for x in group if x != member]
        pos = [ranking.index(o) for o in others]
        score.append((np.mean(pos), np.max(pos)))
    return score

In [394]:
# Order the entries of dist[551] by ascending value and list the members of g
# other than its first one.
# NOTE(review): `g` is never assigned in the cells visible here — it must come
# from earlier kernel state.
closest = sorted(list(enumerate(dist[551])), key=lambda x:x[1])
others = [x for x in g if x != g[0]]

In [713]:
# Time fast_wmdistance from the first member of g to every document.
# Logging is silenced while the loop runs; the try/finally makes sure the
# INFO level is set again even if a distance computation raises.
logging.root.setLevel(logging.CRITICAL)
try:
    t = time.time()
    res = []
    idx = g[0]
    for i in range(len(compressed_corpus)):
        d = fast_wmdistance(model, compressed_corpus[idx].words, compressed_corpus[i].words, w2pos, dist)
        res.append(d)
    print (time.time() - t)
finally:
    logging.root.setLevel(logging.INFO)

2.2254691123962402


In [711]:
closest = sorted(rwmd_all[g[0]], key=lambda x:x[1])

In [715]:
closest_wmd = sorted(list(enumerate(res)), key=lambda x:x[1])

In [716]:
closest_wmd[:10]

[(551, 0.0),
 (552, 0.04838442680510924),
 (555, 0.1415106299697912),
 (554, 0.2816956092271322),
 (553, 0.3483668778315673),
 (922, 0.45063622904741113),
 (923, 0.4530710519116199),
 (531, 0.5792857161297745),
 (909, 0.753549104651525),
 (252, 0.7541491725111394)]

In [731]:
closest = [[x[0] for x in sorted(rwmd_all[i], key=lambda x:x[1])] for i in g]

In [738]:
# Inline version of the group scoring for g: for each member, order all
# documents by ascending distance and record the mean and max rank of the
# other members.
closest = [[x[0] for x in sorted(rwmd_all[i], key=lambda x:x[1])] for i in g]
score = []
for i, c in enumerate(closest):
    others = [x for x in g if x != g[i]]
    pos = [c.index(o) for o in others]
    score.append((np.mean(pos), np.max(pos)))
# A top-level `return` is a SyntaxError; end the cell with the value instead.
score

In [743]:
group_score(g, rwmd_all)

[(2.5, 4), (2.5, 4), (3.0, 6), (2.5, 4), (2.5, 4)]

In [739]:
group

[(2.5, 4), (2.5, 4), (3.0, 6), (2.5, 4), (2.5, 4)]

In [719]:
group_score(g, rwmd_all)

1.6000000000000001

In [708]:
g

[551, 552, 553, 554, 555]

In [707]:
# rwmd_all[k] is the list of (document_index, score) pairs for document k
# against the whole corpus; progress is printed every 200 documents.
rwmd_all = []
for i in range(len(compressed_corpus)):
    if i % 200 == 0:
        print (i)
    rwmd_all.append(fast_rwmd_distances(model, compressed_corpus, dist, w2pos, i))

0
200
400
600
800
1000


In [671]:
# logging.root.setLevel(logging.CRITICAL)
# t = time.time()
# res2 = []
# idx = gg[0]
# for i in range(len(compressed_corpus)):
#     if i % 200 == 0:
#         print (i)
#         print ('time: {}'.format(time.time() - t))
#     d = model.wmdistance(compressed_corpus[idx].words, compressed_corpus[i].words)
#     res2.append(d)
# print (time.time() - t)
# logging.root.setLevel(logging.INFO)

0
time: 0.0005135536193847656
200
time: 1.484095573425293
400
time: 3.3038580417633057
600
time: 4.932237148284912
800
time: 6.652534008026123
1000
time: 8.349512577056885
8.72649621963501


In [460]:
# Pair every entry of `dist` with its index, order by the entry, show the first ten.
# NOTE(review): the recorded output shows scalar values, so `dist` was presumably
# a 1-D vector of distances when this ran, not the 2-D matrix returned by
# words_distance — confirm before re-running.
di = list(enumerate(dist))
di.sort(key=lambda x: x[1])
di[:10]

[(316, 0.0),
 (323, 0.0),
 (324, 0.36658781827409354),
 (318, 0.5223399069855663),
 (322, 0.5613620322809928),
 (320, 0.563557141105309),
 (321, 0.7237989452527461),
 (530, 0.8569835854804715),
 (534, 0.8577134940988614),
 (536, 0.8745012259783078)]

In [462]:
gg = group_more5[4]
others = gg[1:]
gg

[316, 317, 318, 319, 320, 321, 322, 323, 324]

In [447]:
g

[551, 552, 553, 554, 555]

In [365]:
others

[552, 553, 554, 555]

In [395]:
len(closest)

4103

In [370]:
closest.index(554)

1752

In [362]:
# For each member of g: order all indices by ascending dist[gg], drop the first
# entry (presumably the member itself), and list the other members.
# NOTE(review): `score` is initialised but nothing is ever appended to it — the
# cell looks unfinished. The loop also rebinds the notebook-level name `gg`.
score = []
for gg in g:
    closest = [x[0] for x in sorted(list(enumerate(dist[gg])), key=lambda x:x[1])]
    closest = closest[1:]
    others = [x for x in g if x != gg]
    

In [458]:
gg = group_more5[4]
others = gg[1:]

In [335]:
print(np.array_str(g_d, precision=3, suppress_small=True))

[[-0.     0.885  0.969  0.983  1.021]
 [ 0.885  0.     1.055  1.152  1.164]
 [ 0.969  1.055 -0.     1.037  1.043]
 [ 0.983  1.152  1.037 -0.     0.788]
 [ 1.021  1.164  1.043  0.788 -0.   ]]


In [327]:
# Fill a len(g) x len(g) table from `dist`.
# NOTE(review): the members x and xx are never used — dist is indexed with the
# loop positions i, j, so this copies the top-left corner of dist. Presumably
# dist[x, xx] was intended — confirm.
g_d = np.zeros((len(g), len(g)), dtype=np.double)
for i, x in enumerate(g):
    for j, xx in enumerate(g):
        g_d[i, j] = dist[i, j]