In [17]:
import requests
import re
import time
import numpy as np
import pickle
from datetime import timedelta
from crashsimilarity.downloader import SocorroDownloader
from crashsimilarity import utils
from crashsimilarity.distance import DistanceCalculator
from gensim.models import doc2vec
import crashsimilarity.tmp as tmp
import logging

In [None]:
# Endpoint of the Bugzilla REST bug search; query parameters go after the '?'.
# NOTE(review): the cells visible in this notebook spell the full URL out
# instead of building it from BASE_URL -- confirm whether it is still needed.
BASE_URL = 'https://bugzilla.mozilla.org/rest/bug?'
# url = 'https://bugzilla.mozilla.org/rest/bug?product=Firefox&version=51 branch&limit=1000'

In [None]:
# Bug search restricted to Core/Firefox bugs whose crash-signature and
# see_also fields are both non-empty; only the listed fields are returned.
url1 = (
    'https://bugzilla.mozilla.org/rest/bug?'
    'include_fields=id,summary,status,cf_crash_signature,see_also'
    '&f1=cf_crash_signature&f2=see_also'
    '&o1=isnotempty&o2=isnotempty'
    '&product=Core&product=Firefox'
)

In [None]:
# Fetch the matching bugs. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() reports an HTTP error here
# rather than as a confusing JSON/KeyError in the next cell.
r1 = requests.get(url1, timeout=120)
r1.raise_for_status()
r1

In [None]:
# Decode the JSON response and keep the list stored under its 'bugs' key.
data = r1.json()
bugs = data['bugs']
len(bugs)

In [None]:
def _links_to_bugzilla(bug):
    """Return True when any see_also entry of `bug` contains 'bugzilla.mozilla'."""
    return any('bugzilla.mozilla' in link for link in bug['see_also'])


# Bugs with a non-empty see_also list that references another Bugzilla entry.
see_also = [bug for bug in bugs if bug['see_also'] and _links_to_bugzilla(bug)]
len(see_also)

In [None]:
def clean_signatures(sigs):
    """Split a raw crash-signature field into unique, normalised signatures.

    Each line of `sigs` is processed as follows: surrounding '[', ']' and
    spaces are stripped, everything from the first '(' to the last ')' is
    removed (the pattern is greedy), a leading '@ ' marker is dropped and the
    remainder is trimmed.

    Blank lines (for example a trailing newline) are ignored so that they are
    not counted as an extra, empty signature. Both '\\r\\n' and bare '\\n'
    line endings are accepted.

    Args:
        sigs: the field as one string, one signature per line; None or ''
            yields an empty list.

    Returns:
        list of distinct signature strings, in order of first appearance.
    """
    unique = []
    seen = set()
    for line in re.split(r'\r?\n', sigs or ''):
        sig = re.sub(r'\(.*\)', '', line.strip('[] '))
        if sig.startswith('@ '):
            sig = sig[2:]
        sig = sig.strip()
        # Skip empties and repeats; `seen` keeps the membership test cheap.
        if sig and sig not in seen:
            seen.add(sig)
            unique.append(sig)
    return unique
# Keep only the bugs that list more than one distinct signature and store the
# cleaned list on the bug dict under 'clean'. Each bug is cleaned once instead
# of once for the filter and a second time for the assignment.
multiple_sig = []
for bug in bugs:
    cleaned = clean_signatures(bug['cf_crash_signature'])
    if len(cleaned) > 1:
        bug['clean'] = cleaned
        multiple_sig.append(bug)
len(multiple_sig)

In [None]:
# Sort in place so that bugs with the most cleaned signatures come first.
multiple_sig.sort(key=lambda x: len(x['clean']), reverse=True)

In [None]:
# traces = []
# prev_t = time.time()
# for i, bug in enumerate(multiple_sig):
#     if i % 100 == 0:
#         print('{}, time spent for last 100: {} (s)'.format(i, time.time() - prev_t))
#         prev_t = time.time()
#     cur = []
#     for sig in bug['clean']:
#         try:
#             r = list(SocorroDownloader().download_stack_traces_for_signature(sig, traces_num=1, period=timedelta(days=350)))[0]
#             if r:
#                 cur.append(r)
#         except:
#             pass
#     traces.append(cur)

In [None]:
# Preprocess every downloaded stack trace, keeping the per-bug grouping.
# NOTE(review): `traces` is only assigned by the download loop that is
# commented out in the previous cell, so this cell needs that loop (or a
# cached copy of its result) to have been run first.
preprocess = utils.StackTraceProcessor.preprocess
prepared = [[preprocess(trace) for trace in bug_traces] for bug_traces in traces]

In [None]:
# Store each bug's preprocessed traces on the bug dict itself; `prepared` and
# `multiple_sig` are matched up by position.
for position, stack_traces in enumerate(prepared):
    bug = multiple_sig[position]
    bug['prepared'] = stack_traces

In [2]:
# Load the cached vocabulary. The `with` block closes the file handle that
# pickle.load(open(...)) used to leave open.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('data/compressed_vocab.pickle', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
len(vocab)

222895

In [3]:
# To regenerate the cache after running the download/preprocess cells:
# with open('data/multiple_sig.pickle', 'wb') as multiple_sig_file:
#     pickle.dump(multiple_sig, multiple_sig_file)

# Load the cached bugs. The `with` block closes the file handle that
# pickle.load(open(...)) used to leave open.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('data/multiple_sig.pickle', 'rb') as multiple_sig_file:
    multiple_sig = pickle.load(multiple_sig_file)
prepared = [sig['prepared'] for sig in multiple_sig]
len(multiple_sig)

686

In [5]:
# Flatten the per-bug traces into one corpus; groups[k] holds the corpus
# indices of the traces that came from the k-th bug.
corpus = []
groups = []
for bug_traces in prepared:
    first = len(corpus)
    corpus.extend(bug_traces)
    groups.append(list(range(first, len(corpus))))

# Replace each frame by its vocabulary entry as a string (frames missing from
# the vocabulary are kept as they are) and tag every trace with its corpus index.
compressed_corpus = [
    doc2vec.TaggedDocument([str(vocab.get(frame, frame)) for frame in trace], [tag])
    for tag, trace in enumerate(corpus)
]

In [6]:
compressed_corpus[42]

TaggedDocument(words=['2595', '571', '1510', '348', '349', '3443', '11904', '2787', '348', '572', '573'], tags=[42])

In [7]:
# Load the previously saved Doc2Vec model from disk and display it.
model = doc2vec.Doc2Vec.load('data/model/dm_d200_all.model')
model

INFO:gensim.utils:loading Doc2Vec object from data/model/dm_d200_all.model
INFO:gensim.utils:loading wv recursively from data/model/dm_d200_all.model.wv.* with mmap=None
INFO:gensim.utils:loading syn0 from data/model/dm_d200_all.model.wv.syn0.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:loading docvecs recursively from data/model/dm_d200_all.model.docvecs.* with mmap=None
INFO:gensim.utils:loading doctag_syn0 from data/model/dm_d200_all.model.docvecs.doctag_syn0.npy with mmap=None
INFO:gensim.utils:loading syn1neg from data/model/dm_d200_all.model.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded data/model/dm_d200_all.model


<gensim.models.doc2vec.Doc2Vec at 0x7f9ad5f88eb8>

In [52]:
# Keep only the groups (one per bug) that contain at least five traces.
MIN_GROUP_SIZE = 5
group_5_and_more = [members for members in groups if len(members) >= MIN_GROUP_SIZE]
len(group_5_and_more)

26

In [8]:
# Word-to-word distances for the corpus plus the word -> position mapping,
# wrapped in a calculator object for the WMD computations below.
dist, w2pos = DistanceCalculator.words_distance(compressed_corpus, model)
distance_calculator = DistanceCalculator(model, w2pos, dist)

# Reverse lookups: position -> word, and vocabulary value -> vocabulary key.
pos2w = {position: word for word, position in w2pos.items()}
pos2vocab = {code: name for name, code in vocab.items()}

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


len(words) = 4103


In [40]:
def wmd_mat(corpus, calculator, prog=10):
    """Build the symmetric matrix of pairwise distances between documents.

    Args:
        corpus: sequence of documents; each must expose a `.words` attribute.
        calculator: object providing `fast_wmd(doc1_words, doc2_words)`.
        prog: roughly how many progress lines to print over the whole run;
            0 (or None) disables progress output.

    Returns:
        (len(corpus), len(corpus)) float64 array with a zero diagonal, where
        entries [i, j] and [j, i] both hold `calculator.fast_wmd` of the two
        documents' words.
    """
    n_docs = len(corpus)
    wmd_dist = np.zeros((n_docs, n_docs), dtype=np.double)
    n_pairs = n_docs * (n_docs - 1) // 2
    # Clamp to 1: with fewer pairs than `prog` the step used to be 0 and the
    # modulo below raised ZeroDivisionError.
    step = max(1, n_pairs // prog) if prog else 0
    started = time.time()
    done = 0
    # Walk the upper triangle directly instead of materialising every (i, j).
    for i in range(n_docs):
        for j in range(i + 1, n_docs):
            if step and done and done % step == 0:
                print('{}%, {} s.'.format(done / (n_pairs * 0.01), time.time() - started))
            doc1 = corpus[i].words
            doc2 = corpus[j].words
            wmd_dist[i, j] = wmd_dist[j, i] = calculator.fast_wmd(doc1, doc2)
            done += 1
    return wmd_dist

In [45]:
# To regenerate the cache (slow):
# wmd_all = wmd_mat(compressed_corpus, distance_calculator, 50)
# with open('data/wmd_all.pickle', 'wb') as wmd_file:
#     pickle.dump(wmd_all, wmd_file)

# Load the cached distance matrix. The `with` block closes the file handle
# that pickle.load(open(...)) used to leave open.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('data/wmd_all.pickle', 'rb') as wmd_file:
    wmd_all = pickle.load(wmd_file)

In [46]:
wmd_all.shape

(1051, 1051)

In [61]:
def group_score(group, dist):
    closest = [[x[0] for x in sorted(enumerate(dist[i]), key=lambda x:x[1])] for i in group]
    score = []
    for i, c in enumerate(closest):
        others = others = [x for x in group if x != group[i]]
        pos = [c.index(o) for o in others]
        score.append((np.mean(pos), np.max(pos)))
    return score

In [86]:
# One list of (mean rank, max rank) tuples per group of five or more traces.
scores = [group_score(members, wmd_all) for members in group_5_and_more]

In [87]:
scores

[[(204.25, 807), (254.0, 763), (225.0, 605), (123.0, 368), (356.5, 769)],
 [(871.0, 928), (381.5, 1012), (458.75, 1022), (429.75, 945), (572.75, 1000)],
 [(145.40000000000001, 639),
  (93.799999999999997, 157),
  (79.599999999999994, 324),
  (23.800000000000001, 37),
  (276.60000000000002, 636),
  (64.599999999999994, 119)],
 [(55.200000000000003, 138),
  (31.600000000000001, 110),
  (37.600000000000001, 126),
  (17.600000000000001, 48),
  (119.0, 254),
  (67.400000000000006, 235)],
 [(136.125, 546),
  (303.375, 653),
  (64.125, 309),
  (386.625, 605),
  (61.375, 302),
  (191.5, 841),
  (63.0, 291),
  (136.0, 546),
  (81.75, 317)],
 [(270.19999999999999, 863),
  (277.60000000000002, 771),
  (529.79999999999995, 986),
  (613.0, 776),
  (150.19999999999999, 506),
  (297.0, 891)],
 [(21.0, 63), (50.0, 77), (84.5, 201), (30.25, 110), (60.75, 89)],
 [(3.0, 5), (3.0, 5), (3.0, 5), (3.0, 5), (3.0, 5), (3.0, 5)],
 [(43.0, 50),
  (7.2000000000000002, 13),
  (8.0, 22),
  (11.4, 37),
  (47.200000

In [85]:
# Score every cleaned group against the full distance matrix.
# NOTE(review): `clear_groups` is iterated here as a list of groups, but the
# only definition visible in this notebook (further down) is a function with
# that name. On a fresh top-to-bottom run this cell fails; it presumably relied
# on a kernel variable holding something like clear_groups(group_5_and_more)
# -- confirm and give that list its own name.
clear_scores = []
for group in clear_groups:
    clear_scores.append(group_score(group, wmd_all))

In [88]:
clear_scores

[[(53.0, 105), (54.0, 107), (42.5, 47)],
 [(171.33333333333334, 343),
  (271.0, 430),
  (258.0, 282),
  (430.33333333333331, 554)],
 [(19.0, 19), (3.0, 3)],
 [(50.0, 50), (32.0, 32)],
 [(136.125, 546),
  (303.375, 653),
  (64.125, 309),
  (386.625, 605),
  (61.375, 302),
  (191.5, 841),
  (63.0, 291),
  (136.0, 546),
  (81.75, 317)],
 [(322.75, 863), (324.5, 771), (415.75, 516), (611.5, 776), (316.5, 891)],
 [(11.0, 11), (2.0, 2)],
 [(3.0, 5), (3.0, 5), (3.0, 5), (3.0, 5), (3.0, 5), (3.0, 5)],
 [(45.5, 50), (7.75, 13), (13.5, 37), (57.25, 200), (14.75, 48)],
 [(5.666666666666667, 11),
  (5.8888888888888893, 11),
  (6.1111111111111107, 11),
  (6.1111111111111107, 11),
  (7.2222222222222223, 13),
  (6.7777777777777777, 13),
  (6.7777777777777777, 11),
  (6.1111111111111107, 11),
  (6.7777777777777777, 11),
  (6.1111111111111107, 11)],
 [(2.5, 4), (2.5, 4), (2.5, 4), (2.5, 4), (2.5, 4)],
 [(43.666666666666664, 104),
  (62.0, 172),
  (14.333333333333334, 18),
  (49.333333333333336, 129)],


In [93]:
def clear_groups(groups, min_len=3, min_ratio=0.5, traces=None):
    """Drop unusually short traces from every group.

    A trace index is kept when its trace is strictly longer than `min_len`
    and also strictly longer than `min_ratio` times the average length of the
    traces in its group that passed the first test.

    Args:
        groups: iterable of groups, each a list of indices into the traces.
        min_len: absolute length threshold (exclusive).
        min_ratio: fraction of the group's average length used as the second
            threshold (exclusive).
        traces: sequence of traces to measure; defaults to the notebook-level
            `corpus`.

    Returns:
        list with one filtered index list per input group, in the same order.
        A group may come back empty.

    NOTE(review): other cells in this notebook index and iterate a list that
    is also called `clear_groups`; store the result under a different name so
    it does not overwrite this function.
    """
    source = corpus if traces is None else traces
    rv = []
    for g in groups:
        cg = [i for i in g if len(source[i]) > min_len]
        # Skip the average for an empty group: np.average([]) only yields NaN
        # plus a RuntimeWarning, and there is nothing left to filter anyway.
        if cg:
            avg = np.average([len(source[i]) for i in cg])
            cg = [i for i in cg if len(source[i]) > avg * min_ratio]
        rv.append(cg)
    return rv

In [60]:
# Show the traces of the group at index 10 of the cleaned groups.
# NOTE(review): this indexes `clear_groups` as a list, while the only visible
# definition of that name in this notebook is a function -- the cell depends
# on kernel state that a fresh run does not recreate; confirm and rename.
[corpus[i] for i in clear_groups[10]]

[['arena_dalloc_small',
  'arena_dalloc',
  'free',
  "nsprefbranch::`vector deleting destructor'",
  'nsprefbranch::release',
  'mozilla::preferences::~preferences',
  "mozilla::preferences::`scalar deleting destructor'",
  'mozilla::preferences::release',
  'mozilla::preferences::shutdown',
  'nscomponentmanagerimpl::knownmodule::~knownmodule',
  'nstarray<t>::destructrange',
  'nstarray<t>::removeelementsat',
  'nscomponentmanagerimpl::shutdown',
  'mozilla::shutdownxpcom',
  'scopedxpcomstartup::~scopedxpcomstartup',
  'xre_main',
  'wmain',
  '_rtc_initialize',
  '__tmaincrtstartup',
  'firefox.exe@0x',
  'basethreadinitthunk',
  '__rtluserthreadstart',
  '_rtluserthreadstart'],
 ['arena_dalloc_small',
  'arena_dalloc',
  'free',
  "nsprefbranch::`scalar deleting destructor'",
  'nsprefbranch::release',
  'mozilla::preferences::~preferences',
  "mozilla::preferences::`vector deleting destructor'",
  'mozilla::preferences::release',
  'mozilla::preferences::shutdown',
  'nscomponen