In [1]:
import requests
import re
import time
import numpy as np
import pickle
from datetime import timedelta
from crashsimilarity.downloader import SocorroDownloader
from crashsimilarity import utils
from crashsimilarity.distance import DistanceCalculator
from gensim.models import doc2vec
import crashsimilarity.tmp as tmp
import logging

In [22]:
BASE_URL = 'https://bugzilla.mozilla.org/rest/bug?'
# url = 'https://bugzilla.mozilla.org/rest/bug?product=Firefox&version=51 branch&limit=1000'

In [49]:
# Bugzilla REST search URL, assembled from adjacent string literals so each
# group of query parameters sits on its own line:
#   - include_fields: the bug fields requested in the response
#   - f1/o1 and f2/o2: cf_crash_signature and see_also with the "isnotempty" operator
#   - product: Core and Firefox
url1 = (
    'https://bugzilla.mozilla.org/rest/bug?'
    'include_fields=id,summary,status,cf_crash_signature,see_also'
    '&f1=cf_crash_signature&f2=see_also'
    '&o1=isnotempty&o2=isnotempty'
    '&product=Core&product=Firefox'
)

In [50]:
# Query Bugzilla. Without a timeout `requests` can wait on a stalled
# connection indefinitely, so bound the wait explicitly.
r1 = requests.get(url1, timeout=120)
# Stop here on an HTTP error status instead of failing later while decoding
# an error page as JSON.
r1.raise_for_status()
r1

<Response [200]>

In [54]:
# Decode the JSON response body and keep the list stored under its 'bugs' key.
data = r1.json()
bugs = data['bugs']
# NOTE(review): the recorded count for this cell is exactly 10000, which may be
# a server-side result cap rather than the full result set -- confirm whether
# the query needs paging.
len(bugs)

10000

In [222]:
# Bugs with a non-empty `see_also` list in which at least one entry contains
# the substring 'bugzilla.mozilla'.
see_also = [
    bug
    for bug in bugs
    if bug['see_also']
    and any('bugzilla.mozilla' in link for link in bug['see_also'])
]
len(see_also)

69

In [160]:
def clean_signatures(sigs):
    """Normalize a multi-line crash-signature field into unique signature names.

    Every line is stripped of surrounding ``[``, ``]`` and space characters,
    has its parenthesised part removed, loses a leading ``@ `` marker and is
    trimmed of remaining whitespace.

    Parameters
    ----------
    sigs : str
        Raw field text, one signature per line. ``\\r\\n``, ``\\n`` and ``\\r``
        are all accepted as line separators.

    Returns
    -------
    list of str
        The distinct, non-empty cleaned signatures in first-seen order.
    """
    seen = set()
    cleaned = []
    for line in re.split(r'\r\n|\r|\n', sigs):
        sig = re.sub(r'\(.*\)', '', line.strip('[] '))
        if sig.startswith('@ '):
            sig = sig[2:]
        sig = sig.strip()
        # A blank line (e.g. produced by a trailing line break) is not a
        # signature; keeping it would make a single-signature bug look like
        # it has several.
        if sig and sig not in seen:
            seen.add(sig)
            cleaned.append(sig)
    return cleaned
# Keep the bugs whose signature field yields more than one cleaned signature,
# storing the cleaned list on each kept bug under the 'clean' key.
multiple_sig = []
for bug in bugs:
    cleaned = clean_signatures(bug['cf_crash_signature'])
    if len(cleaned) > 1:
        bug['clean'] = cleaned
        multiple_sig.append(bug)
len(multiple_sig)

686

In [97]:
# In-place sort: bugs with the most cleaned signatures come first.
multiple_sig.sort(key=lambda x: len(x['clean']), reverse=True)

In [171]:
# traces = []
# prev_t = time.time()
# for i, bug in enumerate(multiple_sig):
#     if i % 100 == 0:
#         print('{}, time spent for last 100: {} (s)'.format(i, time.time() - prev_t))
#         prev_t = time.time()
#     cur = []
#     for sig in bug['clean']:
#         try:
#             r = list(SocorroDownloader().download_stack_traces_for_signature(sig, traces_num=1, period=timedelta(days=350)))[0]
#             if r:
#                 cur.append(r)
#         except:
#             pass
#     traces.append(cur)

0, time spent for last 100: 0.00019550323486328125 (s)
100, time spent for last 100: 318.5518651008606 (s)
200, time spent for last 100: 400.7205352783203 (s)
300, time spent for last 100: 450.31249618530273 (s)
400, time spent for last 100: 383.8052191734314 (s)
500, time spent for last 100: 537.849912405014 (s)
600, time spent for last 100: 615.417822599411 (s)


In [190]:
# Preprocess every downloaded trace, keeping the per-bug grouping of `traces`.
# NOTE(review): the only visible code that builds `traces` is the
# commented-out download cell, so this cell cannot run on a fresh kernel as-is.
prepared = [
    [utils.StackTraceProcessor.preprocess(trace) for trace in bug_traces]
    for bug_traces in traces
]

In [198]:
# Store each preprocessed trace group on the bug at the same index under the
# 'prepared' key. This relies on `prepared` being index-aligned with
# `multiple_sig`.
for i, p in enumerate(prepared):
    multiple_sig[i]['prepared'] = p

In [2]:
# Load the cached vocabulary. The context manager closes the file handle once
# unpickling finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by this project.
with open('data/compressed_vocab.pickle', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
len(vocab)

222895

In [3]:
# The cache read below was written with:
# pickle.dump(multiple_sig, open('data/multiple_sig.pickle', 'wb'))
# The context manager closes the file handle once unpickling finishes.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by this project.
with open('data/multiple_sig.pickle', 'rb') as multiple_sig_file:
    multiple_sig = pickle.load(multiple_sig_file)
prepared = [sig['prepared'] for sig in multiple_sig]
len(multiple_sig)

686

In [4]:
# Flatten the per-bug trace lists into one corpus; `groups` records, for each
# bug, the corpus indices of its traces.
groups = []
corpus = []
for bug_traces in prepared:
    first_index = len(corpus)
    corpus.extend(bug_traces)
    groups.append(list(range(first_index, len(corpus))))

# Map each trace item through `vocab` (items missing from it are kept as-is),
# stringify it, and tag every trace with its corpus index.
compressed_corpus = [
    doc2vec.TaggedDocument([str(vocab.get(item, item)) for item in trace], [tag])
    for tag, trace in enumerate(corpus)
]

In [6]:
compressed_corpus[42]

TaggedDocument(words=['2595', '571', '1510', '348', '349', '3443', '11904', '2787', '348', '572', '573'], tags=[42])

In [7]:
model = doc2vec.Doc2Vec.load('data/model/dm_d200_all.model')
model

INFO:gensim.utils:loading Doc2Vec object from data/model/dm_d200_all.model
INFO:gensim.utils:loading wv recursively from data/model/dm_d200_all.model.wv.* with mmap=None
INFO:gensim.utils:loading syn0 from data/model/dm_d200_all.model.wv.syn0.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:loading docvecs recursively from data/model/dm_d200_all.model.docvecs.* with mmap=None
INFO:gensim.utils:loading doctag_syn0 from data/model/dm_d200_all.model.docvecs.doctag_syn0.npy with mmap=None
INFO:gensim.utils:loading syn1neg from data/model/dm_d200_all.model.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded data/model/dm_d200_all.model


<gensim.models.doc2vec.Doc2Vec at 0x7fe71d3c7780>

In [24]:
group_5_and_more = [g for g in groups if len(g) >= 5]
len(group_5_and_more)

26

In [9]:
# Build the distance data for the corpus and the calculator that uses it.
dist, w2pos = DistanceCalculator.words_distance(compressed_corpus, model)
distance_calculator = DistanceCalculator(model, w2pos, dist)

# Inverted lookups: value -> key for `w2pos` and for `vocab`.
pos2w = {position: word for word, position in w2pos.items()}
pos2vocab = {code: token for token, code in vocab.items()}

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


len(words) = 4103


In [16]:
def group_score(group, corpus, calc):
    """Measure how closely the members of a group rank to one another.

    For every member of ``group`` all documents in ``corpus`` are ordered by
    ``calc.fast_wmd`` distance to that member (smallest first; ties keep
    corpus order). The rank positions of the *other* group members in that
    ordering are then summarised.

    Parameters
    ----------
    group : sequence of int
        Corpus indices that belong together.
    corpus : sequence
        Documents exposing a ``words`` attribute.
    calc : object
        Provides ``fast_wmd(words_a, words_b)`` returning a sortable distance.

    Returns
    -------
    list of tuple
        One ``(mean_rank, max_rank)`` pair per group member, in group order.

    Notes
    -----
    A group with a single member has no other members to rank, so
    ``np.max`` is called on an empty list and raises ``ValueError``.
    """
    score = []
    for member in group:
        query = corpus[member].words
        distances = [calc.fast_wmd(query, doc.words) for doc in corpus]
        # Corpus indices ordered by distance to `member`; `sorted` is stable.
        ranking = sorted(range(len(distances)), key=distances.__getitem__)
        # One dict lookup per group member instead of a linear `list.index` scan.
        rank_of = {doc_index: rank for rank, doc_index in enumerate(ranking)}
        pos = [rank_of[other] for other in group if other != member]
        score.append((np.mean(pos), np.max(pos)))
    return score

In [26]:
# Score only the first ten of the larger groups.
scores = [
    group_score(members, compressed_corpus, distance_calculator)
    for members in group_5_and_more[:10]
]

In [27]:
scores

[[(204.25, 807), (254.0, 763), (225.0, 605), (123.0, 368), (356.5, 769)],
 [(871.0, 928), (381.5, 1012), (458.75, 1022), (429.75, 945), (572.75, 1000)],
 [(145.40000000000001, 639),
  (93.799999999999997, 157),
  (79.599999999999994, 324),
  (23.800000000000001, 37),
  (276.60000000000002, 636),
  (64.599999999999994, 119)],
 [(55.200000000000003, 138),
  (31.600000000000001, 110),
  (37.600000000000001, 126),
  (17.600000000000001, 48),
  (119.0, 254),
  (67.400000000000006, 235)],
 [(136.125, 546),
  (303.375, 653),
  (64.125, 309),
  (386.625, 605),
  (61.375, 302),
  (191.5, 841),
  (63.0, 291),
  (136.0, 546),
  (81.75, 317)],
 [(270.19999999999999, 863),
  (277.60000000000002, 771),
  (529.79999999999995, 986),
  (613.0, 776),
  (150.19999999999999, 506),
  (297.0, 891)],
 [(21.0, 63), (50.0, 77), (84.5, 201), (30.25, 110), (60.75, 89)],
 [(3.0, 5), (3.0, 5), (3.0, 5), (3.0, 5), (3.0, 5), (3.0, 5)],
 [(43.0, 50),
  (7.2000000000000002, 13),
  (8.0, 22),
  (11.4, 37),
  (47.200000

In [18]:
score

[(134.0, 235), (438.5, 873), (335.0, 668)]

In [19]:
group_3_and_more[0]

[43, 44, 45]

In [23]:
corpus[44]

['nsdocshell::setupnewviewer',
 'nsdocshell::embed',
 'nsdocshell::createcontentviewer',
 'nsdsuricontentlistener::docontent',
 'nsdocumentopeninfo::trycontentlistener',
 'nsdocumentopeninfo::dispatchcontent',
 'nsdocumentopeninfo::onstartrequest',
 'nsjarchannel::onstartrequest',
 'nsinputstreampump::onstatestart',
 'nsinputstreampump::oninputstreamready',
 'nsinputstreamreadyevent::run',
 'nsthread::processnextevent',
 'ns_processnextevent',
 'nsthread::shutdown',
 'mozilla::crashreporter::lspannotationgatherer::annotate',
 'nsrunnablemethodimpl<t>::run',
 'nsthread::processnextevent',
 'ns_processnextevent',
 'messageloop::runhandler',
 'nsbaseappshell::run',
 'nsappshell::run']

In [22]:
corpus[45]

['nscomptr_base::assign_with_addref',
 'documentviewerimpl::close',
 'nsdocshell::setupnewviewer',
 'nsdocshell::embed',
 'nsdocshell::createcontentviewer',
 'nsdsuricontentlistener::docontent',
 'nsdocumentopeninfo::trycontentlistener',
 'nsdocumentopeninfo::dispatchcontent',
 'nsdocumentopeninfo::onstartrequest',
 'nsjarchannel::onstartrequest',
 'nsinputstreampump::onstatestart',
 'nsinputstreampump::oninputstreamready',
 'nsinputstreamreadyevent::run',
 'nsthread::processnextevent',
 'ns_processpendingevents_p',
 'nswindow::dispatchpendingevents',
 'nswindow::processmessage',
 'nswindow::windowprocinternal',
 'callwindowproccrashprotected',
 '@0x',
 'callwindowproccrashprotected',
 '@0x',
 'stringduplicatew',
 'stringduplicatew',
 'xul.dll@0x',
 'firefox.exe@0x',
 'getcodepagefileinfo',
 'baseprocessstart',
 'firefox.exe@0x']

In [38]:
# Distance from the document at `idx` to every document in the corpus.
# NOTE(review): `fast_rwmd_distance` is not defined or imported in any visible
# cell, and `idx` is only assigned in a cell that appears further down, so this
# cell depends on leftover kernel state -- confirm where both come from.
d2 = []
for c in compressed_corpus:
    d2.append(fast_rwmd_distance(model, compressed_corpus[idx].words, c.words, w2pos, dist))

In [29]:
# Time the distance from the first document of `g` to every corpus document
# with log output below CRITICAL silenced.
# NOTE(review): `g` and `fast_wmdistance_all` are not defined in any visible
# cell, and the recorded output of this cell is a TypeError -- confirm their
# origin before relying on `res`.
logging.root.setLevel(logging.CRITICAL)
try:
    t = time.time()
    res = []
    idx = g[0]
    for i in range(len(compressed_corpus)):
        d = fast_wmdistance_all(model, compressed_corpus[idx].words, compressed_corpus[i].words, w2pos, dist)
        res.append(d)
    print (time.time() - t)
finally:
    # Restore INFO logging even when the loop raises; otherwise the rest of
    # the session stays silenced.
    logging.root.setLevel(logging.INFO)

TypeError: only integer arrays with one element can be converted to an index

In [27]:
len(res)

1051

In [711]:
closest = sorted(rwmd_all[g[0]], key=lambda x:x[1])

In [715]:
closest_wmd = sorted(list(enumerate(res)), key=lambda x:x[1])

In [716]:
closest_wmd[:10]

[(551, 0.0),
 (552, 0.04838442680510924),
 (555, 0.1415106299697912),
 (554, 0.2816956092271322),
 (553, 0.3483668778315673),
 (922, 0.45063622904741113),
 (923, 0.4530710519116199),
 (531, 0.5792857161297745),
 (909, 0.753549104651525),
 (252, 0.7541491725111394)]

In [731]:
closest = [[x[0] for x in sorted(rwmd_all[i], key=lambda x:x[1])] for i in g]

In [738]:
# Inline scoring for the single group `g`: order each member's entries in
# `rwmd_all` by distance, then summarise the rank positions of the other
# members of `g`.
# NOTE(review): `g` and `rwmd_all` are not defined in any visible cell --
# confirm where they come from.
closest = [[x[0] for x in sorted(rwmd_all[i], key=lambda x:x[1])] for i in g]
score = []
for i, c in enumerate(closest):
    others = [x for x in g if x != g[i]]
    pos = [c.index(o) for o in others]
    score.append((np.mean(pos), np.max(pos)))
# `return` is only valid inside a function; at notebook top level it is a
# SyntaxError. End the cell with the value to display it instead.
score

In [743]:
group_score(g, rwmd_all)

[(2.5, 4), (2.5, 4), (3.0, 6), (2.5, 4), (2.5, 4)]

In [744]:
scores = [group_score(i, rwmd_all) for i in group_more5]

In [745]:
scores[:10]

[[(491.5, 696), (230.0, 689), (198.5, 550), (396.75, 682), (587.25, 782)],
 [(973.0, 1043), (390.75, 1013), (482.5, 1024), (459.0, 959), (915.5, 987)],
 [(202.59999999999999, 514),
  (67.0, 130),
  (277.80000000000001, 518),
  (160.19999999999999, 451),
  (287.80000000000001, 532),
  (41.200000000000003, 70)],
 [(218.59999999999999, 890),
  (360.39999999999998, 870),
  (384.39999999999998, 923),
  (228.19999999999999, 857),
  (298.19999999999999, 852),
  (190.19999999999999, 715)],
 [(119.25, 483),
  (635.625, 862),
  (78.75, 313),
  (626.0, 836),
  (76.125, 300),
  (141.25, 597),
  (49.375, 194),
  (119.125, 483),
  (60.5, 253)],
 [(262.80000000000001, 787),
  (320.0, 772),
  (942.60000000000002, 988),
  (739.60000000000002, 897),
  (867.79999999999995, 935),
  (326.0, 839)],
 [(213.75, 778), (387.25, 885), (161.5, 444), (215.75, 769), (430.75, 899)],
 [(3.0, 5),
  (23.0, 79),
  (3.0, 5),
  (3.0, 5),
  (75.200000000000003, 256),
  (3.0, 5)],
 [(47.200000000000003, 53),
  (7.5999999999

In [719]:
group_score(g, rwmd_all)

1.6000000000000001