In [2]:
import requests
import re
import time
import numpy as np
import pickle
from datetime import timedelta
from crashsimilarity.downloader import SocorroDownloader
from crashsimilarity import utils
from crashsimilarity.distance import DistanceCalculator
from gensim.models import doc2vec
import crashsimilarity.tmp as tmp
import logging

In [1]:
# Bugzilla REST search configuration: which bug fields to fetch and the
# creation-date window to search.
BASE_URL = 'https://bugzilla.mozilla.org/rest/bug?'
fields = ['id', 'summary', 'status', 'cf_crash_signature', 'see_also']
from_date = '2015-05-31'
to_date = '2016-05-31'
# Query-string tail: bugs created inside the window, with a non-empty
# cf_crash_signature, in the Core or Firefox products.
suffix = ''.join([
    '&chfield=[Bug creation]',
    '&chfieldfrom=', from_date,
    '&chfieldto=', to_date,
    '&f2=cf_crash_signature&o2=isnotempty',
    '&product=Core&product=Firefox',
])
url_cf_not_empty_core_firefox_last_year = '{}include_fields={}{}'.format(
    BASE_URL, ','.join(fields), suffix)

In [3]:
# Run the Bugzilla search and keep the list stored under the 'bugs' key.
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() makes an HTTP failure surface as an HTTPError instead of
# a confusing JSON-decode error or KeyError on 'bugs'.
r1 = requests.get(url_cf_not_empty_core_firefox_last_year, timeout=300)
r1.raise_for_status()
data = r1.json()
bugs = data['bugs']
len(bugs)

1844

In [4]:
# Bugs whose 'see_also' list contains at least one entry mentioning
# 'bugzilla.mozilla'. The truthiness check comes first so that an empty or
# missing value is never iterated.
see_also = [
    bug for bug in bugs
    if bug['see_also']
    and any('bugzilla.mozilla' in link for link in bug['see_also'])
]
len(see_also)

299

In [5]:
def clean_signatures(sigs):
    """Split a raw cf_crash_signature value into unique, normalised signatures.

    Each line of ``sigs`` is processed as follows:
      * surrounding '[', ']' and spaces are stripped;
      * everything from the first '(' to the last ')' is removed;
      * a leading '@ ' marker is dropped;
      * remaining surrounding whitespace is stripped.

    Lines that end up empty (blank lines, a trailing newline) are discarded so
    they are not counted as a signature. Duplicates are removed while keeping
    the order of first appearance, which makes the result deterministic.

    Args:
        sigs: The raw field text, one signature per line. Any line ending
            recognised by ``str.splitlines`` is accepted. A falsy value
            (``None`` or ``''``) yields an empty list.

    Returns:
        A list of distinct cleaned signature strings.
    """
    if not sigs:
        return []
    seen = set()
    unique = []
    for line in sigs.splitlines():
        sig = re.sub(r'\(.*\)', '', line.strip('[] '))
        if sig.startswith('@ '):
            sig = sig[2:]
        sig = sig.strip()
        # Skip blanks and anything already collected.
        if sig and sig not in seen:
            seen.add(sig)
            unique.append(sig)
    return unique
# Keep only the bugs whose crash-signature field cleans up to more than one
# signature, then store that cleaned list on each kept bug under 'clean'.
multiple_sig = list(filter(
    lambda bug: len(clean_signatures(bug['cf_crash_signature'])) > 1,
    bugs,
))
for sig in multiple_sig:
    sig.update(clean=clean_signatures(sig['cf_crash_signature']))
len(multiple_sig)

317

In [11]:
# Order bugs in place from most to fewest cleaned signatures; the sort is
# stable, so bugs with equal counts keep their existing relative order.
multiple_sig.sort(key=lambda bug: -len(bug['clean']))

In [13]:
def download_stack_traces(sigs, traces_num=1, days=360, timeout=60):
    """Query Socorro SuperSearch for proto_signature facets of given signatures.

    Args:
        sigs: A single signature string or a list of signature strings. Each
            one is sent with a '^' prefix in the 'signature' parameter.
        traces_num: Value sent as '_facets_size'.
        days: Size of the look-back window in days; the query is restricted to
            dates on or after ``utils.utc_today() - days``.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking indefinitely.

    Returns:
        The value found under ``['facets']['proto_signature']`` in the JSON
        response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Named `since` so the notebook-level `from_date` string is not shadowed.
    since = utils.utc_today() - timedelta(days=days)
    if isinstance(sigs, str):
        sigs = [sigs]
    params = {'signature': ['^' + s for s in sigs],
              'date': ['>=' + str(since)],
              '_facets': ['proto_signature'],
              '_facets_size': traces_num
             }
    _SUPER_SEARCH_URL = 'https://crash-stats.mozilla.com/api/SuperSearch'
    r = requests.get(_SUPER_SEARCH_URL, params=params, timeout=timeout)
    # Fail with the HTTP error itself rather than a KeyError on the JSON below.
    r.raise_for_status()
    return r.json()['facets']['proto_signature']

In [6]:
# Drop bugs that list 20 or more cleaned signatures.
multiple_sig = list(filter(lambda bug: len(bug['clean']) < 20, multiple_sig))

In [69]:
# Store groups[i] on the i-th bug under 'prepared', for every index of `traces`.
# NOTE(review): in the visible cells `groups` is only assigned inside
# commented-out code, and `traces` is built in a cell that appears later in
# the file — confirm the intended execution order.
for i in range(len(traces)):
    multiple_sig[i].update(prepared=groups[i])

In [72]:
# pickle.dump(multiple_sig, open('data/multiple_sig_last_year.pickle', 'wb'))

In [14]:
# For every bug, download the proto_signature facets of each of its cleaned
# signatures (one request per signature); traces[k] lines up with
# multiple_sig[k].
traces = []
t = time.time()  # start time for the elapsed-seconds progress output
for i, sig in enumerate(multiple_sig):
    # Progress line every 20 bugs: index and seconds elapsed so far.
    if not i % 20:
        print(i, time.time() - t)
    cur = list(map(download_stack_traces, sig['clean']))
    traces.append(cur)
print(i, time.time() - t)

0 0.0002472400665283203
20 76.9627845287323
40 159.09608268737793
60 254.4182641506195
80 343.0237798690796
100 418.8179714679718
120 476.75723099708557
140 612.9023175239563
160 693.8782925605774
180 799.8837325572968
200 880.2676274776459
220 992.3882038593292
240 1080.863703250885
260 1190.1470143795013
280 1305.6669027805328
300 1432.0678486824036
311 1486.387377500534


In [20]:
# groups = []
# for i, t in enumerate(traces):
#     clean = []
#     for x in t:
#         if x:
#             x = x[0]
#             words = utils.StackTraceProcessor.preprocess(x['term'])
#             if words[0] == '@0x':
#                 words = words[1:]
#             clean.append(words)
#     groups.append(clean)

In [27]:
# for i, p in enumerate(groups):
#     multiple_sig[i]['prepared'] = p

In [28]:
# pickle.dump(multiple_sig, open('data/model-size/multiple_sig_year_before.pickle', 'wb'))

In [29]:
# # pickle.dump(multiple_sig, open('data/multiple_sig.pickle', 'wb'))
# multiple_sig = pickle.load(open('data/multiple_sig_last_year.pickle', 'rb'))
# prepared = [sig['prepared'] for sig in multiple_sig]
# len(multiple_sig)

In [30]:
# groups = []
# corpus = []
# for p in prepared:
#     group = []
#     for t in p:
#         corpus.append(t)
#         group.append(len(corpus)-1)
#     groups.append(group)
# compressed_corpus = [[str(vocab.get(i, i)) for i in c] for c in corpus]
# compressed_corpus = [doc2vec.TaggedDocument(trace, [i]) for i, trace in enumerate(compressed_corpus)]
# groups = [g for g in groups if len(g) > 1]

In [32]:
# model = doc2vec.Doc2Vec.load('data/model/dm_d200_all.model')
# model

In [31]:
# dist, w2pos =  DistanceCalculator.words_distance(compressed_corpus, model)
# distance_calculator = DistanceCalculator(model, w2pos, dist)
# pos2w = dict([(i[1], i[0]) for i in w2pos.items()])
# pos2vocab = dict([(i[1], i[0]) for i in vocab.items()])