### Load the Duplicated pairs from the ELK, categorized by years, tasks (snaptask or wisdom), and apps

You can download the `elk_result.pickle` data from https://drive.google.com/file/d/1WHASgopxAo1nk279p70LJppdMZyfj4k4/view?usp=sharing.

In [1]:
import pickle

with open('elk_result.pickle', 'rb') as f:
    elk_result = pickle.load(f)

### Defining the cassandra client for feature_pool and bug_report querying given `report_id`

- Features created by wisdom team right now is stored in `wisdom_classifier.feature_pool` table
- Features that are not yet being used by either quality score or dedup model is stored in `wisdom.bug_reports`, this table contains network_logs, analytics_logs, console_logs etc.
- When doing the query with `report_id`, do make sure `case_sensitive` is setup to `False` as the cassandra table is currently mixing `report_id` will all cases.

In [2]:
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement
from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT
from cassandra.query import BatchStatement
from cassandra.auth import PlainTextAuthProvider
import pandas as pd
import datetime
import os
import editdistance
from sklearn.utils import shuffle

pd.options.display.max_colwidth = 500


"""
Install cassandra via 

brew install cassandra

Then run this on local machine

ssh -MfN -L 9042:schemadock3128-dca1:9042 adhoc20-dca1
"""

class CassandraClient:
    def __init__(self, contact_points, port, username, password):
        auth_provider = PlainTextAuthProvider(username=username, password=password)
        profile = ExecutionProfile(request_timeout=10000)
        cluster = Cluster(contact_points=[contact_points], port=port, auth_provider=auth_provider, 
                          idle_heartbeat_interval=150, idle_heartbeat_timeout=150, \
                          execution_profiles={EXEC_PROFILE_DEFAULT: profile})
        self.wisdom_session = cluster.connect('wisdom')
        self.wisdom_classifier_session = cluster.connect('wisdom_classifier')
    
    def execute(self, session, cql):
        return [row for row in session.execute(cql)]
        
    def get_bug_report(self, report_id, columns = None):
        table_name = 'bug_reports'
        if not columns:
            columns = ['*']
            
        cql = """
            select {columns}
            from {tablename}
            where report_uuid = '{report_id}' ALLOW FILTERING;""".format(columns=','.join(columns),
            tablename=table_name, report_id=report_id)
        
        res = self.execute(self.wisdom_session, cql)
    
        if len(res) > 0:
                return res[0]
        else:
            return None
        
    def _get_case_insensitive_ids(self, report_ids):
        new_report_ids = []
        for report_id in report_ids:
            new_report_ids.extend([report_id.lower(), report_id.upper()])
                
        return new_report_ids
    
    def get_bug_reports(self, report_ids, case_sensitive = False, columns = None):
        table_name = 'bug_reports'
        if not columns:
            columns = ['*']
        
        if not case_sensitive:
            report_ids = self._get_case_insensitive_ids(report_ids)

        cql = """
            select {columns}
            from {tablename}
            where report_uuid in ('{report_ids}') ALLOW FILTERING;""".format(columns=','.join(columns), \
                                                                             tablename=table_name, 
                                                                             report_ids='\',\''.join(report_ids))
        return self.execute(self.wisdom_session, cql)
    
    def get_report_feature_pool(self, report_id, columns = None):
        table_name = 'feature_pool'
        if not columns:
            columns = ['*']
                    
        cql = """
            select {columns} from {tablename}
            where report_id = '{report_id}' ALLOW FILTERING;""".format(columns=','.join(columns), \
                                                                       tablename=table_name, report_id=report_id)
        
        res = self.execute(self.wisdom_classifier_session, cql)
        
        if len(res) > 0:
            return res[0]
        else:
            return None
        
    def get_report_feature_pools(self, report_ids, case_sensitive = False, columns = None):
        table_name = 'feature_pool'
        if not columns:
            columns = ['*']
        
        if not case_sensitive:
            report_ids = self._get_case_insensitive_ids(report_ids)
        
        cql = """
            select * from {tablename}
            where report_id in ('{report_ids}') ALLOW FILTERING;""".format(tablename=table_name, \
                                                                           report_ids='\',\''.join(report_ids))
        
        res = self.execute(self.wisdom_classifier_session, cql)
        
        return res
        
    def get_reports_feature_pool_with_time_duration(self, time_duration):
        table_name = 'feature_pool'
        
        date_input = datetime.datetime.today()
        date1 = date_input.strftime('%Y-%m-%d')
        date2 = (date_input-datetime.timedelta(time_duration)).strftime('%Y-%m-%d')
        
        print('date1 is : {}\n'.format(date1))
        print('date2 is : {}\n'.format(date2))
        
        cql = """
            select report_id, created_at, report_created_at, title, description, tag, screenshot_hash_avg, screenshot_hash_perception, screenshot_hash_diff, 
            text_tf_ngram, title_tf_ngram, feature_descr_tf_ngram, app_name, platform
            from {tablename} 
            where report_created_at > '{date2}' and report_created_at <= '{date1}' 
            ALLOW FILTERING;""".format(tablename=table_name, date1=date1, date2=date2)
        
        return self.execute(self.wisdom_classifier_session, cql)

contact_points = "localhost"
port = 9042
username = "apphealth_cassandra"
password = "xxx"

wisdom_cassandra_client = CassandraClient(contact_points, port, username, password)

### Querying cassandra database by calling this method.


- `get_report_id_to_features` will create a mapping between `report_id` and its corrresponding `bug_report` (from wisdom.bug_reports) and `feature_pool` (from `wisdom_classifier.feature_pool`)


- `get_random_pairs` will generate the corresponding nondup pairs given the dup pairs and list of `report_ids`

In [3]:
import random

# Generate random pairs that are not duplicated as contrast training data
def get_random_pairs(report_ids, dup_pairs, n_random_samples, random_seed=7):
    random.seed(random_seed)
    random_pairs_set = set()
    df = pd.DataFrame(dup_pairs)
    while len(random_pairs_set) < n_random_samples:
        p = random.sample(set(report_ids), 2)
        if (df[(df[0] == p[0]) & (df[1] == p[1])].empty) and (df[(df[1] == p[0]) & (df[0] == p[1])].empty):
            random_pairs_set.add(frozenset(p))
            
    random_pairs = [list(random_pair) for random_pair in random_pairs_set]
    
    return random_pairs

def get_report_id_to_features(elk_result):
    report_id_to_features = {}
    for task in elk_result.keys():
        for year in elk_result[task].keys():
            for app in elk_result[task][year].keys():
                report_ids = elk_result[task][year][app]['report_ids']

                bug_reports = wisdom_cassandra_client.get_bug_reports(report_ids, False)
                print('query for bug_reports of {0}, {1}, {2} has completed'.format(task, year, app))
                feature_pools = wisdom_cassandra_client.get_report_feature_pools(report_ids, False)
                print('query for feature_pools of {0}, {1}, {2} has completed'.format(task, year, app))

                bug_reports_dict = {bug_report.report_uuid.upper(): bug_report for bug_report in bug_reports}
                feature_pool_dict = {feature_pool.report_id.upper(): feature_pool for feature_pool in feature_pools}

                features = dict()
                for report_id in report_ids:
                    report_id = report_id.upper()
                    try:
                        bug_report = bug_reports_dict[report_id]
                        feature_pool = feature_pool_dict[report_id]
                        features[report_id.upper()] = {'bug_report': bug_report, 'feature_pool': feature_pool}
                    except:
                        if task == 'wisdom':
                            print(report_id + ' does not exist in cassandra table')
                            
                valid_report_ids = features.keys()
                dup_pairs = []
                for pair in elk_result[task][year][app]['pairs']:
                    if pair[0] in valid_report_ids and pair[1] in valid_report_ids:
                        dup_pairs.append(pair)

                elk_result[task][year][app]['dup_pairs'] = dup_pairs
                elk_result[task][year][app]['random_pairs'] = get_random_pairs(valid_report_ids, dup_pairs, \
                                                                               len(dup_pairs) * 30)
                
                elk_result[task][year][app]['features'] = features
                report_id_to_features.update(features)

    return report_id_to_features

report_id_to_features = get_report_id_to_features(elk_result)

query for bug_reports of snap, 2018, rider has completed
query for feature_pools of snap, 2018, rider has completed
query for bug_reports of snap, 2018, driver has completed
query for feature_pools of snap, 2018, driver has completed
query for bug_reports of snap, 2019, rider has completed
query for feature_pools of snap, 2019, rider has completed
query for bug_reports of snap, 2019, driver has completed
query for feature_pools of snap, 2019, driver has completed
query for bug_reports of wisdom, 2019, rider has completed
query for feature_pools of wisdom, 2019, rider has completed
9320D5C3-913C-47FA-96CC-C2261E86C896 does not exist in cassandra table
52E597D2-DCDD-4A17-A1EC-CB3E5F6E6919 does not exist in cassandra table
5C9A78F3-2AB1-4162-A720-9D8FA00F3A12 does not exist in cassandra table
A77E5441-1108-4AB6-9DEA-7C1FF3ED9D97 does not exist in cassandra table
0DB73EEB-E905-445E-9F79-96B2CDA85352 does not exist in cassandra table
C7918660-0B23-4DC0-B4F1-D617C69F4A88 does not exist in ca

### Add tokenizer to the text feature by doing the several following things

- Remove the stop words
- Change the word to its original form (lemmatization)
- Add a synonyms map, this is useful as in many cases, `hyperlink` means exactly as `url` and since we are not using large models for extracting these features, these rules can be encoded by ourselves by doing failure analysis

In [303]:
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import json

nlp = spacy.load('en_core_web_sm')
nlp.vocab["not"].is_stop = False

with open('synonyms.json', 'rb') as f:
    synonyms_dict = json.load(f)

class LemmaTokenizer:
    def __init__(self, synonyms_dict):
        self.synonyms_dict = synonyms_dict
    def __call__(self, doc):
        words = []
        for token in nlp(doc):
            if not token.is_stop:
                words.append(self.synonyms_dict.get(token.lemma_, token.lemma_))
                if token.pos_ == 'NUM':
                    words.append('number')
        return [self.synonyms_dict.get(token.lemma_, token.lemma_) for token in nlp(doc) if not token.is_stop]
    
lemma_tokenizer = LemmaTokenizer(synonyms_dict)

def xstr(s):
    return '' if s is None else str(s)

titles = list(filter(lambda x: x is not None, [xstr(v['feature_pool'].title) + ' ' + 
                                               xstr(v['feature_pool'].description)
                                               for v in report_id_to_features.values()]))
tfidf = TfidfVectorizer(tokenizer=lemma_tokenizer)
tfidf.fit(titles)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<__main__.LemmaTokenizer object at 0x7fd3a76800b8>,
                use_idf=True, vocabulary=None)

### Reimplementing the Java feature extrator using Python which already has,

- text ngram feature
- Feature desc feature
- Image hash feature
- If app version major.minor is matched
- If two reports are created in the same minute, or in the same hour
- If two reports are from the same city
- If two reports has the same locale
- If two reports has the same device


### At the same time, I was also adding a few more features

- Diff score for the analytics logs (levenstein distance after deduping along the sequence)
- Diff score for the network logs (levenstein distance after deduping along the sequence)
- If two logs has the same `build_id`
- If two logs has the same `category_id`
- TfIdf feature for title. Before it was vectorized, several preprocessing was done
    - Remove the stop words
    - Lemmatization for words
    - Replacing the synonyms with a user (me!) defined dictionary
    - If numbers are detected, add `number` to the sentence for calculating tfidf
    
    This is used for handling case 
        e.g: 
        ```
        1. Booking two seats on uberpool but only one booked for driver
        2. Wrong Number of Riders on Pool Request
        ```

In [306]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from bert_serving.client import BertClient
from scipy.spatial.distance import hamming

class FeatureExtractor:
    def __init__(self, report_id_to_features):
        self.report_id_to_features = report_id_to_features
        self.features = ['feature_desc', 'title_desc_tfidf', 'image_hash', 'app_version', 'device', 
                         'os_version', 'time_diff', 'city', 'locale', 'analytics_logs', 'network_logs', 'build_id', 
                         'experiments_failure']
        
    def get_pairwise_feature(self, report_id_1, report_id_2):
        feature_pool_1 = self._get_feature_pool(report_id_1)
        feature_pool_2 = self._get_feature_pool(report_id_2)
        
        # bug reports contain features that does not entailed in the feature pool right now, including the logs
        bug_report_1 = self._get_bug_report(report_id_1)
        bug_report_2 = self._get_bug_report(report_id_2)
        
        sims = []
        self._add_feature_desc_feature(feature_pool_1, feature_pool_2, sims)
        self._add_title_desc_tfidf_feature(bug_report_1, bug_report_2, sims)
        self._add_image_hash_feature(feature_pool_1, feature_pool_2, sims)
        self._add_app_version_feature(feature_pool_1, feature_pool_2, sims)
        self._add_device_feature(feature_pool_1, feature_pool_2, sims)
        self._add_os_version_feature(feature_pool_1, feature_pool_2, sims)
        self._add_time_diff_feature(feature_pool_1, feature_pool_2, sims)
        self._add_same_city_feature(feature_pool_1, feature_pool_2, sims)
        self._add_same_locale_feature(feature_pool_1, feature_pool_2, sims)
        self._add_analytics_logs_feature(bug_report_1, bug_report_2, sims)
        self._add_network_logs_feature(bug_report_1, bug_report_2, sims)
        self._add_build_id_feature(bug_report_1, bug_report_2, sims)
        self._add_experiments_feature(bug_report_1, bug_report_2, sims)
        
        return np.array(sims)
        
    def get_text_bert_feature(self, feature_pool):        
        title = feature_pool.title
        if title.strip() == '':
            title = '.'
            
        description = feature_pool.description
        if description.strip() == '':
            description = '.' 
            
        vec = np.concatenate(self.bc.encode([title + ' ' + description]))

        return vec    
    
    def get_bert_title_feature(self, report_id):
        title = feature_pool.title
        if title.strip() == '':
            title = '.'
        
        return np.concatenate(self.bc.encode([title]))
    
    def _get_feature_pool(self, report_id):
        return self.report_id_to_features[report_id.upper()]['feature_pool']
    
    def _get_bug_report(self, report_id):
        return self.report_id_to_features[report_id.upper()]['bug_report']
        
    def _add_image_hash_feature(self, feature_pool_1, feature_pool_2, sims):
        sims = []
        sims.append(hamming(feature_pool_1.screenshot_hash_avg, feature_pool_2.screenshot_hash_avg))
        sims.append(hamming(feature_pool_1.screenshot_hash_diff, feature_pool_2.screenshot_hash_diff))
        sims.append(hamming(feature_pool_1.screenshot_hash_perception, feature_pool_2.screenshot_hash_perception))
        
        return sims
    
    def _add_build_id_feature(self, bug_report_1, bug_report_2, sims):
        def get_build_uuid(bug_report):
            try:
                build_uuid = bug_report_1.meta.app.build_uuid
            except:
                build_uuid = ''
                
            return build_uuid
        
        sims.append(1.0 if get_build_uuid(bug_report_1) == get_build_uuid(bug_report_2) else 0.0)
    
    def _add_app_version_feature(self, feature_pool_1, feature_pool_2, sims):
        new_sims = [0.0, 0.0]
        app_ver_1, app_ver_2 = feature_pool_1.app_version, feature_pool_2.app_version
        if app_ver_1 is not None and app_ver_2 is not None and len(app_ver_1) == len(app_ver_2) == 3:
            if app_ver_1[0] == app_ver_2[0] and app_ver_1[1] == app_ver_2[1]:
                new_sims[0] = 1.0
                if app_ver_1[2] == app_ver_2[2]:
                    new_sims[1] = 1.0

        sims.extend(new_sims)

    def _add_device_feature(self, feature_pool_1, feature_pool_2, sims):
        dev_1, dev_2 = feature_pool_1.device_model, feature_pool_2.device_model
        if dev_1 is None or dev_2 is None:
            if dev_1 is None and dev_2 is None:
                sims.append(1.0)
            else:
                sims.append(0.0)
        else:
            sims.append(1.0 if dev_1 == dev_2 else 0.0)

    def _add_os_version_feature(self, feature_pool_1, feature_pool_2, sims):
        os_1, os_2 = feature_pool_1.os_version, feature_pool_2.os_version
        if (os_1 is not None and os_2 is not None) and len(os_1) == len(os_2) == 2:
            sims.append(1.0 if os_1[0] == os_2[0] and os_1[1] == os_2[1] else 0.0)
        else:
            sims.append(0.0)
                
    def _add_time_diff_feature(self, feature_pool_1, feature_pool_2, sims):        
        t1, t2 = feature_pool_1.created_at, feature_pool_2.created_at
        seconds_diff = abs(t1 - t2).seconds
        # if in the same minute
        sims.append(1.0 if seconds_diff < 60 else 0.0)
        # if in the same hour
        sims.append(1.0 if seconds_diff / 3600.0 < 1 else 0.0)
        
    def _add_text_bert_sim(self, feature_pool_1, feature_pool_2, sims):
        vecs = list(map(self.get_text_bert_feature, [feature_pool_1, feature_pool_2]))
        
        sims.append(self._get_cosine_similarity(vecs[0], vecs[1]))
        
    def _get_cosine_similarity(self, vec_1, vec_2):
        print(vec_1, vec_2)
        return cosine_similarity([vec_1], [vec_2])[0][0]
        
    def _add_text_ngram_feature(self, feature_pool_1, feature_pool_2, sims):
        sims.append(self._jaccard_similarity(self._get_ngram_union(feature_pool_1), \
                                             self._get_ngram_union(feature_pool_2)))
        
    def _add_title_desc_tfidf_feature(self, feature_pool_1, feature_pool_2, sims):
        s1, s2 = map(lambda fp: xstr(fp.title) + ' ' + xstr(fp.description), 
                     [feature_pool_1, feature_pool_2])
        
        tfidf_vectors = tfidf.transform([s1, s2])
        sim = cosine_similarity(tfidf_vectors)[0, 1]
        
        sims.append(sim)
        
    def _add_feature_desc_feature(self, feature_pool_1, feature_pool_2, sims):
        x = self._preprocess_ngram(feature_pool_1.feature_descr_tf_ngram)
        y = self._preprocess_ngram(feature_pool_2.feature_descr_tf_ngram)
        
        sims.append(self._jaccard_similarity(x, y))
        
    def _add_same_city_feature(self, feature_pool_1, feature_pool_2, sims):
        sims.append(1.0 if feature_pool_1.city_id == feature_pool_2.city_id else 0.0)
        
    def _add_same_locale_feature(self, feature_pool_1, feature_pool_2, sims):
        sims.append(1.0 if feature_pool_1.locale == feature_pool_2.locale else 0)
        
    def _add_analytics_logs_feature(self, bug_report_1, bug_report_2, sims):
        """
        This feature is to add editdistance score of analytics logs after deduping each individual log along the sequence,
        the deduping was made such that the order of the sequence will be kept
        
        e.g
        
        aaaabbbccd -> abcd
        """
        # keep the order
        to_unique_log = lambda bug_report: pd.unique([log.name.lower() 
                                                      for log in (bug_report.analytics_logs
                                                     if bug_report.analytics_logs is not None else [])]).tolist()
        unique_logs_1, unique_logs_2 = to_unique_log(bug_report_1), to_unique_log(bug_report_2)
        
        score = 0.0
        if len(unique_logs_1) == 0 and len(unique_logs_2) == 0:
            sims.append(score)
            return
        score = editdistance.eval(unique_logs_1, unique_logs_2) * 1.0 / max(len(unique_logs_1), len(unique_logs_2))

        sims.append(score)

    def _add_network_logs_feature(self, bug_report_1, bug_report_2, sims):
        """
        Same as analytics log, `endpoint_path:status_code` was used for each individual log term
        """
        # keep the order
        to_unique_log = lambda bug_report: pd.unique([log.endpoint_path.lower() if log.endpoint_path else '' 
                                                      + str(log.status_code) if log.status_code else ''
                                                      for log in (bug_report.network_logs
                                                     if bug_report.network_logs is not None else [])]).tolist()
        unique_logs_1, unique_logs_2 = to_unique_log(bug_report_1), to_unique_log(bug_report_2)

        score = 0.0
        if len(unique_logs_1) == 0 and len(unique_logs_2) == 0:
            sims.append(score)
            return
        score = editdistance.eval(unique_logs_1, unique_logs_2) * 1.0 / max(len(unique_logs_1), len(unique_logs_2))

        sims.append(score)
        
    def _add_experiments_feature(self, bug_report_1, bug_report_2, sims):
        """
        This feature was to identify the percentage of similarity in experiments between every two logs
        
        """
        def _get_experiments(bug_report):
            exp_dict = {}
            experiments = [] if bug_report.experiments is None else bug_report.experiments
            for exp in experiments:
                exp_dict[exp.name] = exp.group
            
            return exp_dict
        
        exp_dict_1 = _get_experiments(bug_report_1)
        exp_dict_2 = _get_experiments(bug_report_2)
        
        same_count = 0
        common_exps = set(exp_dict_1.keys()).intersection(exp_dict_2.keys())
        
        if len(common_exps) == 0:
            sims.append(1.0)
            return
        
        for exp_name in common_exps:
            same_count += (1.0 if exp_dict_1[exp_name] == exp_dict_2[exp_name] else 0.0)
                        
        sims.append(same_count / len(common_exps))

    def _get_ngram_union(self, feature_pool):
        """
        DEPRECATING!
        
        Get the bag of union words, used by the previous model, maybe replaced with tfidf vector or language model 
        in the future
        """
        
        text_tf_ngram = self._preprocess_ngram(feature_pool.text_tf_ngram)
        title_tf_ngram = self._preprocess_ngram(feature_pool.title_tf_ngram)
        
        bag_of_words = set()
        for ngram in [text_tf_ngram, title_tf_ngram]:
            bag_of_words = bag_of_words.union(set(ngram))
            
        bag_of_words = [word.lower() for word in bag_of_words]
        
        return bag_of_words
    
    def _preprocess_ngram(self, ngram):
        ngram = [gram for gram in (ngram if ngram is not None else [])]
        
        return ngram
    
    def _jaccard_similarity(self, x, y):
        """
        Jaccard Similarity J (A,B) = | Intersection (A,B) | /
                                        | Union (A,B) |
        """
        if len(x) == 0 and len(y) == 0:
            return 1.0
        elif len(x) == 0 or len(y) == 0:
            return 0.0
        intersection_cardinality = len(set(x).intersection(set(y)))
        union_cardinality = len(set(x).union(set(y)))
        return intersection_cardinality / float(union_cardinality)

feature_extractor = FeatureExtractor(report_id_to_features)

In [114]:
a = feature_extractor._get_bug_report('D0A33AD5-6D1D-4BA1-A325-9C71227B152D')
b = feature_extractor._get_bug_report('D159F4F9-DD98-4593-832F-2856D167288B')

In [109]:
b = feature_extractor._get_feature_pool('D159F4F9-DD98-4593-832F-2856D167288B')

# Preparing the training data from the feature extractor

- All dup pairs have label 1, while all nondup random pairs have label 0
- They are shuffled after these features are generated independenly for each `task:year:app` group

In [307]:
from sklearn.utils import shuffle
import numpy as np


def get_training_data_from_feature_extractor(elk_result):
    X_all, y_all = [], []
    training_data = dict()
    for task in elk_result.keys():
        training_data[task] = dict()
        for year in elk_result[task].keys():
            training_data[task][year] = dict()
            for app in elk_result[task][year].keys():

                training_data[task][year][app] = dict()

                X_positive_curr, X_negative_curr = [], []

                dup_pairs = elk_result[task][year][app]['dup_pairs']
                random_pairs = elk_result[task][year][app]['random_pairs']

                for pair in dup_pairs:
                    X_positive_curr.append(feature_extractor.get_pairwise_feature(pair[0], pair[1]))
                for pair in random_pairs:
                    X_negative_curr.append(feature_extractor.get_pairwise_feature(pair[0], pair[1]))

                X_curr = np.array(X_positive_curr + X_negative_curr)
                y_curr = np.concatenate([np.ones(len(X_positive_curr)), np.zeros(len(X_negative_curr))])

                X_curr, y_curr, shuffled_pairs = shuffle(X_curr, y_curr, dup_pairs + random_pairs)
                print(task, year, app, X_curr.shape)

                training_data[task][year][app]['X'] = X_curr
                training_data[task][year][app]['y'] = y_curr
                training_data[task][year][app]['shuffled_pairs'] = shuffled_pairs

                X_all.append(X_curr)
                y_all.append(y_curr)

    X_all = np.concatenate(X_all)
    y_all = np.concatenate(y_all)
    
    return X_all, y_all, training_data

X_all, y_all, training_data = get_training_data_from_feature_extractor(elk_result)

snap 2018 rider (9548, 14)
snap 2018 driver (93, 14)
snap 2019 rider (930, 14)
snap 2019 driver (31, 14)
wisdom 2019 rider (13888, 14)
wisdom 2019 driver (31, 14)
wisdom 2019 eats (744, 14)


In [140]:
s1 = set(frozenset(pair) for pair in elk_result['wisdom']['2019']['rider']['dup_pairs'])

In [141]:
s2 = set(frozenset(pair) for pair in elk_result['wisdom']['2019']['rider']['random_pairs'])

In [146]:
frozenset({'B6F3B553-2463-45A2-B5A5-1E54B035FC1D','704D3E76-3E83-4F3C-A393-60A6A0EA0FD2'
            }) in s2

True

In [150]:
frozenset({'438AEE6E-E9FD-4972-B69B-2F92D92C31C3',
 'A1AB750C-5255-4A3D-A469-031CE89AEB76'}) in s1

False

### Splitting the training_data into snap tickets and wisdom tickets

In [308]:
# split wisdom and snaptask data
def split_training_data_by_task(training_data):
    X_snap, y_snap, snap_shuffled_pairs, X_wisdom_dict, y_wisdom_dict, wisdom_shuffled_pairs_dict = [], [], [], {}, {}, {}
    for task in training_data.keys():
        for year in training_data[task].keys():
            for app in training_data[task][year].keys():
                X = training_data[task][year][app]['X']
                y = training_data[task][year][app]['y']
                shuffled_pairs = training_data[task][year][app]['shuffled_pairs']
                if task == 'snap':
                    X_snap.append(X)
                    y_snap.append(y)
                    snap_shuffled_pairs.append(shuffled_pairs)
                else:
                    X_wisdom_dict[app] = X
                    y_wisdom_dict[app] = y
                    wisdom_shuffled_pairs_dict[app] = shuffled_pairs
    
    X_snap, y_snap, snap_shuffled_pairs = list(map(np.concatenate, [X_snap, y_snap, snap_shuffled_pairs]))
    
    return X_snap, y_snap, snap_shuffled_pairs, X_wisdom_dict, y_wisdom_dict, wisdom_shuffled_pairs_dict

X_snap, y_snap, snap_shuffled_pairs, X_wisdom_dict, y_wisdom_dict, shuffled_pairs_dict = split_training_data_by_task(training_data)


In [None]:
np.unique(X_all[:, -1])

In [None]:
a = feature_extractor._get_feature_pool('D0A33AD5-6D1D-4BA1-A325-9C71227B152D')
b = feature_extractor._get_feature_pool('D159F4F9-DD98-4593-832F-2856D167288B')

In [None]:
report_id = 'a69e70ce-e850-48d1-9e84-ccb85067a612'.upper()

print(wisdom_cassandra_client.get_bug_report(report_id))
print(wisdom_cassandra_client.get_report_feature_pool(report_id))

In [322]:
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn import svm
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from IPython.display import display
import random
import xgboost as xgb
from skopt import BayesSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import pandas as pd
from sklearn.model_selection import StratifiedKFold


# SETTINGS - CHANGE THESE TO GET SOMETHING MEANINGFUL
ITERATIONS = 10 # 1000
TRAINING_SIZE = 100000 # 20000000
TEST_SIZE = 25000


random.seed(7)
np.random.seed(7)

FOLDS = 5
PARAM_COMB = 5

skf = StratifiedKFold(n_splits=FOLDS, shuffle = True, random_state = 1001)
# A parameter grid for XGBoost
params = {
    'min_child_weight': [5, 10],
    'gamma': [3, 5],
    'subsample': [0.1],
    'colsample_bytree': [0.6, 0.2],
    'max_depth': [3, 4, 5],
    'scale_pos_weight': [3.7]
}

model = RandomizedSearchCV(
    xgb.XGBClassifier(learning_rate=0.02, 
                      n_estimators=600, 
                      objective='binary:logistic',
                      silent=True,
                      nthread=1), 
    param_distributions=params,
    n_iter=50,
    scoring='f1_weighted',
    n_jobs=4,
    verbose=3)


"""
!!!
"""
X_train_wisdom_rider, X_test_wisdom_rider, y_train_wisdom_rider, y_test_wisdom_rider, _, shuffled_pairs_test_wisdom_rider = train_test_split(X_wisdom_dict['rider'], y_wisdom_dict['rider'], 
                                                                            shuffled_pairs_dict['rider'], 
                                                                            test_size=0.2, random_state=1)

X_train = np.concatenate([X_snap, X_train_wisdom_rider])
y_train = np.concatenate([y_snap, y_train_wisdom_rider])

X_train, y_train = X_train_wisdom_rider, y_train_wisdom_rider

model.fit(X_train, y_train)

failure_analysis = dict()
for app in X_wisdom_dict.keys():
    if app == 'rider':
        X_test = X_test_wisdom_rider
        y_test = y_test_wisdom_rider
        shuffled_pairs_test = shuffled_pairs_test_wisdom_rider
    else:
        X_test = X_wisdom_dict[app]
        y_test = y_wisdom_dict[app]
        shuffled_pairs_test = shuffled_pairs_dict[app]
                
    y_pred = model.predict(X_test)
    
    print("Test set score for {}: {:.2f}".format(app, roc_auc_score(y_test, y_pred)))
    con_matrix = confusion_matrix(y_test, y_pred)

    fp_indices = np.where((y_test == 0) & (y_pred == 1))[0]
    fn_indices = np.where((y_test == 1) & (y_pred == 0))[0]

    tn, fp, fn, tp = con_matrix.ravel()
    print('confusion matrix', con_matrix)
    print('tn: {}, fp: {}, fn: {}, tp: {}, precision: {}, recall: {}'.format(tn, fp, fn, tp, tp*1.0/(tp+fp), tp*1.0/(tp+fn)))
    
    
    
    def get_titles(shuffled_pairs_test, indices):
        titles = []
        for index in indices:
            bug_reports = [feature_extractor._get_bug_report(report_id) for report_id in shuffled_pairs_test[index]]
            titles.append(list(map(lambda b: b.report_uuid.lower(), bug_reports)))
            titles.append(list(map(lambda b: b.title, bug_reports)))
            titles.append(list(map(lambda b: b.description, bug_reports)))
        return pd.DataFrame(titles)

    fn_titles = get_titles(shuffled_pairs_test, fn_indices)
    fp_titles = get_titles(shuffled_pairs_test, fp_indices)
    
    failure_analysis[app] = {'fn_titles': fn_titles, 'fp_titles': fp_titles}

    print("Best parameters: {}".format(model.best_params_))
    print("Best cross-validation score: {:.2f}".format(model.best_score_))
    results = pd.DataFrame(model.cv_results_)
    display(results.mean_test_score)

    
# rebuild a model on the train
best_model = xgb.XGBClassifier(**model.best_params_)
best_model.fit(X_train, y_train)




[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   13.7s finished


Test set score for rider: 0.81
confusion matrix [[2673   14]
 [  35   56]]
tn: 2673, fp: 14, fn: 35, tp: 56, precision: 0.8, recall: 0.6153846153846154
Best parameters: {'subsample': 0.1, 'scale_pos_weight': 3.7, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 3, 'colsample_bytree': 0.2}
Best cross-validation score: 0.98


0    0.975831
1    0.975832
2    0.975871
3    0.977567
4    0.977844
5    0.977646
6    0.975943
7    0.978590
8    0.978299
9    0.977534
Name: mean_test_score, dtype: float64

Test set score for driver: 0.97
confusion matrix [[28  2]
 [ 0  1]]
tn: 28, fp: 2, fn: 0, tp: 1, precision: 0.3333333333333333, recall: 1.0
Best parameters: {'subsample': 0.1, 'scale_pos_weight': 3.7, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 3, 'colsample_bytree': 0.2}
Best cross-validation score: 0.98


0    0.975831
1    0.975832
2    0.975871
3    0.977567
4    0.977844
5    0.977646
6    0.975943
7    0.978590
8    0.978299
9    0.977534
Name: mean_test_score, dtype: float64

Test set score for eats: 0.86
confusion matrix [[672  48]
 [  5  19]]
tn: 672, fp: 48, fn: 5, tp: 19, precision: 0.2835820895522388, recall: 0.7916666666666666
Best parameters: {'subsample': 0.1, 'scale_pos_weight': 3.7, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 3, 'colsample_bytree': 0.2}
Best cross-validation score: 0.98


0    0.975831
1    0.975832
2    0.975871
3    0.977567
4    0.977844
5    0.977646
6    0.975943
7    0.978590
8    0.978299
9    0.977534
Name: mean_test_score, dtype: float64

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.2, gamma=3,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=3.7, seed=None,
              silent=None, subsample=0.1, verbosity=1)

In [323]:
from chart_studio.plotly import plot, iplot
import plotly.graph_objs as go
from plotly.subplots import make_subplots

importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]

fig = make_subplots(
    rows=2, cols=1,
    row_heights=[0.67, 0.33],
    start_cell="top-left")

fig.add_bar(x=feature_extractor.features, y=importances, row=1, col=1)
fig.show()