In [41]:
from pprint import pprint as _print
from datetime import datetime
import pandas as pd
from tqdm import tqdm
from joblib import dump

In [3]:
def get_db(db_name, host='localhost:27017'):
    """Return a handle to the MongoDB database called ``db_name``.

    Parameters
    ----------
    db_name : str
        Name of the database to select on the server.
    host : str, optional
        Address handed to ``MongoClient``. Defaults to ``'localhost:27017'``,
        the address this notebook originally hard-coded, so existing calls
        behave exactly as before.

    Returns
    -------
    The object obtained by indexing the client with ``db_name``.
    """
    # Kept as a function-local import, as in the original cell, so pymongo is
    # only required once a connection is actually requested.
    from pymongo import MongoClient

    client = MongoClient(host)
    return client[db_name]

In [4]:
def show(db, collection, pipeline):
    """Run an aggregation pipeline and return all resulting documents as a list.

    ``db[collection].aggregate(pipeline)`` is iterated to exhaustion, so the
    whole result set is materialised in memory.
    """
    cursor = db[collection].aggregate(pipeline)
    return list(cursor)

In [5]:
# Handle to the 'bugDup' database; every query below goes through it.
db = get_db('bugDup')

## Number of bugs with duplicates

In [6]:
# Aggregation stage: keep only documents whose `dup_id` differs from the empty list.
match = {
    "$match": {
        # Filter applied to each document entering the pipeline
        "dup_id": {
            "$ne": [] # dup_id is not the empty list (NOT a null check); presumably means at least one duplicate is recorded
        }
    }
}

# Aggregation stage: count the documents that reach it; the count is reported
# under the output key "dup_id".
# NOTE(review): MongoDB's $ne also matches documents that lack the field
# entirely - confirm every document carries `dup_id`.
count = {
    "$count": "dup_id"
}

In [7]:
# Run the match + count pipeline against the 'mozilla' collection.
n = show(db, 'mozilla', [match, count])

In [8]:
n

[{'dup_id': 176725}]

## Total number of entries

In [9]:
# Number of documents in the 'mozilla' collection (empty filter = count everything);
# reused below as the progress-bar total.
total = db.mozilla.count_documents({})

In [10]:
# Module-level bookkeeping filled in by preprocess():
processed_bug_ids = set()  # every bug_id already accepted, used to skip repeats
duplicated_bugs = set()    # bug_ids whose dup_id list is non-empty
normal_bugs = set()        # bug_ids whose dup_id list is empty

In [19]:
def preprocess(d, cols, date_lower=None, date_upper=None):
    """Validate one raw bug document and return its cleaned fields.

    Parameters
    ----------
    d : mapping
        Raw bug document. Reads ``creation_ts``, ``description``,
        ``short_desc``, ``bug_id``, ``dup_id`` and every key named in ``cols``.
    cols : iterable of str
        Keys to copy from ``d`` into the result.
    date_lower, date_upper : str, optional
        Inclusive ``'%Y-%m-%d'`` bounds on the creation timestamp. Each bound
        is applied independently, so passing only one of them filters too.
        Bounds are parsed as midnight, so a document created later in the
        day of ``date_upper`` falls outside the range.

    Returns
    -------
    dict or None
        ``{'date': datetime, <col>: value, ...}`` for an accepted document:
        ``bug_id`` is cast to ``int``, an empty ``dup_id`` becomes ``None``,
        and the text columns are stripped and lower-cased. ``None`` when the
        document is outside the date range, has an empty description or
        short description, was already processed, or lacks an expected
        key/attribute (the document and the error are printed in that case).

    Side effects
    ------------
    Adds the bug id to the module-level ``processed_bug_ids`` and to either
    ``normal_bugs`` (empty ``dup_id``) or ``duplicated_bugs``.
    """
    # Columns that are normalised with strip() + lower().
    text_cols = ('short_desc', 'priority',
                 'component', 'bug_status',
                 'resolution', 'description')
    try:
        # Parse the timestamp after dropping its last six characters
        # (the timezone suffix, which strptime is not asked to handle here).
        date = datetime.strptime(d['creation_ts'][:-6],
                                 '%Y-%m-%d %H:%M:%S')

        # Apply each bound on its own; the original only filtered when BOTH
        # bounds were supplied and silently ignored a single bound.
        if date_lower and date < datetime.strptime(date_lower, '%Y-%m-%d'):
            return None
        if date_upper and date > datetime.strptime(date_upper, '%Y-%m-%d'):
            return None

        # Reject documents with a missing/empty description or short description.
        if not d.get('description', False):
            return None
        if len(d['description'].strip()) == 0 or len(d['short_desc']) == 0:
            return None

        # Don't process the same bug_id multiple times.
        bug_id = int(d['bug_id'])
        if bug_id in processed_bug_ids:
            return None
        processed_bug_ids.add(bug_id)

        # Record whether this bug has duplicates.
        has_no_dups = len(d['dup_id']) == 0
        if has_no_dups:
            normal_bugs.add(bug_id)
        else:
            duplicated_bugs.add(bug_id)

        # Build the cleaned record. Exactly one branch assigns each column;
        # previously a trailing `else` overwrote the int cast of bug_id and
        # the None normalisation of dup_id with the raw values.
        cleaned_data = {'date': date}
        for col in cols:
            if col == 'bug_id':
                cleaned_data[col] = bug_id
            elif col == 'dup_id':
                cleaned_data[col] = None if has_no_dups else d[col]
            elif col in text_cols:
                cleaned_data[col] = d[col].strip().lower()
            else:
                cleaned_data[col] = d[col]
        return cleaned_data
    except KeyError as e1:
        # An expected key is missing: show the offending document and skip it.
        _print(d)
        print('Keyerror', e1)
        return
    except AttributeError as e2:
        # A value that should be a string is not (e.g. None): show and skip.
        _print(d)
        print('AttributeError', e2)
        return

In [20]:
# Columns copied from each MongoDB document into its cleaned record.
# ('resolution' used to be listed twice; the repeat only redid the same work.)
cols = ['bug_id', 'description', 'bug_severity',
        'dup_id', 'short_desc', 'priority', 'component',
        'resolution', 'bug_status']

# preprocess() remembers every bug id it has accepted in these module-level
# sets. Reset them so re-running this cell starts from a clean slate instead
# of rejecting every document as "already processed".
processed_bug_ids.clear()
duplicated_bugs.clear()
normal_bugs.clear()

# One entry per document; rejected documents are stored as None and are
# filtered out when the DataFrame is built.
records = []
for doc in tqdm(db.mozilla.find({}), total=total):
    records.append(preprocess(doc, cols))

100%|██████████| 768335/768335 [00:26<00:00, 29095.84it/s]


In [34]:
# Build the frame from the accepted documents only (rejected ones are None).
finalDF = pd.DataFrame.from_records(
    [rec for rec in records if rec is not None]
)

In [36]:
# Cast only the dup_id column to strings. The previous right-hand side,
# `finalDF.astype(str)`, was the ENTIRE frame, which cannot be stored in a
# single column (recent pandas versions reject it with a ValueError).
# Presumably the cast exists so the parquet writer sees one uniform type - TODO confirm.
finalDF['dup_id'] = finalDF['dup_id'].astype(str)

In [37]:
# Write the cleaned frame to a gzip-compressed parquet file (path is relative
# to the current working directory).
finalDF.to_parquet('cleanedData.parquet.gzip', compression='gzip')

In [42]:
# Serialise the three bug-id sets with joblib, one file per set, into the
# current working directory. The cell displays the return value of the last call.
dump(processed_bug_ids, 'processed_bug_ids')
dump(duplicated_bugs, 'duplicated_bugs')
dump(normal_bugs, 'normal_bugs')

['normal_bugs']

In [43]:
!ls

 cleanedData.parquet.gzip   normal_bugs		     processed_bug_ids
 duplicated_bugs	   'Preprocess Data.ipynb'
