In [1]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
import math
import random
import seaborn as sns
import os
import collections
import smart_open
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Let DataFrame displays show up to 200 characters per column so the long
# bug-report text is not truncated too aggressively.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Newer Bugzilla export (2019-10-02). It is bound to `newmoz` only: the
# previous chained assignment also bound `mozilla` to this file, which the
# next cell immediately overwrote with a different CSV.
# The path is resolved relative to the current user's home directory instead
# of a hard-coded absolute /Users/... path.
newmoz = pd.read_csv(
    os.path.join(os.path.expanduser('~'), 'Downloads', 'bugs-2019-10-02.csv')
)

In [3]:
# Load the labelled Firefox bug reports from the user's Downloads folder
# (resolved via the home directory rather than a hard-coded absolute path).
mozilla_raw = pd.read_csv(
    os.path.join(os.path.expanduser('~'), 'Downloads', 'mozilla_firefox.csv')
)

# Drop reports whose priority is the '--' placeholder.
mozilla_raw = mozilla_raw[~mozilla_raw['Priority'].isin(['--'])]
priority = mozilla_raw['Priority']

# Keep the Title..Description columns and build the combined Text column.
# `.assign` returns a new frame, so no column is written into a slice/view.
mozilla = (
    mozilla_raw.loc[:, 'Title':'Description']
    .assign(Text=lambda frame: frame['Title'] + ' ' + frame['Description'])
)

# NOTE: this only removes rows whose Text is literally the string 'nan'.
# A missing Title/Description yields a float NaN Text, which is kept.
mozilla = mozilla[~mozilla['Text'].isin(['nan'])]

# Left-join the labels on the index so rows removed above are NOT
# re-introduced (an outer pd.concat(axis=1) would bring them back as NaN rows).
mozilla = mozilla.join(priority, how='left')

In [4]:
# Preview the first rows of the cleaned frame.
mozilla.head()

Unnamed: 0,Title,Description,Text,Priority
0,Dialup properties needs to be exposed in prefs,The dialup properties of the profile should be exposed in the prefs panels so; the user has an easy way to modify them. The only other alternative would be to; make people go to the profile manag...,Dialup properties needs to be exposed in prefs The dialup properties of the profile should be exposed in the prefs panels so; the user has an easy way to modify them. The only other alternative w...,P3
3,Language encodings in font prefs dialog not sorted,Language encodings are listed in a seemingly random order.; The order be alphabetical (and therefore change with localization).; As a special case; User-Defined should be last.,Language encodings in font prefs dialog not sorted Language encodings are listed in a seemingly random order.; The order be alphabetical (and therefore change with localization).; As a special cas...,P3
4,Synaptics touchpad scrolling not working,From Bugzilla Helper:; User-Agent: Mozilla/5.0 (Windows; U; Win95; en-US; m18) Gecko/20001010; BuildID: 2000101014; ; I am using a synaptics touch pad with the latest Win 98 driver V 5.0.86 AUg...,Synaptics touchpad scrolling not working From Bugzilla Helper:; User-Agent: Mozilla/5.0 (Windows; U; Win95; en-US; m18) Gecko/20001010; BuildID: 2000101014; ; I am using a synaptics touch pad w...,P3
6,Cookie Manager: Dont allow sites that set removed cookies to set future cookies should stay checked/unchecked,From Bugzilla Helper:; User-Agent: Mozilla/4.0 (compatible; MSIE 5.5; Windows 95); BuildID: 2001040404; ; The state of the checkbox Dont allow removed cookies to be accepted later in ; the cook...,Cookie Manager: Dont allow sites that set removed cookies to set future cookies should stay checked/unchecked From Bugzilla Helper:; User-Agent: Mozilla/4.0 (compatible; MSIE 5.5; Windows 95); Bu...,P3
7,Today History folder should be expanded by default,Build: 2001052115; Mac OS 9.1; ; To reproduce:; * Open the History window.; * Look at the `Today folder.; ; What you should see:; * The folder should be expanded.; ; What you actually see:; ...,Today History folder should be expanded by default Build: 2001052115; Mac OS 9.1; ; To reproduce:; * Open the History window.; * Look at the `Today folder.; ; What you should see:; * The fol...,P4


In [5]:
# Check which columns remain after the reshaping above.
mozilla.columns

Index(['Title', 'Description', 'Text', 'Priority'], dtype='object')

In [6]:
# Small English pipeline; the parser and NER components are disabled because
# only lemmas are read from the resulting tokens.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize(word):
    """Return the lemma of the first spaCy token found in ``word``.

    Parameters
    ----------
    word : str
        Text to lemmatise. Only the first token produced by ``nlp`` is used,
        so multi-token input is effectively truncated to its first token.

    Returns
    -------
    str
        The first token's lemma, or ``word`` unchanged when the pipeline
        produces no tokens (e.g. for an empty string), instead of raising
        ``IndexError``.
    """
    doc = nlp(word)
    if len(doc) == 0:
        return word
    return doc[0].lemma_

In [7]:
# Sanity check of the lemmatiser on an irregular verb form.
lemmatize('am')

'be'

In [8]:
def make_corpus(df, max_docs=10000):
    """Turn the ``Text`` column of ``df`` into lists of lemmatised tokens.

    Each document is split on whitespace. Tokens of length <= 2 are dropped.
    A single non-alphanumeric character wrapped around a run of word
    characters is stripped by the regexes below. The token is then kept only
    if it is purely alphabetic and its lower-cased form is not a stopword.
    Kept tokens are lower-cased and passed through ``lemmatize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Text`` column. Rows whose Text is the literal string
        ``'nan'`` are removed. Float NaN values are kept and become the token
        ``'nan'`` via ``str()``.
    max_docs : int or None, optional
        Process at most this many documents (default 10000, the limit that
        used to be hard-coded). ``None`` processes every document.

    Returns
    -------
    list[list[str]]
        One token list per processed document, in input order.
    """
    df = df[~df['Text'].isin(['nan'])]
    raw_text = list(df['Text'])

    # Echo the first document as a quick visual check; skipped for an empty
    # frame, where indexing would raise IndexError.
    if raw_text:
        print(raw_text[0])

    stopwords = {
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
        'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
        'with', 'about', 'against', 'between', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
        'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
        't', 'can', 'will', 'just', 'don', 'should', 'now', ',', ';', ';;',
        ':;',
    }

    # Raw strings: '\w', '\s' and '\g' are not valid Python string escapes and
    # trigger invalid-escape warnings when written in plain literals.
    pattern1 = re.compile(r'[^A-Za-z0-9]([\w+\s]+)[^A-Za-z0-9]')  # wrapped on both sides
    pattern2 = re.compile(r'([\w+\s]+)[^A-Za-z0-9]')              # trailing character
    pattern3 = re.compile(r'[^A-Za-z0-9]([\w+\s]+)')              # leading character

    # lemmatize() runs the spaCy pipeline once per call; remember each distinct
    # word's lemma so repeated words are only processed once.
    lemma_cache = {}
    sentence_list = []

    for sentence in raw_text[:max_docs]:
        word_list = []
        # str() also converts missing values (float NaN) to the token 'nan',
        # which is deliberately kept because it correlates with the label.
        for word in str(sentence).split():
            if len(word) <= 2:
                continue
            # Only the first pattern that matches at the start of the token is
            # applied, exactly one of the three.
            if pattern1.match(word):
                word = pattern1.sub(r'\g<1>', word)
            elif pattern2.match(word):
                word = pattern2.sub(r'\g<1>', word)
            elif pattern3.match(word):
                word = pattern3.sub(r'\g<1>', word)

            lowered = word.lower()
            if lowered in stopwords or not word.isalpha():
                continue
            if lowered not in lemma_cache:
                lemma_cache[lowered] = lemmatize(lowered)
            word_list.append(lemma_cache[lowered])
        sentence_list.append(word_list)

    return sentence_list

In [9]:
def get_tf_idf_words(list):
    """Weight every word of every document by its corpus-wide frequency.

    Despite the name this is not a TF-IDF score: a word's weight is its total
    count across all documents divided by the number of distinct words in the
    corpus.

    Parameters
    ----------
    list : iterable of token lists, one per document.

    Returns
    -------
    A list with one ``{word: weight}`` dict per document.
    """
    frequency = {}
    for tokens in list:
        for token in tokens:
            frequency[token] = frequency.get(token, 0) + 1

    # Each distinct word is one key, so this is the vocabulary size.
    vocabulary_size = len(frequency)
    weights = {token: total / vocabulary_size for token, total in frequency.items()}

    return [{token: weights[token] for token in tokens} for tokens in list]

In [10]:
def filter_tfidf(list, float):
    """Keep only the words whose weight reaches a threshold.

    Parameters
    ----------
    list : iterable of token lists, forwarded to ``get_tf_idf_words``.
    float : minimum weight (inclusive) a word needs to be kept.

    Returns
    -------
    A list with one ``{word: weight}`` dict per document, restricted to the
    words whose weight is ``>= float``.
    """
    return [
        {word: weight for word, weight in weighted_doc.items() if weight >= float}
        for weighted_doc in get_tf_idf_words(list)
    ]

In [11]:
def unpack_corpus(list):
    """Materialise every document of ``list`` as a plain Python list.

    Iterating a dict yields its keys, so a ``{word: weight}`` document becomes
    the list of its words.
    """
    return [[*document] for document in list]

In [12]:
def get_joined(list):
    """Join each document's tokens into one space-separated string."""
    return [' '.join(tokens) for tokens in list]

In [13]:
# One DataFrame per priority level, selected by exact match on 'Priority'.
P1, P2, P3, P4, P5 = (
    mozilla.loc[mozilla['Priority'] == level]
    for level in ('P1', 'P2', 'P3', 'P4', 'P5')
)

In [14]:
# Tokenise the first 150 reports of every priority level, in P1..P5 order.
P1_corpus, P2_corpus, P3_corpus, P4_corpus, P5_corpus = (
    make_corpus(frame[:150]) for frame in (P1, P2, P3, P4, P5)
)

Starting Customizing Toolbar makes the current page go blank [Build-ID: 2002-08-30-22]; ; As soon as I start Customize Toolbar; either by right-clicking on the toolbar or; by selecting it under the View menu; the current page goes blank (white).; ; Reproducable: always; ; Steps to reproduce:; ; 1) Go to a URL; 2) Open Customize Toolbar; ; Result: current page goes blank.
[FIX]Not correctly retrieving post data when saving a page or frame generated from a form POST From Bugzilla Helper:; User-Agent: Mozilla/5.0 (X11; U; Linux 2.4.5-1mdk i586; en-US; rv:0.9+); Gecko/20010604; BuildID:    2001060421; ; Mozilla does not save the respond to a posted form correctly.; Instead of saving the posted reply; it saves the form.; ; Reproducible: Always; Steps to Reproduce:; 1.go to the referenced form; 2.type in a URL in the big text box (say http://www.mozilla.org); 3.push dumplinks; 4.The form correct displays the page source; 5. Now try saving the output; 6. You get the form itself and not the re

In [15]:
# Concatenate the per-priority corpora (at most 150 documents each) in
# P1..P5 order. From here on `mozilla` is a list of token lists, no longer
# the DataFrame loaded earlier.
mozilla = [
    document
    for class_corpus in (P1_corpus, P2_corpus, P3_corpus, P4_corpus, P5_corpus)
    for document in class_corpus[:150]
]

In [16]:
# Turn every token list back into one space-separated string per document.
joined_moz = get_joined(mozilla)

In [17]:
labels = ['P1', 'P2', 'P3', 'P4', 'P5']

# Emit one label per document actually present in each class corpus, using the
# same [:150] slices and P1..P5 order as the concatenated corpus. A hard-coded
# 150 labels per class would make the label list longer than the document list
# whenever a class has fewer than 150 reports.
class_corpora = [
    P1_corpus[:150],
    P2_corpus[:150],
    P3_corpus[:150],
    P4_corpus[:150],
    P5_corpus[:150],
]
keys = [
    label
    for label, class_corpus in zip(labels, class_corpora)
    for _ in class_corpus
]

In [18]:
# Features are the space-joined documents; targets are the priority labels
# built in the same P1..P5 order.
X = joined_moz
y = keys

In [19]:
# Hold out 20% of the documents for testing. A fixed random_state makes the
# split (and everything derived from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [20]:
# Load the ELMo v2 module from TF Hub.
# NOTE(review): trainable=True is requested, but the visible cells only run
# the module for feature extraction. Presumably not needed; confirm.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [21]:
# Smoke test: embed a single sample sentence.
x = ["Roasted ants are a popular snack in Columbia"]

# Build the ELMo output tensor for that sentence ("elmo" entry of the
# module's output dict).
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

# Static tensor shape only; no session is run in this cell.
embeddings.shape

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

In [22]:
def elmo_vectors(x):
    """Return the ELMo output for ``x`` averaged over axis 1.

    ``x`` is passed unchanged to the module-level ``elmo`` hub module. Its
    "elmo" output is mean-reduced over axis 1 (presumably the token axis,
    judging by the (1, 8, 1024) shape shown for the 8-word sample sentence;
    TODO confirm) and evaluated in a newly opened ``tf.Session``.

    Returns the evaluated result. If a ``ValueError`` is raised while running
    the session, 'End of dataset' is printed and the function falls through,
    returning ``None`` implicitly.
    """
    embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

    
    with tf.Session() as sess:
        try:
            # A new session is opened on every call, so variables and lookup
            # tables are initialised again each time.
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            # return average of ELMo features
            return sess.run(tf.reduce_mean(embeddings,1))
        except ValueError:
            # NOTE(review): every ValueError is reported as 'End of dataset'
            # and the caller receives None; verify callers can cope with that.
            print('End of dataset')
            # NOTE(review): looks redundant, since the enclosing `with` block
            # presumably closes the session on exit. Confirm.
            sess.close()

In [23]:
# Chunk the training documents into consecutive batches of 10 (the last batch
# may be shorter), presumably to keep each ELMo call small. TODO confirm.
list_train = [X_train[i:i+10] for i in range(0,len(X_train),10)]

In [24]:
# Chunk the test documents into consecutive batches of 10 (the last batch may
# be shorter), mirroring the training batches.
list_test = [X_test[i:i+10] for i in range(0,len(X_test),10)]

In [None]:
# Extract ELMo embeddings for every training batch. Each elmo_vectors call
# opens its own tf.Session, so there is one session per batch.
elmo_train = [elmo_vectors(x) for x in list_train]

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
# (size of the first batch result, number of batch results)
len(elmo_train[0]), len(elmo_train)

In [None]:
# Extract ELMo embeddings for every test batch, same procedure as for training.
elmo_test = [elmo_vectors(x) for x in list_test]

In [None]:
# Number of test batch results.
len(elmo_test)

In [None]:
# Stack the per-batch results along axis 0 into a single array.
# NOTE(review): elmo_vectors returns None when it catches a ValueError; a None
# entry here would presumably make the concatenation fail. Confirm.
elmo_train_new = np.concatenate(elmo_train, axis = 0)

In [None]:
# Stack the per-batch test results along axis 0 into a single array.
elmo_test_new = np.concatenate(elmo_test, axis = 0)

In [None]:
# Save elmo_train_new. The context manager closes the file even if
# pickle.dump raises, which a manual open()/close() pair does not guarantee.
with open("elmo_train_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

In [None]:
# Save elmo_test_new. The context manager closes the file even if
# pickle.dump raises, which a manual open()/close() pair does not guarantee.
with open("elmo_test_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)