In [1]:

f = open("/home/joe0400/Downloads/JDT_Bugs_sm.csv")

# clean function
import html
import re
def clean(text):
    # convert html escapes like &amp; to characters.
    text = html.unescape(text) 
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

#textaciy normalization funciton
import textacy
import textacy.preprocessing as tprep
from sklearn.preprocessing import normalize as normalize_vector


if textacy.__version__ < '0.11':
    def normalize(text):
        text = tprep.normalize_hyphenated_words(text)
        text = tprep.normalize_quotation_marks(text)
        text = tprep.normalize_unicode(text)
        text = tprep.remove_accents(text)
        return text
else:
    # adjusted to textacy 0.11. Note, function names are changed
    def normalize(text):
        text = tprep.normalize.hyphenated_words(text)
        text = tprep.normalize.quotation_marks(text)
        text = tprep.normalize.unicode(text)
        text = tprep.remove.accents(text)
        return text

#spacy lemmatization using the snippet
import spacy
from nltk.corpus import wordnet
#wanted word function
def wanted_word(token):
    return wordnet.synsets(str(token))
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler","ner"])
#modified so that it returms the doc and lemmas for the dataframe
def process_using_spacy(text):
    doc = nlp(text) 
    lemmas = " ".join(token.lemma_ for token in doc if wanted_word(str(token)))
    return ' '.join([str(d) for d in doc]), lemmas

from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
import csv
# this is the csv getting loaded
table = [[process_using_spacy(normalize(clean(col)))[0 if i != 2 or j == 0 else 1] for i, col in enumerate(row) ] for j, row in enumerate(csv.reader(f))]



In [3]:
#grabbing the rows and vals for reformatting the table
table_rows = table[0]
table_vals = table[1:]

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform([v[2] for v in table_vals])
classes = {'APT', 'Core', 'Debug', 'Doc', 'Text', 'UI'}

In [4]:
# the reason for tfidfs normalization is that it makes it 
# it into cosine similarity for scikit learns implementation
tfidifs = normalize_vector(X.toarray())

In [5]:
import pandas
#making the table for pands to load
table_generated = {col:[] for col in table_rows}
table_generated['tf-idf'] = []
table_generated['id'] = []
for each in tfidifs:
    table_generated['tf-idf'].append(each)

for row in table_vals:
    for i in range(len(row)):
        table_generated[table_rows[i]].append(row[i])
    for j, each in enumerate(classes):
        if(each == row[1]):
            table_generated['id'].append(j)


In [6]:
dataframe = pandas.DataFrame(table_generated)
#pandas loading the table

In [7]:
dataframe

Unnamed: 0,Unnamed: 1,Component,bug report,tf-idf,id
0,0,APT,exception thrown reconcile got following excep...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
1,1,APT,bug getting error trying install installing ne...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
2,2,APT,unable load factory names container i have an ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
3,3,APT,headless build fails integrated external tool ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
4,4,APT,error type detection is too conservative compi...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
...,...,...,...,...,...
2638,2638,UI,trying externalize strings i got following i t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2639,2639,UI,java search package has no plug in export see ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2640,2640,UI,rename inner type does not update constructor ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2641,2641,UI,disable operation logical packages in i m not ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


In [8]:
classes = set(dataframe["Component"])

In [9]:
classes

{'APT', 'Core', 'Debug', 'Doc', 'Text', 'UI'}

In [10]:
random_class_selection = [dataframe[dataframe['id'] == classif].sample(1) for classif in set(dataframe['id'])]

In [11]:
random_choices = [(sel['tf-idf'].values[0], sel['id'].values[0]) for sel in random_class_selection]

In [22]:
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

kmeans = LatentDirichletAllocation(n_components=6)
tfs = np.asarray([each for each in dataframe['tf-idf']])
clusters = kmeans.fit(tfs,[each for each in dataframe['id']]).transform(tfs)

In [17]:
#predictions = kmeans.fit_transform([each for each in dataframe['tf-idf']])

In [18]:
#predictions

In [29]:
clusters

array([[0.35477576, 0.04152094, 0.0419971 , 0.04154068, 0.04282342,
        0.4773421 ],
       [0.02805372, 0.02818614, 0.02803746, 0.02803753, 0.0280375 ,
        0.85964765],
       [0.39515756, 0.0446442 , 0.04446901, 0.04469473, 0.04446932,
        0.42656518],
       ...,
       [0.03528144, 0.03552077, 0.03525875, 0.03525881, 0.03525878,
        0.82342144],
       [0.0199437 , 0.01995976, 0.01993155, 0.01993194, 0.01993174,
        0.90030131],
       [0.03549876, 0.03600299, 0.03547979, 0.03547984, 0.03547982,
        0.8220588 ]])

In [26]:
confusion_matrix = {a:[0] * 6 for a in range(6)}

In [30]:
for a,b in zip(clusters,dataframe['id']):
    confusion_matrix[b][a.argmax()] += 1


In [31]:
pandas.DataFrame(confusion_matrix).iloc[[0,1,2,3,4,5]] # to show the confusion matrix.

Unnamed: 0,0,1,2,3,4,5
0,79,21,59,9,3,25
1,24,17,3,5,38,13
2,1,0,1,0,0,0
3,0,3,0,0,1,0
4,2,1,1,0,0,0
5,767,362,590,106,146,366


In [32]:
#  it seems kinda bad, as it tends to group in two columns
# now onto making it work with SVD
from sklearn.decomposition import TruncatedSVD


kmeans = TruncatedSVD(n_components=6)
tfs = np.asarray([each for each in dataframe['tf-idf']])
clusters = kmeans.fit(tfs,[each for each in dataframe['id']]).transform(tfs)



In [33]:
confusion_matrix = {i:[0] * 6 for i in range(6)}

In [36]:
for a,b in zip(clusters,dataframe['id']):
    confusion_matrix[b][a.argmax()] += 1


In [37]:
pandas.DataFrame(confusion_matrix)

Unnamed: 0,0,1,2,3,4,5
0,117,63,98,19,16,46
1,692,303,470,96,169,335
2,29,7,72,3,0,8
3,15,1,5,0,0,0
4,3,24,1,0,0,0
5,17,6,8,2,3,15


In [None]:
# SVD's arent great either
# we can see that with how its all grouped at the top. 