In [1]:
import pandas as pd

In [2]:
# Load the pre-cleaned question dataset. The preview below shows a `full_text`
# column followed by one column per tag holding 0/1 values.
df = pd.read_csv("./data/cleaned_data.csv")
df.head()

Unnamed: 0,full_text,aac,aapt,aar,abaddressbook,abi,abort,absolutepath,absolutevalue,abstract,...,zooming,zorder,zpl,zplii,zsh,zshcompletion,zshrc,zsi,zurbfoundation,zxing
0,Pan & Zoom Image\nI want to create a simple im...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,Is a bool read/write atomic in C#\nIs accessin...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,What is the advantage of storing schema in avr...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,How do you get JavaScript/jQuery Intellisense ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,XmlSerializer - There was an error reflecting ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
def get_top_n(dataframe, top_n):
    """
    Keep only the `top_n` most frequent label columns of a labelled text frame.

    Every column except the first one is treated as a label column, and the
    column sum is used as the label's frequency. Rows that carry none of the
    selected labels are dropped.

    Parameters
    -----------
    dataframe : pandas.DataFrame
        input dataframe; its first column is skipped when ranking and it must
        contain a 'full_text' column. The frame is not modified.
    top_n : int,
        number of desired category

    Return
    -------
    df_top : pandas.DataFrame
        'full_text' plus the selected label columns, limited to rows with at
        least one non-zero selected label, with a fresh 0..n-1 index
    top_ : list
        names of the selected label columns, most frequent first
    """
    categories = list(dataframe.columns)[1:]
    # One vectorised column sum instead of a Python-level loop over every column.
    counts = dataframe[categories].sum()
    # Stable sort: equally frequent labels keep their original column order,
    # so the selection is deterministic.
    ranked = counts.sort_values(ascending=False, kind="mergesort")

    top_ = ranked.index[:top_n].tolist()
    df_top = dataframe[['full_text'] + top_]
    # Remove rows without labels (row-wise test, no transposed copy needed)
    has_label = (df_top[top_] != 0).any(axis=1)
    df_top = df_top[has_label].reset_index(drop=True)
    return df_top, top_

In [4]:
import spacy
import nltk
from spacy.lang.en import English
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the NLTK corpora used below: WordNet (for lemmas) and the stopword lists.
nltk.download('wordnet')
nltk.download('stopwords')

# English stopwords as a set, used for membership tests in prepare_text_for_lda.
en_stop = set(nltk.corpus.stopwords.words('english'))

# spaCy English language object; tokenize() calls it on raw text to get tokens.
parser = English()

def tokenize(text):
    """
    Split `text` into normalised tokens using the module-level `parser`.

    Whitespace-only tokens are dropped. URL-like tokens become 'URL', tokens
    starting with '@' become 'SCREEN_NAME', and every other token is
    lower-cased.

    Parameters
    -----------
    text : str
        raw text to tokenize

    Return
    -------
        list of str, one entry per kept token, in document order
    """
    def _normalise(tok):
        # The URL check takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

def get_lemma(word):
    """Return the base form `wn.morphy` finds for `word`, or `word` itself when it finds none."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

def get_lemma2(word):
    """Lemmatize `word` with a newly constructed NLTK `WordNetLemmatizer`."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

def prepare_text_for_lda(text):
    """
    Turn raw text into the token list used for topic modelling.

    Tokens come from `tokenize`. Only tokens longer than four characters that
    are not in `en_stop` are kept, and each kept token is passed through
    `get_lemma`.

    Parameters
    -----------
    text : str
        raw text

    Return
    -------
        list of str
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > 4 and token not in en_stop
    ]

2022-08-16 12:26:18.832464: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-16 12:26:18.832510: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Restrict the dataset to the 10 most frequent tags; label_cols holds their names.
num_labels = 10
df_10, label_cols = get_top_n(df, num_labels)

In [6]:
df_10

Unnamed: 0,full_text,java,python,javascript,ios,android,iphone,objectivec,html,jquery,php
0,What is the advantage of storing schema in avr...,1,0,0,0,0,0,0,0,0,0
1,How do you get JavaScript/jQuery Intellisense ...,0,0,1,0,0,0,0,0,1,0
2,How can I prevent SQL injection in PHP?\nIf us...,0,0,0,0,0,0,0,0,0,1
3,Mocking Static Blocks in Java\nMy motto for Ja...,1,0,0,0,0,0,0,0,0,0
4,Getting random row through SQLAlchemy\nHow do ...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
14734,How to map numeric data into categories / bins...,0,1,0,0,0,0,0,0,0,0
14735,Override S3 endpoint using Boto3 configuration...,0,1,0,0,0,0,0,0,0,0
14736,Is there any way to convert ZoneId to ZoneOffs...,1,0,0,0,0,0,0,0,0,0
14737,iOS 9 Splash screen is black\nMy apps' splash ...,0,0,0,1,0,0,0,0,0,0


In [7]:
# Question texts of the filtered dataset as a plain Python list; preview the first five.
sentences = df_10.full_text.tolist()
sentences[:5]

['What is the advantage of storing schema in avro?\nWe need to serialize some data for putting into solr as well as hadoop. I am evaluating serialization tools for the same. The top two in my list are Gson and Avro. As far as I understand, Avro = Gson + Schema-In-JSON If that is correct, I do not see why Avro is so popular for Solr/Hadoop? I have searched a lot on the Internet, but cannot find a single correct answer for this. Everywhere it says, Avro is good because it stores schema. My question is what to do with that schema? It may be good for very large objects in Hadoop where a single object is stored in multiple file blocks such that storing schema with each part helps to analyze it better. But even in that case, schema can be stored separately and just a reference to that is sufficient to describe the schema. I see no reason why schema should be part of each and every piece. If someone can give me some good use case how Avro helped them and Gson/Jackson were insufficient for the

In [8]:
from tqdm.notebook import trange, tqdm

In [9]:
import tomotopy as tp
import sys

def lda_example(sentences, save_path, k=20, min_cf=3, rm_top=5, iterations=1000, seed=None):
    """
    Train a tomotopy LDA model on raw texts, save it and print its topics.

    Parameters
    -----------
    sentences : iterable of str
        raw documents; each one is preprocessed with `prepare_text_for_lda`
    save_path : str
        file the trained model is saved to
    k : int,
        number of topics
    min_cf : int,
        `min_cf` argument passed to `tp.LDAModel`
    rm_top : int,
        `rm_top` argument passed to `tp.LDAModel`
    iterations : int,
        total number of training iterations, run in steps of 10
    seed : int or None,
        random seed passed to `tp.LDAModel`; when None no seed is passed and
        the library default applies

    Return
    -------
        the trained tp.LDAModel
    """
    model_kwargs = dict(tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top, k=k)
    if seed is not None:
        # Only passed on when given, so the default call builds the model
        # with exactly the same arguments as before.
        model_kwargs['seed'] = seed
    mdl = tp.LDAModel(**model_kwargs)

    for line in tqdm(sentences):
        tokens = prepare_text_for_lda(line)
        # Documents with four tokens or fewer (including empty ones) are skipped.
        if len(tokens) > 4:
            mdl.add_doc(tokens)

    mdl.burn_in = 100
    # NOTE(review): train(0) is presumably what makes the corpus statistics
    # printed below available before real training starts — confirm.
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)
    for i in range(0, iterations, 10):
        mdl.train(10)
        # Report the per-word log-likelihood every 200 iterations.
        if i % 200 == 0:
            print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)

    for topic_id in trange(mdl.k):
        print('Topic #{}'.format(topic_id))
        for word, prob in mdl.get_topic_words(topic_id):
            print('\t', word, prob, sep='\t')

    # Returned so callers can use the model without reloading it from disk.
    return mdl

In [10]:
lda_example(sentences, './data/LDAmodel_lvl_q_10')

  0%|          | 0/14739 [00:00<?, ?it/s]

Training...


Num docs: 14714 , Vocab size: 24046 , Num words: 881353
Removed top words: ['using', 'android', 'SCREEN_NAME', 'return', 'class']
Iteration: 0	Log-likelihood: -8.857233371335994
Iteration: 200	Log-likelihood: -7.796527597733532
Iteration: 400	Log-likelihood: -7.760030525784211
Iteration: 600	Log-likelihood: -7.746214874231046
Iteration: 800	Log-likelihood: -7.738771447640336


Saving...


  0%|          | 0/20 [00:00<?, ?it/s]

Topic #0
		controller	0.018855147063732147
		uiview	0.01621755212545395
		nsstring	0.014807353727519512
		alloc	0.013109891675412655
		swift	0.0106812147423625
		constraint	0.008487571962177753
		button	0.0076257833279669285
		method	0.007599668577313423
		animate	0.007521324325352907
		custom	0.007234061136841774
Topic #1
		thread	0.05264892056584358
		process	0.019311979413032532
		block	0.017606765031814575
		threads	0.016413114964962006
		queue	0.01624259352684021
		function	0.013556880876421928
		method	0.01308794692158699
		event	0.011681145057082176
		async	0.010871168226003647
		request	0.010018561035394669
Topic #2
		react	0.03485853224992752
		component	0.025815188884735107
		const	0.02312958799302578
		import	0.022855548188090324
		error	0.022417083382606506
		function	0.021375728771090508
		module	0.016278570517897606
		router	0.011839108541607857
		export	0.011619876138865948
		route	0.011345835402607918
Topic #3
		jquery	0.034686580300331116
		input	0.027359850704669952
	

In [None]:
# Use tags : associate with topics 

In [10]:
# Token list for every question, built with the same preprocessing that lda_example
# applies before adding documents to the model; preview the first five.
data_ready = [prepare_text_for_lda(line) for line in sentences]
data_ready[:5]

[['advantage',
  'store',
  'schema',
  'serialize',
  'putting',
  'hadoop',
  'evaluate',
  'serialization',
  'tool',
  'understand',
  'schema',
  'correct',
  'popular',
  'hadoop',
  'search',
  'internet',
  'single',
  'correct',
  'answer',
  'everywhere',
  'store',
  'schema',
  'question',
  'schema',
  'large',
  'object',
  'hadoop',
  'single',
  'object',
  'store',
  'multiple',
  'block',
  'store',
  'schema',
  'help',
  'analyze',
  'better',
  'schema',
  'store',
  'separately',
  'reference',
  'sufficient',
  'describe',
  'schema',
  'reason',
  'schema',
  'every',
  'piece',
  'someone',
  'help',
  'jackson',
  'insufficient',
  'purpose',
  'would',
  'really',
  'helpful',
  'official',
  'documentation',
  'schema',
  'produce',
  'schema+data',
  'question',
  'schema',
  'input',
  'output',
  'along',
  'representation',
  'extra',
  'achieve',
  'serialize',
  'object',
  'using',
  'add',
  'input',
  'schema',
  'calling',
  'really',
  'confuse'],

In [11]:
from utils.model import load_zeroshot_model

In [12]:
# Reload the model file written by lda_example, which trains a tp.LDAModel.
# A file saved by a different model class (e.g. an HDP model) would raise an
# exception when loaded through tp.LDAModel.load.
mdl = tp.LDAModel.load('./data/LDAmodel_lvl_q_10')

In [13]:
def predict_topic(sentence):
    """
    Infer the topic distribution of one raw sentence with the loaded model `mdl`.

    Parameters
    -----------
    sentence : str
        raw text, preprocessed here with `prepare_text_for_lda`

    Return
    -------
        tuple (topic_dist, ll) exactly as produced by `mdl.infer`
    """
    tokens = prepare_text_for_lda(sentence)
    distribution, log_likelihood = mdl.infer(mdl.make_doc(tokens))
    return distribution, log_likelihood

In [14]:
classifier = load_zeroshot_model()

def _classifier(texts, candidate_labels, threshold=0.45, fallback="not found"):
    """
    Label each text with its best zero-shot candidate, or `fallback` when unsure.

    Parameters
    -----------
    texts : str or iterable of str
        a single text or several texts; a single string is wrapped in a list
    candidate_labels : list of str
        labels passed on to the module-level `classifier`
    threshold : float,
        minimum score the first returned label needs in order to be accepted
    fallback : str,
        value recorded when the first score is below `threshold`

    Return
    -------
        list of str, one entry per input text, in input order
    """
    if isinstance(texts, str):
        texts = [texts]
    return_data = []
    for text in texts:
        resp = classifier(text, candidate_labels=candidate_labels)
        # NOTE(review): treats entry 0 as the best prediction, i.e. assumes the
        # classifier returns labels sorted by descending score — confirm.
        best_label, best_score = resp['labels'][0], resp['scores'][0]
        return_data.append(best_label if best_score >= threshold else fallback)
    return return_data

In [15]:
# One space-separated keyword string per topic; these strings are what the
# zero-shot classifier is given in the next cells.
list_kw = list()

for t in trange(mdl.k):
    # Top 50 (word, probability) pairs of topic t; only the words are kept.
    topic = mdl.get_topic_words(t, top_n=50)
    list_kw.append(' '.join([word for word, prob in topic]))

  0%|          | 0/20 [00:00<?, ?it/s]

In [16]:
list_kw[:5]

['controller uiview nsstring alloc swift constraint button method animate custom delegate try navigation storyboard xcode nsarray uitableview uiimage uiviewcontroller frame tableview animation create viewcontroller property following nsdata uicolor nsdictionary add uilabel completion override layer viewdidload super uibutton release present subclass view context section error indexpath uiscrollview programmatically nsurl objective uiimageview',
 'thread process block threads queue function method event async request running call execute worker start callback task second asynchronous output finish python subprocess debug(524 await import stdout multiprocessing result blocking execution create future message connection synchronize parallel object update write system processing stderr operation command /system program multiple signal print',
 'react component const import error function module router export route testing angular webpack promise render failure update context alert express 

In [17]:
classification = _classifier(list_kw, label_cols)
classification

['objectivec',
 'python',
 'not found',
 'jquery',
 'javascript',
 'not found',
 'iphone',
 'objectivec',
 'not found',
 'python',
 'iphone',
 'ios',
 'not found',
 'iphone',
 'python',
 'not found',
 'not found',
 'not found',
 'python',
 'python']

In [22]:
#list_kw[0] = "Swift"
# list_kw[1] = "Python subprocess"
# list_kw[2] web dev
# list_kw[3] : jquery, javascript
# list_kw[4] : javascript web dev
#list_kw[5]  : JAXB javascript
#list_kw[6] : mobile dev
#list_kw[7] : Java
#list_kw[8], classification[8] #webframework
# list_kw[9] : python
import pandas as pd
#pd.set_option('display.width', None)
# Restore only the display options to their defaults. Resetting "all" also
# resets deprecated options and emits a deprecation warning for each of them.
pd.reset_option("^display")
# Pair each topic's keyword string with the label the zero-shot classifier gave it.
result = pd.DataFrame(list(zip(list_kw, classification)), columns = ['LDA keywords', 'Classification ZeroShot'])
result

As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead.

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



Unnamed: 0,LDA keywords,Classification ZeroShot
0,controller uiview nsstring alloc swift constra...,objectivec
1,thread process block threads queue function me...,python
2,react component const import error function mo...,not found
3,jquery input button function javascript click ...,jquery
4,server browser file javascript upload client d...,javascript
5,public spring private method property static s...,not found
6,device iphone application message notification...,iphone
7,project error build xcode compile version libr...,objectivec
8,database model table query object entity creat...,not found
9,python import package error install module com...,python


In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """
    Build a table with the dominant topic of every document.

    Parameters
    -----------
    ldamodel : topic model
        must support `ldamodel[corpus]`, `per_word_topics` and `show_topic`
    corpus : corpus accepted by `ldamodel[...]`
        documents to score
    texts : sequence
        original texts, appended as the last column

    Return
    -------
        pandas.DataFrame with columns 'Dominant_Topic', 'Perc_Contribution',
        'Topic_Keywords' and a final unnamed column holding `texts`
    """
    # NOTE(review): the defaults `corpus=corpus` and `texts=data` are evaluated
    # once, when this `def` runs, so both names must already exist at module level.
    # NOTE(review): `ldamodel[corpus]`, `per_word_topics` and `show_topic` look like
    # the gensim LdaModel interface — confirm the model passed in supports them.

    # Collect one record per document and build the frame once; appending rows
    # to a DataFrame one by one is quadratic and `DataFrame.append` no longer
    # exists in pandas 2.0.
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if len(row) == 0:
            # A document without any topic assignment contributes no row.
            continue
        # Dominant topic = first entry with the highest contribution.
        topic_num, prop_topic = max(row, key=lambda x: x[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Dominant topic, its contribution and its keywords for every prepared document.
# NOTE(review): `corpus` is not defined in any cell visible in this notebook —
# confirm where it is supposed to come from before running this cell.
df_topic_sents_keywords = format_topics_sentences(ldamodel=mdl, corpus=corpus, texts=data_ready)

# Format
# reset_index() turns the row index into a column, giving five columns to rename.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

In [None]:
# Print the top words of every topic of the loaded model with their probabilities
# (the same listing lda_example prints right after training).
for k in trange(mdl.k):
    print('Topic #{}'.format(k))
    for word, prob in mdl.get_topic_words(k):
        print('\t', word, prob, sep='\t')