# Stack Overflow Tag Imputation using Multi-label Classification

## Imports

In [87]:
# data structures
import pandas as pd
from tqdm import tqdm, tqdm_pandas
from collections import Counter

# iterators
import itertools

# text
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Settings

In [63]:
# download
nltk.download('stopwords')
nltk.download('punkt')

# create and register a new `tqdm` instance with `pandas`
tqdm_pandas(tqdm())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maksi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maksi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!




0it [00:00, ?it/s]



## Data Preprocessing

In [64]:
# to build and traing our model, we are going to have to work with posts have labeled tags
# posts = pd.read_csv("../../../../160-Stackoverflow-Data/300000_rows/Posts.csv", 
#                     dtype={'LastEditorDisplayName': str})
posts = pd.read_csv("../../../../160-Stackoverflow-Data/100000_rows/Posts.csv", 
                    dtype={'LastEditorDisplayName': str})
sub_posts = posts[['Id', 'Body', 'Title', 'Tags']]
sub_posts = sub_posts[~sub_posts['Tags'].isnull()]

In [65]:
print(sub_posts.shape)
sub_posts.head()

(50053, 4)


Unnamed: 0,Id,Body,Title,Tags
0,49104125,<p>Am new to jquery. \r\n I try to create m...,down caret rotating not working,<jquery><css><asp.net><html5>
1,49104126,<p>We are working on rewrite of an existing ap...,Parallel job execution with split-and-aggregat...,<java><multithreading><jobs>
2,49104127,"<p>In Graph Explorer, when logged in as a user...","Microsoft Graph: ""Unknown Error"" when calling ...",<azure><active-directory><microsoft-graph><msal>
6,49104131,<p>I'm currently using Google Drive SDK in my ...,Class implemented in both Derived Data and App...,<ios><swift><xcode><google-drive-sdk>
7,49104132,"<p>I have 3 Spinners citySpinner, regionSpinne...",How to filter spinner based on other spinner s...,<android><json>


## Data Cleaning

The body of our dataframe is encoded in html. Additionally, we will have to remove stop words.

In [66]:
# characters to replace by space (for html)
BRACKETS_RE = re.compile('[/(){}\[\]\|@,;]') 

# identify irrelevent characters
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

# insignificant words
STOPWORDS = set(stopwords.words('english'))

def clean_and_tokenize_body(text):
    """
    @param text: str - raw string
    return: str - filtered, and tokenized string
    """
    text = re.sub(BRACKETS_RE, ' ', text.lower())
    text = re.sub(BAD_SYMBOLS_RE, '', text)
    return ' '.join([w for w in word_tokenize(text) if not w in STOPWORDS])


# sample function test
examples = ["SQL Server -{any equivalent of Excel's} CHOOSE function?",
            "How to free c++ memory vector<int> * (arr)?"]
for e in examples:
    print(clean_and_tokenize_body(e))

sql server equivalent excels choose function
free c++ memory vectorint arr


In [67]:
# now apple on the main dataframe
# progress_apply is an alternative to apply offered by tqdm
sub_posts['Body'] = sub_posts['Body'].apply(lambda t: clean_and_tokenize_body(t))
sub_posts['Title'] = sub_posts['Title'].apply(lambda t: clean_and_tokenize_body(t))
sub_posts['Tags'] = sub_posts['Tags'].apply(lambda t: ' '.join(re.findall(r"\<(\w+)\>", t)))

In [68]:
sub_posts.head()

Unnamed: 0,Id,Body,Title,Tags
0,49104125,pam new jquery try create menu arrow click nee...,caret rotating working,jquery css html5
1,49104126,pwe working rewrite existing application need ...,parallel job execution splitandaggregate java,java multithreading jobs
2,49104127,pin graph explorer logged user application use...,microsoft graph unknown error calling contacts,azure msal
6,49104131,pim currently using google drive sdk ios app x...,class implemented derived data application ios...,ios swift xcode
7,49104132,pi 3 spinners cityspinner regionspinner branch...,filter spinner based spinner selection,android json


# Building a Bag of Words Model

We will build a set of vectors $[i_0, i_1, i_2, \dots, i_k]$ where $k$ is an integer that maps to the word corpus and $i_k$ represents the frequency of the mapped word for each post. 

In [83]:
# first count all the words globally
word_count = Counter()
for body in sub_posts['Body']:
    word_count.update(Counter(body.split()))

# lets give titles twice the weight because they are more important
for title in sub_posts['Title']:
    word_count.update(Counter(title.split() + title.split()))

In [98]:
# how many unique tags are there?
tags_joined = list(itertools.chain.from_iterable([tag.split() for tag in sub_posts['Tags']]))
n_unique_tags = set(tags_joined)
print('Total number of unique tags', len(n_unique_tags))
print('Ration: unique tag to total unique words', len(n_unique_tags)/len(tags_joined))

Total number of unique tags 7805
Ratio unique tag:total unique words 0.07616491827274945


In [96]:
# sort the words by frequency and limit to the number of tags we have in total
most_common_words = sorted(word_count.items(), key=lambda x: -x[1])[:len(n_unique_tags)]

# print the first few as an example
count = 1
for key, value in most_common_words:
    if count < 5:
        print(key, ':', value)
        count += 1
    else:
        break

lt : 163657
code : 147541
gt : 61111
p : 49782
