In [None]:
import xml.etree.cElementTree as et
from bs4 import BeautifulSoup
from nltk.corpus import PlaintextCorpusReader, stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from collections import defaultdict
from gensim import corpora, utils
from gensim.corpora import Dictionary
from smart_open import open
import os

In [None]:
# folder that holds the Stack Overflow dump; everything this notebook writes goes here too
destination_path = 'C:\\Users\\Terolli\\Documents\\Research Paper\\stackexchange\\stackoverflow.com-Posts\\'
# the xml file that is parsed, in this case Posts.xml inside that folder
source_path = destination_path + 'Posts.xml'

In [None]:
# joins the text of all <tag_name> elements of a parsed post, one element per line
def _collect_tag_text(soup, tag_name):
    return ''.join(node.get_text() + '\n' for node in soup.find_all(tag_name))


# extracts documents not older than start_date (yyyy-mm)
def extract_doc(start_date):
    """Split every sufficiently recent post of the xml dump into a text and a code file.

    Streams ``source_path`` and, for every element whose ``CreationDate``
    attribute compares greater than ``start_date``, parses the ``Body``
    attribute as HTML and writes two files below ``destination_path``:

    * ``nlt/nlt_<Id>.txt``   - text of all ``<p>`` elements, one per line
    * ``code/code_<Id>.txt`` - text of all ``<code>`` elements, one per line

    Both files are created even when the extracted text is empty. Each
    previously unseen creation year (other than 2008) is printed once as a
    progress indicator.

    Parameters
    ----------
    start_date : str
        Lower bound in ``yyyy-mm`` form. The comparison is a plain string
        comparison, so a ``yyyy-mm`` prefix sorts before every full timestamp
        of that month and the whole month is included.

    Returns
    -------
    None
    """
    # (output folder, file name prefix, html tag whose text is collected)
    outputs = (
        (os.path.join(destination_path, 'nlt'), 'nlt_', 'p'),
        (os.path.join(destination_path, 'code'), 'code_', 'code'),
    )
    for out_dir, _, _ in outputs:
        os.makedirs(out_dir, exist_ok=True)

    # years already reported; 2008 is pre-seeded so it is never printed
    seen_years = {'2008'}

    # 'start' events are requested only to get hold of the root element: the
    # first event of the stream is the start of the root
    events = iter(et.iterparse(source_path, events=('start', 'end')))
    _, root = next(events)

    for event, elem in events:
        # an element is only complete once its end tag has been read
        if event != 'end':
            continue

        creation_date = elem.attrib.get('CreationDate')
        post_id = elem.attrib.get('Id')
        body = elem.attrib.get('Body')

        # used to check the progress of xml iteration
        if creation_date is not None:
            year = creation_date[:4]
            if year not in seen_years:
                print(year)
                seen_years.add(year)

        # elements lacking one of the attributes (e.g. the root itself) are
        # skipped instead of raising a TypeError on the comparison below
        has_all_attributes = None not in (creation_date, post_id, body)

        # extracting document if not older than start_date
        if has_all_attributes and start_date < creation_date:
            # the body of the xml element contains only the post with no metadata
            soup = BeautifulSoup(body, 'html.parser')

            # writing each post as 2 separate .txt files, one for natural language and the other for code
            for out_dir, prefix, tag_name in outputs:
                text = _collect_tag_text(soup, tag_name)
                out_path = os.path.join(out_dir, prefix + post_id + '.txt')
                with open(out_path, 'w+', encoding="utf-8") as out_file:
                    try:
                        out_file.write(text)
                    except (OSError, UnicodeError) as e:
                        # best effort: report the post and carry on with the next one
                        print('post ' + post_id + ': ' + str(e))

        # removing xml element from RAM; the root has to be cleared as well,
        # otherwise it keeps a reference to every (emptied) element read so far
        elem.clear()
        root.clear()

In [None]:
# preprocesses the extracted natural language texts
def preprocess(directory):
    """Yield one cleaned token list per ``.txt`` file found below ``directory``.

    Every file is read as UTF-8 and processed in this order:

    1. lower-cased
    2. tokenized into runs of ASCII letters
    3. lemmatized with the WordNet lemmatizer
    4. stripped of English stop words
    5. stripped of tokens of length 1
    6. stripped of tokens that occur only once *within that file*

    Files that cannot be read or decoded are reported and skipped. Every
    1000th visited file prints the running count as a progress indicator
    (skipped files are counted but never trigger the print).

    Parameters
    ----------
    directory : str
        Root folder that is walked recursively.

    Yields
    ------
    list of str
        The remaining tokens of one document, in document order. The list
        may be empty.
    """
    # none of these depend on the document, so they are built once instead
    # of once per file
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    lemmatizer = WordNetLemmatizer()
    stoplist = set(stopwords.words('english'))

    counter = 0
    for dirpath, dirs, filenames in os.walk(directory):
        for file in filenames:
            if not file.endswith('.txt'):
                continue
            counter += 1

            # convert txt file into continuous str; the with-block makes sure
            # the handle is closed again, which matters with many files
            try:
                with open(os.path.join(dirpath, file), encoding="utf8") as handle:
                    document = handle.read()
            except (OSError, UnicodeError) as e:
                print(e)
                continue

            # convert to lower case and tokenize document
            tokens = tokenizer.tokenize(document.lower())

            # lemmatize doc
            tokens = [lemmatizer.lemmatize(token) for token in tokens]

            # remove stop words and words with length 1
            tokens = [token for token in tokens
                      if token not in stoplist and len(token) > 1]

            # remove words with frequency == 1 (counted inside this document only)
            frequency = defaultdict(int)
            for token in tokens:
                frequency[token] += 1
            tokens = [token for token in tokens if frequency[token] > 1]

            # track progress
            if counter % 1000 == 0:
                print(counter)

            # yielded outside of any try-block so that errors raised by the
            # consumer are not swallowed here
            yield tokens

In [None]:
# creates the corpus from the preprocessed documents
class The_Corpus(object):
    """Streamed bag-of-words corpus over the ``.txt`` files below a directory.

    Building an instance makes one full pass over the files to create the
    dictionary, prunes it and saves it as ``dictionary.dict`` below
    ``destination_path``. Every iteration over the instance makes another
    full pass and yields one bag-of-words vector per file, so the documents
    never have to be held in memory together.
    """

    def __init__(self, dir):
        """Build, prune and save the dictionary for the files below ``dir``.

        Parameters
        ----------
        dir : str
            Folder with the extracted natural language ``.txt`` files.
        """
        self.dir = dir
        self.dictionary = Dictionary(preprocess(dir))
        # drop tokens found in fewer than 20 documents or in more than 40% of them
        self.dictionary.filter_extremes(no_below=20, no_above=0.4)
        self.dictionary.compactify()
        self.dictionary.save(destination_path + 'dictionary.dict')

    def __iter__(self):
        """Yield the bag-of-words vector of each document, re-reading the files."""
        for tokens in preprocess(self.dir):
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return the number of documents the dictionary was built from.

        The length of a corpus is its document count; the vocabulary size is
        available as ``len(self.dictionary)``.
        """
        return self.dictionary.num_docs

In [None]:
# the dump was last updated on 08-Sep-2020, so starting at 2020-06 covers
# the last 3 full months (plus the days of September contained in the dump)
extract_doc(start_date='2020-06')

In [None]:
# building the corpus from the extracted natural language files and
# saving it in Matrix Market format
corpora.MmCorpus.serialize(
    fname=destination_path + 'the_corpus.mm',
    corpus=The_Corpus(destination_path + 'nlt'),
)