## Word2Vec Example - Comment letters

In [None]:
# based on: https://rare-technologies.com/word2vec-tutorial/
# further reading: Corpora and Vector Spaces - https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html

In [None]:
import os, string, glob, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# punctuation characters: string.punctuation plus the curly double quotes
punc = string.punctuation + '“”'

# NLTK's English stop words, stored as a set
stopWords = set(stopwords.words('english'))

In [None]:
# need to do: pip install w3lib first
from w3lib.html import replace_entities

# function that converts html to text
def cleanhtml(raw_html):
    """Strip markup from an HTML string and decode its character entities.

    Everything from a '<' up to the nearest following '>' is deleted (the
    match may span several lines, and nothing is inserted in its place);
    the remaining text is then passed through w3lib's replace_entities.

    raw_html: text that may contain HTML tags and entities
    returns:  the text with tags removed and entities replaced
    """
    tag_pattern = re.compile('<.*?>', flags=re.DOTALL)
    without_tags = tag_pattern.sub('', raw_html)
    return replace_entities(without_tags)

In [None]:
# iterable class: each pass over an instance yields one tokenized file at a time
class myCommentLetters(object):
    """Iterable over the tokenized .txt files of one folder.

    Each pass over an instance reads every ``*.txt`` file in ``folder``,
    strips HTML with ``cleanhtml``, removes digits, tokenizes the text with
    ``word_tokenize`` and yields one list of lowercase tokens per file.
    ``__iter__`` starts from scratch on every call, so the object can be
    iterated repeatedly while only one file's content is held at a time.
    """

    def __init__(self, folder):
        """Remember the folder to scan; no files are read here."""
        self.folder = folder

    @staticmethod
    def _is_punctuation(token):
        """Return True when every character of ``token`` is in ``punc``."""
        # a per-character test also catches multi-character tokens such as
        # '--', '...' or the `` and '' quote tokens, which a substring test
        # against string.punctuation lets through
        return all(char in punc for char in token)

    def __iter__(self):
        """Yield one list of cleaned, lowercase tokens per .txt file."""
        # glob returns files in arbitrary order; sorting makes each pass
        # visit the files in the same, reproducible order
        paths = sorted(glob.glob('{}/*.txt'.format(self.folder)))
        # when debugging, slice the list to read only a portion of the
        # files, e.g. paths[0:100]
        for counter, path in enumerate(paths, start=1):
            # read the file and strip the html
            with open(path, encoding='utf-8') as handle:
                content = cleanhtml(handle.read())
            # to have an idea of where we are
            if counter % 50 == 0:
                print('counter', counter)
            # remove numbers, see https://stackoverflow.com/questions/57030670/how-to-remove-punctuation-and-numbers-during-tweettokenizer-step-in-nlp
            content = re.sub(r'\d+', '', content)
            tokens = []
            for token in word_tokenize(content):
                lowered = token.lower()
                # skip stop words and tokens made up of punctuation only
                if lowered in stopWords or self._is_punctuation(token):
                    continue
                tokens.append(lowered)
            # yield ('return') the tokenized file
            yield tokens

In [None]:
# a memory-friendly iterable: files are only read while iterating over it
# the folder can be overridden with the COMMENT_LETTERS_DIR environment variable,
# so the notebook also runs on machines that do not have this exact path
COMMENT_LETTERS_DIR = os.environ.get(
    'COMMENT_LETTERS_DIR',
    r'C:\Users\joost\Documents\teaching\acg7849-python\comment_letters',
)
wordLists = myCommentLetters(COMMENT_LETTERS_DIR)

In [None]:
# show the object itself; creating it only stored the folder, no file has been read yet
wordLists

In [None]:
import gensim
# note how wordLists is passed as the training corpus
# This is typical for neural network algorithms: they expect something that is iterable
# Could we pass in a list holding all files? Sure
# But what if the files don't fit in memory? -> use an iterable that yields one file at a time
# min_count: words that occur fewer times than this are ignored, default is 5
# workers: number of cores to use (may need cython to be installed for this to have an effect)
model = gensim.models.Word2Vec(
    sentences=wordLists,
    min_count=5,
    workers=8,
)

In [None]:
# display the trained model object
model

In [None]:
# raw NumPy vector of a word (the word needs to be in the model)
# note that all words were made lowercase, so look words up in lowercase
model.wv['dear']                  

In [None]:
# the 10 words most similar to 'accrual'
# also try the similarity of: tax, liability, comment, deduction
model.wv.most_similar('accrual', topn=10) 

In [None]:
# which word does not belong with the others? (note: each of these words should be in the model)
model.wv.doesnt_match("warranty liability claim contingency tax".split())