# Build Simple word2vec Model


* Code sampled from [Laura K. Nelson's GitHub repository](https://github.com/lknelson/measuring_intersectionality/blob/main/scripts/00_measuringintersectionality_constructmodels.ipynb)

In [2]:
import glob
import os
import re
import string
import sys

import gensim
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

# Import parser module.
# The helpers are looked up in a `Scripts` directory that sits in the parent
# of the current working directory.
module_path = os.path.abspath(os.path.join('..'))
scripts_path = os.path.join(module_path, 'Scripts')

# Guard on the exact path that gets appended, so re-running this cell does
# not keep adding duplicate entries to sys.path.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# NOTE(review): star import. Later cells call get_root, get_namespace,
# get_document_id, get_date_from_attrValue, get_peopleList_from_attrValue and
# get_textContent, which presumably come from this module -- consider
# importing them by name.
from functions_xml_ET_parse import *

# Declare absolute path.
# Keep the trailing slash: the data glob below is built by plain string
# concatenation (abs_dir + 'Data/JQA/*/*.xml').
abs_dir = "/Users/quinn.wi/Documents/"

# Define tokenizer.
def fast_tokenize(text):
    """Lowercase `text`, drop punctuation, and split it into words.

    Em dashes and newlines are turned into spaces so the words on either
    side stay separate; ASCII punctuation and curly quotes are deleted
    outright (so a hyphenated or possessive word collapses into one token).

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Whitespace-separated tokens; empty when nothing survives cleaning.
    """
    # Characters to delete: ASCII punctuation plus curly quotation marks.
    strip_chars = string.punctuation + '“' + '”' + '‘' + "’"

    # Single translation table: separators become spaces, punctuation vanishes.
    table = str.maketrans({'—': ' ', '\n': ' '})
    table.update(str.maketrans('', '', strip_chars))

    return text.lower().translate(table).split()

## Build Dataframe from XML

In [18]:
%%time

"""
Declare variables.
"""

# Declare regex to simplify file paths below
regex = re.compile(r'.*/\d{4}/(.*)')

# Declare document level of file. Requires root starting point ('.').
doc_as_xpath = './/ns:div/[@type="entry"]'

# Declare date element of each document.
date_path = './ns:bibl/ns:date/[@when]'

# Declare person elements in each document.
person_path = './/ns:p/ns:persRef/[@ref]'

# Declare text level within each document.
text_path = './ns:div/[@type="docbody"]/ns:p'

"""
Build dataframe.
"""

dataframe = []

for file in glob.glob(abs_dir + 'Data/JQA/*/*.xml'):
#         Call functions to create necessary variables and grab content.
    root = get_root(file)
    ns = get_namespace(root)


    for eachDoc in root.findall(doc_as_xpath, ns):
#             Call functions.
        entry = get_document_id(eachDoc, '{http://www.w3.org/XML/1998/namespace}id')
        date = get_date_from_attrValue(eachDoc, date_path, 'when', ns)
        people = get_peopleList_from_attrValue(eachDoc, person_path, 'ref', ns)
        text = get_textContent(eachDoc, text_path, ns)

        dataframe.append([str(regex.search(file).groups()), entry, date, people, text])

dataframe = pd.DataFrame(dataframe, columns = ['file', 'entry', 'date', 'people', 'text'])

dataframe.head(4)

CPU times: user 621 ms, sys: 16.4 ms, total: 638 ms
Wall time: 649 ms


Unnamed: 0,file,entry,date,people,text
0,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-01,1825-01-01,,"1. VI:30. H. Humphreys here, for Methodist Chu..."
1,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-02,1825-01-02,,2. VII:15— Heard Lynde at the Capitol—late. Ca...
2,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-03,1825-01-03,,3. VII. I called at M. Van-Buren’s lodgings—ou...
3,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-04,1825-01-04,,4. VI:30. W. Findlay here; Statesman Newspaper...


## Build w2v Model

In [28]:
%%time

# Convert dataframe text field to list of sentences.
sentences = [sentence for text in dataframe['text'] for sentence in sent_tokenize(text)]
words_by_sentence = [fast_tokenize(sentence) for sentence in sentences]
words_by_sentence = [sentence for sentence in words_by_sentence if sentence != []]

# Get total number of words and unique words.
single_list_of_words = []
for l in words_by_sentence:
    for w in l:
        single_list_of_words.append(w)
print (f'Word total: {len(single_list_of_words)}\nUnique word total {len(set(single_list_of_words))}')

# Build model.
model = gensim.models.Word2Vec(words_by_sentence, window=5, vector_size=100,
                               min_count=10, sg=1, alpha=0.025, batch_words=10000, workers=4)

# Unused arguments:
# size=100, iter=5,

# Save model for later use
model.wv.save_word2vec_format(abs_dir + '/Data/Output/WordVectors/jqa_w2v.txt')

Word total: 707450
Unique word total 19664
CPU times: user 14.1 s, sys: 149 ms, total: 14.2 s
Wall time: 5.21 s
