# Doc2Vec

In [1]:
import sys, os, string, glob, gensim
import re
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize

import gensim.models.doc2vec
assert gensim.models.doc2vec.FAST_VERSION > -1 # This will be painfully slow otherwise
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess


# Import parser module.
# The parser helpers are looked up in a "Scripts" folder inside the parent of
# the current working directory.
module_path = os.path.abspath(os.path.join('..'))
scripts_path = os.path.join(module_path, 'Scripts')

# Guard on the exact entry that gets appended, so re-running this cell does
# not keep adding duplicate entries to sys.path.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from functions_xml_ET_parse import *

# Declare absolute path.
# NOTE(review): this is a machine-specific location; every data/output path
# below is built from it, so it must be edited when running elsewhere.
abs_dir = "/Users/quinn.wi/Documents/"

# Define tokenizer.
def fast_tokenize(text):
    """Lowercase ``text``, drop punctuation, and split it into word tokens.

    Em dashes and newlines are converted to spaces first, so the words on
    either side of them remain separate tokens.  Every ASCII punctuation
    character and the four curly quotation marks are then deleted outright
    (not replaced by a space), which means "it's" becomes "its".

    Returns a list of strings; an empty or punctuation-only input yields [].
    """
    # Deletion table covering ASCII punctuation plus the curly quotes.
    strip_table = str.maketrans('', '', string.punctuation + '“' + '”' + '‘' + "’")

    # Normalise case and turn hard separators into plain spaces.
    normalized = text.lower().replace('—', ' ').replace('\n', ' ')

    # Remove punctuation, then split on any run of whitespace.
    return normalized.translate(strip_table).split()

## Build Dataframe from XML

In [2]:
%%time

# ---------------------------------------------------------------------------
# Declare variables.
# ---------------------------------------------------------------------------

# Declare regex to simplify file paths below: captures the file name that
# follows the last '/' of a path containing at least two '/' separators.
# The dot before "xml" is escaped so it only matches a literal '.'.
regex = re.compile(r'.*/.*/(.*\.xml)')

# Declare document level of file. Requires root starting point ('.').
doc_as_xpath = './/ns:div/[@type="entry"]'

# Declare date element of each document.
date_path = './ns:bibl/ns:date/[@when]'

# Declare person elements in each document.
person_path = './/ns:p/ns:persRef/[@ref]'

# Declare subject elements in each document.
subject_path = './/ns:bibl//ns:subject'

# Declare text level within each document.
text_path = './ns:div/[@type="docbody"]/ns:p'

# ---------------------------------------------------------------------------
# Build dataframe.
# ---------------------------------------------------------------------------

# Rows are gathered in a plain list and converted to a DataFrame once at the
# end, so `dataframe` is only ever bound to a DataFrame.
records = []

# glob returns paths in arbitrary order; sorting keeps the row order (and
# therefore everything derived from it) stable between runs.
for file in sorted(glob.glob(abs_dir + 'Data/PSC/JQA/*/*.xml')):
    # Reduce the full path to the bare file name; fall back to the basename
    # if the path does not have the shape the regex expects.
    match = regex.search(file)
    reFile = str(match.group(1)) if match else os.path.basename(file)

    # Call functions to create necessary variables and grab content.
    # (get_root / get_namespace come from the star-imported parser module.)
    root = get_root(file)
    ns = get_namespace(root)

    for eachDoc in root.findall(doc_as_xpath, ns):
        # Call functions: one value per output column for this entry.
        entry = get_document_id(eachDoc, '{http://www.w3.org/XML/1998/namespace}id')
        date = get_date_from_attrValue(eachDoc, date_path, 'when', ns)
        people = get_peopleList_from_attrValue(eachDoc, person_path, 'ref', ns)
        subject = get_subject(eachDoc, subject_path, ns)
        text = get_textContent(eachDoc, text_path, ns)

        records.append([reFile, entry, date, people, subject, text])

dataframe = pd.DataFrame(records, columns = ['file', 'entry', 'date',
                                             'people', 'subject', 'text'])

# Break each comma-delimited subject string into a list of subjects; the
# lists are collapsed to a single label in the step that follows.
subject_lists = dataframe['subject'].str.split(r',')
dataframe['subject'] = subject_lists

def handle_subjects(subj_list):
    """Collapse a list of subjects into a single label.

    Parameters
    ----------
    subj_list : list of str
        The subjects of one entry, as produced by splitting the
        comma-delimited subject string.

    Returns
    -------
    str or None
        'Multiple-Subjects' when more than one subject is present, the lone
        subject when there is exactly one, and None when there is nothing to
        report (an empty list, or a missing value such as None/NaN, which is
        what ``Series.str.split`` yields for a missing subject).
    """
    # Missing values (None / float NaN) have no length; treat them as
    # "no subject" instead of letting the whole .apply() call fail.
    try:
        count = len(subj_list)
    except TypeError:
        return None

    if count == 0:
        return None
    if count > 1:
        return 'Multiple-Subjects'
    return subj_list[0]
    
# Replace each subject list with its single label (see handle_subjects).
collapsed_subjects = dataframe['subject'].apply(handle_subjects)
dataframe['subject'] = collapsed_subjects

# Preview the first four entries.
dataframe.head(4)

CPU times: user 3.97 s, sys: 82.9 ms, total: 4.05 s
Wall time: 4.1 s


Unnamed: 0,file,entry,date,people,subject,text
0,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-01,1808-08-01,"courtdegebelin-antoine,gregory-george,rousseau...",Recreation,"1. Bathed with George this morning, at the pla..."
1,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-02,1808-08-02,"degrand-peter,everett-alexander",Recreation,"2. Bathed again this Morning, and took George ..."
2,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-03,1808-08-03,"degrand-peter,welsh-thomas,davis-john,dawes-th...",Recreation,"3. Bathed this morning, at 6. with Mr: De Gran..."
3,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-04,1808-08-04,"boylston-ward,degrand-peter,adams-louisa-cathe...",Recreation,"4. Mr: Boylston called for me by appointment, ..."


## Build doc2vec Model

In [3]:
%%time

# Create corpus: one TaggedDocument per dataframe row, holding the
# simple_preprocess tokens of the entry text and tagged with the entry id.
tagged_docs = dataframe.apply(
    lambda x: TaggedDocument(simple_preprocess(x.text), [f'{x.entry}']),
    axis = 1)

training_corpus = tagged_docs.values

# Training
model = Doc2Vec(vector_size = 200, min_count = 4, epochs = 10)

model.build_vocab(training_corpus)

model.train(training_corpus,
            total_examples = model.corpus_count,
            epochs = model.epochs)

# Store model.
# NOTE(review): despite the ".txt" suffix this is written with gensim's own
# save(), so it is presumably not a plain-text file; the name is kept as-is
# so anything that already loads this path keeps working.
model_path = abs_dir + 'Data/Output/WordVectors/jqa-d2v.txt'

# Make sure the target folder exists so a long training run is not lost to a
# missing-directory error at save time.
os.makedirs(os.path.dirname(model_path), exist_ok = True)
model.save(model_path)

CPU times: user 1min 23s, sys: 2.28 s, total: 1min 25s
Wall time: 38.3 s
