# Doc2Vec

In [11]:
import sys, os, string, glob, gensim
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize

import gensim.models.doc2vec
assert gensim.models.doc2vec.FAST_VERSION > -1 # This will be painfully slow otherwise
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess


# Import parser module.
# The project-local parser module is expected in the `Scripts` directory that
# sits in the parent of the current working directory; that directory has to
# be on sys.path before the module can be imported.
module_path = os.path.abspath(os.path.join('..'))
scripts_path = os.path.join(module_path, 'Scripts')

# Test for the exact entry being appended (not its parent) so that re-running
# this cell never adds duplicates, and so the Scripts directory is still added
# when the parent directory itself is already on sys.path.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from functions_xml_ET_parse import *

# Declare absolute path.
# Root directory for the project's data. Later cells build paths by plain
# string concatenation (abs_dir + 'Data/JQA/*/*.xml' for the XML corpus and
# abs_dir + 'Data/Output/WordVectors/...' for the saved model), so the value
# must keep its trailing slash.
# NOTE(review): this is a machine-specific location; edit it before running
# the notebook on another machine.
abs_dir = "/Users/quinn.wi/Documents/"

# Define tokenizer.
def fast_tokenize(text):
    """Split ``text`` into lower-cased word tokens with punctuation removed.

    The text is lower-cased, em dashes and newlines are turned into spaces
    (so the words on either side stay separate), every ASCII punctuation
    character and curly quotation mark is deleted, and the result is split
    on whitespace.

    Parameters
    ----------
    text : str
        The raw text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty if no word characters remain.
    """
    # Characters that are dropped outright: ASCII punctuation plus curly quotes.
    strip_chars = string.punctuation + '“' + '”' + '‘' + "’"

    # One translation table does both jobs: map '—' and '\n' to a space,
    # and delete everything listed in strip_chars.
    table = str.maketrans('—\n', '  ', strip_chars)

    return text.lower().translate(table).split()

## Build Dataframe from XML

In [2]:
%%time

"""
Declare variables.
"""

# Declare regex to simplify file paths below
regex = re.compile(r'.*/.*/(.*.xml)')

# Declare document level of file. Requires root starting point ('.').
doc_as_xpath = './/ns:div/[@type="entry"]'

# Declare date element of each document.
date_path = './ns:bibl/ns:date/[@when]'

# Declare person elements in each document.
person_path = './/ns:p/ns:persRef/[@ref]'

# Declare text level within each document.
text_path = './ns:div/[@type="docbody"]/ns:p'

"""
Build dataframe.
"""

dataframe = []

for file in glob.glob(abs_dir + 'Data/JQA/*/*.xml'):
    reFile = str(regex.search(file).group(1))
#         Call functions to create necessary variables and grab content.
    root = get_root(file)
    ns = get_namespace(root)

    for eachDoc in root.findall(doc_as_xpath, ns):
#             Call functions.
        entry = get_document_id(eachDoc, '{http://www.w3.org/XML/1998/namespace}id')
        date = get_date_from_attrValue(eachDoc, date_path, 'when', ns)
        people = get_peopleList_from_attrValue(eachDoc, person_path, 'ref', ns)
        text = get_textContent(eachDoc, text_path, ns)

        dataframe.append([reFile, entry, date, people, text])

dataframe = pd.DataFrame(dataframe, columns = ['file', 'entry', 'date', 'people', 'text'])

dataframe.head(4)

CPU times: user 1.92 s, sys: 81.6 ms, total: 2.01 s
Wall time: 2.21 s


Unnamed: 0,file,entry,date,people,text
0,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-01,1817-10-01,"sullivan-john,coleman-unknown,divoff-unknown,b...",1. IV:30. Wednesday. Wrote a Letter to J. L. S...
1,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-02,1817-10-02,"delaplaine-joseph,waterhouse-benjamin,morris-c...",2. IV: Continued drafting instructions for Rus...
2,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-03,1817-10-03,"harris-levett,nourse-joseph,correa-joseph,jeff...",3. IV: I had visits this morning from Mr Levet...
3,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-04,1817-10-04,"hyde-de-neuville-jean,tingey-thomas,cardelli-p...",4. IV: I waked before three and had afterwards...


## Build doc2vec Model

In [14]:
%%time

# Create corpus.
tagged_docs = dataframe \
    .apply(lambda x: TaggedDocument(simple_preprocess(x.text),
                                   ['doc{}',format(x.entry)]), axis = 1)

training_corpus = tagged_docs.values

# Training
model = Doc2Vec(vector_size = 200, min_count = 4, epochs = 10)

model.build_vocab(training_corpus)

model.train(training_corpus, 
            total_examples = model.corpus_count, 
            epochs = model.epochs)

# Store model.
model.save(abs_dir + 'Data/Output/WordVectors/jqa-d2v.txt')

CPU times: user 34.7 s, sys: 888 ms, total: 35.6 s
Wall time: 16.2 s
