# Doc2Vec -- UMAP

In [1]:
import sys, os, string, glob, gensim, umap
import pandas as pd
import numpy as np

import gensim.models.doc2vec
assert gensim.models.doc2vec.FAST_VERSION > -1 # This will be painfully slow otherwise
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected = True)

# Import parser module.
module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path + '//Scripts')

from functions_xml_ET_parse import *

# Declare absolute path.
abs_dir = "/Users/quinn.wi/Documents/"

## Build Dataframe from XML

In [2]:
%%time

"""
Declare variables.
"""

# Declare regex to simplify file paths below
regex = re.compile(r'.*/.*/(.*.xml)')

# Declare document level of file. Requires root starting point ('.').
doc_as_xpath = './/ns:div/[@type="entry"]'

# Declare date element of each document.
date_path = './ns:bibl/ns:date/[@when]'

# Declare person elements in each document.
person_path = './/ns:p/ns:persRef/[@ref]'

# Declare subject elements in each document.
subject_path = './/ns:bibl//ns:subject'

# Declare text level within each document.
text_path = './ns:div/[@type="docbody"]/ns:p'

"""
Build dataframe.
"""

dataframe = []

for file in glob.glob(abs_dir + 'Data/PSC/JQA/*/*.xml'):
    reFile = str(regex.search(file).group(1))
#         Call functions to create necessary variables and grab content.
    root = get_root(file)
    ns = get_namespace(root)

    for eachDoc in root.findall(doc_as_xpath, ns):
#             Call functions.
        entry = get_document_id(eachDoc, '{http://www.w3.org/XML/1998/namespace}id')
        date = get_date_from_attrValue(eachDoc, date_path, 'when', ns)
        people = get_peopleList_from_attrValue(eachDoc, person_path, 'ref', ns)
        subject = get_subject(eachDoc, subject_path, ns)
        text = get_textContent(eachDoc, text_path, ns)

        dataframe.append([reFile, entry, date, people, subject, text])

dataframe = pd.DataFrame(dataframe, columns = ['file', 'entry', 'date', 
                                               'people', 'subject', 'text'])

# Split subject list and return "Multiple-Subject" or lone subject.
dataframe['subject'] = dataframe['subject'].str.split(r',')

def handle_subjects(subj_list):
    if len(subj_list) > 1:
        return 'Multiple-Subjects'
    else:
        return subj_list[0]
    
dataframe['subject'] = dataframe['subject'].apply(handle_subjects)

dataframe.head(4)

CPU times: user 3.54 s, sys: 66.3 ms, total: 3.6 s
Wall time: 3.65 s


Unnamed: 0,file,entry,date,people,subject,text
0,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-01,1808-08-01,"courtdegebelin-antoine,gregory-george,rousseau...",Recreation,"1. Bathed with George this morning, at the pla..."
1,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-02,1808-08-02,"degrand-peter,everett-alexander",Recreation,"2. Bathed again this Morning, and took George ..."
2,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-03,1808-08-03,"degrand-peter,welsh-thomas,davis-john,dawes-th...",Recreation,"3. Bathed this morning, at 6. with Mr: De Gran..."
3,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-04,1808-08-04,"boylston-ward,degrand-peter,adams-louisa-cathe...",Recreation,"4. Mr: Boylston called for me by appointment, ..."


## UMAP

In [3]:
%%time

model = Doc2Vec.load(abs_dir + 'Data/Output/WordVectors/jqa-d2v.txt')
docs = list(model.dv.index_to_key)

data = np.array(model[docs])
reducer = umap.UMAP()
embedding = reducer.fit_transform(data)

x = []
y = []

for e in embedding:
    x.append(e[0])
    y.append(e[1])
    
data_umap = pd.DataFrame({'entry': dataframe['entry'], 'date': dataframe['date'],
                          'subject': dataframe['subject'],
                          'x': x, 'y': y}) 

data_umap.head(3)

CPU times: user 1min 5s, sys: 671 ms, total: 1min 5s
Wall time: 13.8 s


Unnamed: 0,entry,date,subject,x,y
0,jqadiaries-v27-1808-08-01,1808-08-01,Recreation,2.519904,3.684093
1,jqadiaries-v27-1808-08-02,1808-08-02,Recreation,3.295887,2.960022
2,jqadiaries-v27-1808-08-03,1808-08-03,Recreation,2.90769,4.089373


In [4]:
%%time

data_umap.to_csv(abs_dir + 'Data/Output/WordVectors/jqa-d2v-umap.txt', 
                 sep = ',', index = False)

CPU times: user 1.24 s, sys: 75.1 ms, total: 1.31 s
Wall time: 122 ms


## Visualize

In [5]:
# %%time

# # Visualize
# fig = px.scatter(data_umap, x = 'x', y = 'y', 
#                  render_mode = 'webgl')

# fig.show()