# Doc2Vec -- Visualization

In [8]:
import sys, os, string, glob, gensim, umap
import pandas as pd
import numpy as np

import gensim.models.doc2vec
assert gensim.models.doc2vec.FAST_VERSION > -1 # This will be painfully slow otherwise
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected = True)

# Import parser module.
module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path + '//Scripts')

from functions_xml_ET_parse import *

# Declare absolute path.
abs_dir = "/Users/quinn.wi/Documents/"

## Build Dataframe from XML

In [14]:
%%time

"""
Declare variables.
"""

# Declare regex to simplify file paths below
regex = re.compile(r'.*/.*/(.*.xml)')

# Declare document level of file. Requires root starting point ('.').
doc_as_xpath = './/ns:div/[@type="entry"]'

# Declare date element of each document.
date_path = './ns:bibl/ns:date/[@when]'

# Declare person elements in each document.
person_path = './/ns:p/ns:persRef/[@ref]'

# Declare text level within each document.
text_path = './ns:div/[@type="docbody"]/ns:p'

"""
Build dataframe.
"""

dataframe = []

for file in glob.glob(abs_dir + 'Data/JQA/*/*.xml'):
    reFile = str(regex.search(file).group(1))
#         Call functions to create necessary variables and grab content.
    root = get_root(file)
    ns = get_namespace(root)

    for eachDoc in root.findall(doc_as_xpath, ns):
#             Call functions.
        entry = get_document_id(eachDoc, '{http://www.w3.org/XML/1998/namespace}id')
        date = get_date_from_attrValue(eachDoc, date_path, 'when', ns)
        people = get_peopleList_from_attrValue(eachDoc, person_path, 'ref', ns)
        text = get_textContent(eachDoc, text_path, ns)

        dataframe.append([reFile, entry, date, people, text])

dataframe = pd.DataFrame(dataframe, columns = ['file', 'entry', 'date', 'people', 'text'])

print (dataframe.shape)
dataframe.head(4)

(9116, 5)
CPU times: user 1.95 s, sys: 61.7 ms, total: 2.01 s
Wall time: 2.19 s


Unnamed: 0,file,entry,date,people,text
0,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-01,1817-10-01,"sullivan-john,coleman-unknown,divoff-unknown,b...",1. IV:30. Wednesday. Wrote a Letter to J. L. S...
1,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-02,1817-10-02,"delaplaine-joseph,waterhouse-benjamin,morris-c...",2. IV: Continued drafting instructions for Rus...
2,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-03,1817-10-03,"harris-levett,nourse-joseph,correa-joseph,jeff...",3. IV: I had visits this morning from Mr Levet...
3,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-04,1817-10-04,"hyde-de-neuville-jean,tingey-thomas,cardelli-p...",4. IV: I waked before three and had afterwards...


## UMAP

In [16]:
%%time

model = Doc2Vec.load(abs_dir + 'Data/Output/WordVectors/jqa-d2v.txt')
docs = list(model.dv.index_to_key)

data = np.array(model[docs])
reducer = umap.UMAP()
embedding = reducer.fit_transform(data)

x = []
y = []

for e in embedding:
    x.append(e[0])
    y.append(e[1])
    
data_umap = pd.DataFrame({'x': x, 'y': y}) # 'entry': dataframe['entry'], 

data_umap.head(3)

CPU times: user 1min 7s, sys: 361 ms, total: 1min 7s
Wall time: 7.75 s


Unnamed: 0,x,y
0,0.390249,7.930834
1,3.583296,5.794663
2,3.917478,5.729748


In [15]:
print (len(docs), len(x), len(y), dataframe.shape)


9117 9117 9117 (9116, 5)


## Visualize

In [18]:
%%time

# Visualize
fig = px.scatter(data_umap, x = 'x', y = 'y', 
                 render_mode = 'webgl')

fig.show()

CPU times: user 120 ms, sys: 2.26 ms, total: 122 ms
Wall time: 122 ms
