# Build Simple word2vec Model -- Sedgwick


* Code adapted from [Laura K. Nelson's GitHub](https://github.com/lknelson/measuring_intersectionality/blob/main/scripts/00_measuringintersectionality_constructmodels.ipynb)

In [1]:
import sys, os, string, glob, gensim, warnings
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize

# Read in config.py (git ignored file) for API username and pw.
# os.path.dirname('config.py') is an empty string, so config_path is simply
# '../Scripts' resolved against the current working directory. That folder is
# appended to sys.path so the import below can find config.py there.
config_path = os.path.abspath(os.path.join(os.path.dirname('config.py'), '../Scripts'))
sys.path.append(config_path)
import config

# Import project-specific functions.
# The parser module is located through sys.path: the ../Scripts folder
# (relative to the current working directory) is appended before the import.
# NOTE(review): lib_path resolves to the same directory as config_path, so
# this second append looks redundant -- confirm before removing.
lib_path = os.path.abspath(os.path.join(os.path.dirname('Correspondence_XML_parser.py'), '../Scripts'))
sys.path.append(lib_path)

# Star import: build_dataframe(), called later in the notebook, comes from here.
from Correspondence_XML_parser import *

# Ignore warnings related to deprecated functions.
# NOTE: filterwarnings('ignore') silences every warning category, not only
# deprecation warnings.
warnings.filterwarnings('ignore')

# Database URL and credentials read from config.py. In the visible part of
# the notebook they are referenced only by the commented-out requests.get call.
url = 'https://dsg.xmldb-dev.northeastern.edu/BaseX964/rest/psc/'
user = config.username
pw = config.password

# Define tokenizer.
def fast_tokenize(text):
    """Lower-case *text*, strip punctuation and split it on whitespace.

    Em dashes and newlines become spaces, so the words on either side stay
    separate. Every other punctuation mark (ASCII punctuation plus curly
    quotes) is deleted outright, which fuses hyphenated or apostrophised
    words into a single token.

    Returns the list of remaining tokens (empty for blank input).
    """
    # Characters that are removed without leaving a gap.
    dropped = dict.fromkeys(
        map(ord, string.punctuation + '“' + '”' + '‘' + "’"), None
    )
    # Characters that act as word breaks.
    breaks = dict.fromkeys(map(ord, '—\n'), ' ')

    # One translation table handles both kinds of character in a single pass.
    table = {**dropped, **breaks}

    return text.lower().translate(table).split()

In [2]:
# Show the installed gensim version; the Word2Vec call later in this notebook
# uses the `vector_size` keyword.
gensim.__version__

'4.0.1'

## Gather XML Files

In [3]:
%%time

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/"
files = glob.glob(abs_dir + "Data/PSC/Sedgwick/*.xml")

len(files)

CPU times: user 1.03 ms, sys: 1.39 ms, total: 2.42 ms
Wall time: 1.35 ms


122

In [4]:
# %%time

# # Must be connected to Northeastern's VPN.
# r = requests.get(url, 
#                  auth = (user, pw), 
#                  headers = {'Content-Type': 'application/xml'}
#                 )

# # Read in contents of pipeline.
# soup = BeautifulSoup(r.content, 'html.parser')

# # Split soup's content by \n (each line is a file path to an XML doc).
# # Use filter() to remove empty strings ('').
# # Convert back to list using list().
# files = list(filter(None, soup.text.split('\n')))

# # Filter list and retrieve only jqa/ files.
# files = [i for i in files if '......./' in i]

# len(files)

## Build Dataframe from XML

In [5]:
%%time
%%time

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser
df = build_dataframe(files)

df.head(3)

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1803-10-06-toPamelaDwightSedgwickF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1809-01-27-toTheodoreSedgwickIFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-25-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1806-01-17-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-29-toPamelaDwightSedgwickFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFSWF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1800-01-12-toTheodoreSedgwickIF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-15-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-28-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-03-24-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/

Unnamed: 0,file,date,source,target,subjects,references,text
0,CMS1819-03-08-toRobertSedgwickIF (1).xml,1819-03-08,Catharine Maria Sedgwick,sedgwick-robert,,"sedgwick-charles,sedgwick-elizabeth,sedgwick-h...",Albany March 8' 1819 -- I came here my dear Ro...
1,CMS1816-03-25-toFrancesSedgwickWatsonF.xml,1816-03-25,Catharine Maria Sedgwick,FSW,,"RSI,banyer-maria,jay-sarah,van vechten-jacob,s...",Albany March 25th 1816 I have just heard of an...
2,CMS1813-08-15-toRobertSedgwickIF.xml,1813-08-15,Catharine Maria Sedgwick,RSI,,"FSW,U,payne-eloise,warner-thomas,warner-france...",Stockbridge August 15th 1813 I recollect very...


## Build w2v Model

In [9]:
%%time

# Convert dataframe text field to list of sentences.
sentences = [sentence for text in df['text'] for sentence in sent_tokenize(text)]
words_by_sentence = [fast_tokenize(sentence) for sentence in sentences]
words_by_sentence = [sentence for sentence in words_by_sentence if sentence != []]

# Get total number of words and unique words.
single_list_of_words = []
for l in words_by_sentence:
    for w in l:
        single_list_of_words.append(w)
print (f'Word total: {len(single_list_of_words)}\nUnique word total {len(set(single_list_of_words))}')

# Build model.
model = gensim.models.Word2Vec(words_by_sentence, window=5, vector_size=100,
                               min_count=1, sg=1, alpha=0.025, batch_words=10000, workers=4)

# Unused arguments:
# size=100, iter=5,

# Save model for later use
model.wv.save_word2vec_format(abs_dir + '/Data/Output/WordVectors/sedgwick_w2v.txt')

Word total: 68087
Unique word total 6797
CPU times: user 1.83 s, sys: 23.1 ms, total: 1.86 s
Wall time: 1 s


## Analysis

In [11]:

# Print the nearest neighbours in the trained vector space for each probe word.
for query in ('independent', 'food'):
    # most_similar() yields (word, score) pairs; only the words are shown.
    neighbours = [word for word, _ in model.wv.most_similar([query])]
    print(f'Words most similar to "{query}":\n', neighbours, '\n')


Words most similar to "independent":
 ['high', 'morgan', 'humble', 'funeral', 'russell', 'surprise', 'history', 'immortal', 'figure', 'pains'] 

Words most similar to "food":
 ['noble', 'doesnot', 'equal', 'essay', 'sainted', 'occupations', 'la', 'ardor', 'steam', 'work'] 

