# Sentiment Analysis -- Taney

In [1]:
import re, csv, glob, spacy, warnings, sys, os
import pandas as pd
import numpy as np
from textblob import TextBlob

# Make the project-specific helper modules importable.
# The helpers are looked up in '../Scripts' (presumably where
# Correspondence_XML_parser.py lives -- confirm). The path is resolved against
# the current working directory, so start the notebook from its own folder.
lib_path = os.path.abspath('../Scripts')

# Re-running this cell in the same kernel must not pile up duplicate entries.
if lib_path not in sys.path:
    sys.path.append(lib_path)

from Correspondence_XML_parser import *

# Load spaCy's small English pipeline once for the whole notebook.
nlp = spacy.load(name='en_core_web_sm')

# Silence warnings for the rest of the session. No category filter is given,
# so this hides every warning, not only deprecation notices.
warnings.filterwarnings(action='ignore')

## Gather XML Files

In [2]:
%%time

# Root folder of the corpus, declared once to shorten filepaths later.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
abs_dir = "/Users/quinn.wi/Documents/"

# Glob pattern, relative to abs_dir, that matches the XML documents.
input_directory = "Data/PSC/Taney/TaneyXML-Oct2020/*.xml"

# Gather all .xml files using glob.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to keep the row order of the resulting dataframe reproducible across runs.
files = sorted(glob.glob(os.path.join(abs_dir, input_directory)))

CPU times: user 910 µs, sys: 1.14 ms, total: 2.05 ms
Wall time: 1.88 ms


In [None]:
# %%time

# # Must be connected to Northeastern's VPN.
# r = requests.get(url, 
#                  auth = (user, pw), 
#                  headers = {'Content-Type': 'application/xml'}
#                 )

# # Read in contents of pipeline.
# soup = BeautifulSoup(r.content, 'html.parser')

# # Split soup's content by \n (each line is a file path to an XML doc).
# # Use filter() to remove empty strings ('').
# # Convert back to list using list().
# files = list(filter(None, soup.text.split('\n')))

# # Filter list and retrieve only jqa/ files.
# files = [i for i in files if 'jqa/' in i]

# len(files)

## Build Dataframe

In [4]:
%%time

# Parse every gathered XML file into a single dataframe.
# build_dataframe() is provided by Correspondence_XML_parser.
# Alternative call that also passes url/user/pw (kept for reference):
# df = build_dataframe(files, url, user, pw)

df = build_dataframe(files)

# Preview the first three rows.
df.head(n=3)

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00009-collation.xml 

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00021-collation.xml 

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00022-collation.xml 

CPU times: user 18 ms, sys: 3.42 ms, total: 21.4 ms
Wall time: 20.4 ms


Unnamed: 0,file,date,source,target,subjects,references,text
0,RBT00107-collation.xml,1833-09-11,RBT,Ellicott-Thomas,,,Washington Sept. 11. 1833 My Dear Sir I hope ...
1,RBT00110-collation.xml,1833-09-19,RBT,Ellicott-Thomas,Bank War,,Washington Sept. 20th 1833 My Dear Sir I rece...
2,RBT00667-collation.xml,0000-00-00,RBT,Henshaw-David,"Bank of the United States,Treasury",,"October 2nd 183 Sir, It having been intimated..."


## Get Sentiments

In [5]:
%%time

def get_sentiment(txt):
    """Return the TextBlob polarity score of ``txt``.

    Parameters
    ----------
    txt : str
        Text to score.

    Returns
    -------
    float
        ``TextBlob(txt).sentiment.polarity`` (documented by TextBlob as a
        value in [-1.0, 1.0]), or ``float('nan')`` when ``txt`` is not a
        string (e.g. a missing value in the dataframe).
    """
    # TextBlob raises TypeError on non-string input; return NaN instead so a
    # single missing text does not abort scoring of the whole column.
    if not isinstance(txt, str):
        return float('nan')

    return TextBlob(txt).sentiment.polarity

# Score each document's text and store the result in a new 'sentiment' column.
df['sentiment'] = df['text'].map(get_sentiment)

# Preview the first three rows, now including the sentiment score.
df.head(n=3)

CPU times: user 91.3 ms, sys: 2.52 ms, total: 93.8 ms
Wall time: 94.1 ms


Unnamed: 0,file,date,source,target,subjects,references,text,sentiment
0,RBT00107-collation.xml,1833-09-11,RBT,Ellicott-Thomas,,,Washington Sept. 11. 1833 My Dear Sir I hope ...,0.05
1,RBT00110-collation.xml,1833-09-19,RBT,Ellicott-Thomas,Bank War,,Washington Sept. 20th 1833 My Dear Sir I rece...,0.1
2,RBT00667-collation.xml,0000-00-00,RBT,Henshaw-David,"Bank of the United States,Treasury",,"October 2nd 183 Sir, It having been intimated...",0.157273


## Save Data

In [6]:
%%time

# Write the date, file name and sentiment score of every document to the
# lab space, where the visualizations read them from.
output_dir = os.path.abspath('../../lab_space/projects/taney/sentiments/data/')
sentiment_table = df[['date', 'file', 'sentiment']]
sentiment_table.to_csv(output_dir + '/taney_sentiments.csv', sep=',', index=False)

CPU times: user 2.9 ms, sys: 1.62 ms, total: 4.52 ms
Wall time: 4.46 ms
