# Subject Flow Chart

In [2]:
# Import necessary libraries.
import re, nltk, warnings, csv, sys, os, pickle, string
import pandas as pd
import numpy as np
import seaborn as sns
from itertools import chain
from scipy import stats
import matplotlib.pyplot as plt

# Base directory, prepended to relative paths so later filepaths stay short.
abs_dir = "/Users/quinn.wi/Documents/"

# Load the tab-separated subjects table, discard the 'file' and 'text' columns,
# then drop every row that still has a missing value in a remaining column.
df = pd.read_csv(abs_dir + 'Data/Output/ParsedXML/JQA_Subjects-dataframe.txt', sep = '\t')
df = df.drop(columns = ['file', 'text'])
df = df.dropna()

# Turn each comma-separated 'subject' string into a list, emit one row per
# heading, and trim the whitespace left around each heading by the split.
df = df.assign(subject = df['subject'].str.split(',')).explode('subject')
df['subject'] = df['subject'].str.strip()

# Discard rows whose heading is exactly "The".
df = df[df['subject'] != 'The']

print (f'Number of unique subject headings: {len(df["subject"].unique())}\nDF Shape: {df.shape}')

# Preview the first rows of the unnested frame.
df.head()

Number of unique subject headings: 70
DF Shape: (5913, 3)


Unnamed: 0,entry,date,subject
0,jqadiaries-v30-1817-10-01,1817-10-01,Adams Family Residences
0,jqadiaries-v30-1817-10-01,1817-10-01,Commerce
1,jqadiaries-v30-1817-10-02,1817-10-02,Foreign Relations
1,jqadiaries-v30-1817-10-02,1817-10-02,Health and Illness
1,jqadiaries-v30-1817-10-02,1817-10-02,South American Wars of Independence


In [20]:
%%time

# Parse 'date' as datetimes and derive calendar month and year columns.
# Dates must match YYYY-MM-DD; a malformed value raises here rather than being
# passed through unparsed (errors = 'ignore' is deprecated in recent pandas
# and would only postpone the failure to the `.dt` accessor below).
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Count how many rows each subject heading has in each year.
# `reset_index(name = 'count')` names the count column directly, so the result
# always has exactly the columns year, subject, count. The earlier form
# (as_index = False, `.size()`, `.reset_index()`, then assigning three column
# names) relied on `.size()` returning a Series; newer pandas returns a
# DataFrame there, so the extra `.reset_index()` adds an 'index' column and
# the three-name assignment fails with a length mismatch.
subjects = df.groupby(['year', 'subject']) \
    .size() \
    .reset_index(name = 'count')

subjects

CPU times: user 8.53 ms, sys: 1.49 ms, total: 10 ms
Wall time: 8.5 ms


Unnamed: 0,year,subject,count
0,1817,Adams Family Finances,14
1,1817,Adams Family Relations,1
2,1817,Adams Family Residences,17
3,1817,Adams-Onis Treaty,4
4,1817,African Americans,2
...,...,...,...
391,1826,Treaty of Ghent,2
392,1826,U.S. Constitution,1
393,1826,Unitarianism,1
394,1826,War of 1812,2


In [23]:
%%time

# Write the per-year subject counts to a comma-separated file, leaving out
# the DataFrame's row index.
subjects.to_csv(
    abs_dir + 'GitHub/dsg-mhs/lab_space/data/subjects/subject-year-count.csv',
    index = False,
    sep = ','
)

CPU times: user 1.82 ms, sys: 1.04 ms, total: 2.87 ms
Wall time: 2.1 ms
