In [157]:
# Imports
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import string
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [159]:
ALL_PAPER_DATA_PATH = '~/Documents/GitHub/datalab2/smac/all_paper_data.xlsx'
FIG_1G_OUT = 'figs/one-gram-timeseries/'
FIG_2G_OUT = 'figs/two-gram-timeseries/'

In [76]:
## Read in Trigger Other sheet from all_paper_data.xlsx
trigger_other = pd.read_excel(ALL_PAPER_DATA_PATH, sheet_name = 'Trigger Other')

In [160]:
# Grab columns we want, clean up the missing values (NA and 0)
by_laws = trigger_other[['t_q9','Trig_date']]
by_laws = by_laws.fillna('NA')
by_laws['t_q9'] = by_laws['t_q9'].replace(0, 'NA')

# Remove punctuation and lower
for index, row in by_laws.iterrows():
    row['t_q9'] = row['t_q9'].translate(str.maketrans('', '', string.punctuation)).lower()

# Convert to list of documents for vectorization
corpus = list(by_laws['t_q9'])

## Onegram Timeseries

In [161]:
# Get count matrix of each one-gram per document, throw into a dataframe
vectorizer = CountVectorizer(ngram_range = (1,1), stop_words='english')
X = vectorizer.fit_transform(corpus)
by_laws_1g = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

In [162]:
# Take a gander at the common ones
by_laws_1g.sum().sort_values(ascending=False).head(10)

community    3071
strangers    2780
allowed      2564
sick         2087
000          1493
hand         1339
washing      1204
burial       1203
bush         1169
dead         1150
dtype: int64

In [163]:
# Have a column thats a sum accross all rows - total # of one-grams ocurrences for that row
by_laws_1g['total_1grams'] = np.array(by_laws_1g.sum(axis=1))

In [164]:
# Create trigger date column
by_laws_1g['Trig_date'] = by_laws['Trig_date']
by_laws_1g = by_laws_1g.groupby('Trig_date').sum().reset_index()

In [165]:
# Set of words to plot timeseries for
words = ['community','strangers','burial','gathering','hand','bush']

# plots plots plots
for word in words:
    fig = go.Figure()
    by_laws_1g[word+'_freq'] = by_laws_1g[word]/by_laws_1g['total_1grams']
    fig.add_trace(go.Scatter(x=by_laws_1g['Trig_date'], y= by_laws_1g[word+'_freq'], name=word, opacity=1))
    fig.update_layout(
                  title_text='By-Laws: \"'+word+'\" as Percentage of One-grams',
                  yaxis_range=[0,0.6],
                  xaxis_title="Date",
                  yaxis_title="Percentage of One-grams",
                  plot_bgcolor = 'rgba(0,0,0,0)',showlegend=True,
                  font=dict(family="Computer Modern",color = "#000000", size=22))

    fig.write_image(FIG_1G_OUT+word+'.pdf', width=800, height=400)

## Bigram Timeseries

In [166]:
# Get count matrix of each bi-gram per document, throw into a dataframe
vectorizer = CountVectorizer(ngram_range = (2,2), stop_words='english')
X = vectorizer.fit_transform(corpus)
by_laws_2g = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

In [167]:
# Take a gander at the common ones
by_laws_2g.sum().sort_values(ascending=False).head(10)

strangers allowed    1174
bush meat             867
hand shaking          681
allowed community     676
sick person           643
sick people           635
public gathering      623
eating bush           588
washing dead          573
500 000               513
dtype: int64

In [168]:
# Have a column thats a sum accross all rows - total # of one-grams ocurrences for that row
by_laws_2g['total_2grams'] = np.array(by_laws_2g.sum(axis=1))

In [169]:
# Create trigger date column
by_laws_2g['Trig_date'] = by_laws['Trig_date']
by_laws_2g = by_laws_2g.groupby('Trig_date').sum().reset_index()

In [173]:
# Set of bi-grams to plot timeseries for
words = ['strangers allowed','bush meat','hand shaking','public gathering','eating bush','washing dead']

# plots plots plots
for word in words:
    fig = go.Figure()
    by_laws_2g[word+'_freq'] = by_laws_2g[word]/by_laws_2g['total_2grams']
    fig.add_trace(go.Scatter(x=by_laws_2g['Trig_date'], y= by_laws_2g[word+'_freq'], name=word, opacity=1))
    fig.update_layout(
                  title_text='By-Laws: \"'+word+'\" as Percentage of Bi-grams',
                  yaxis_range=[0,0.6],
                  xaxis_title="Date",
                  yaxis_title="Percentage of Bi-grams",
                  plot_bgcolor = 'rgba(0,0,0,0)',showlegend=True,
                  font=dict(family="Computer Modern",color = "#000000", size=22))

    fig.write_image(FIG_2G_OUT+word.replace(' ','')+'.pdf', width=800, height=400)