In [2]:
# Imports
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import string
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [4]:
ALL_PAPER_DATA_PATH = '~/Documents/GitHub/datalab2/smac/all_paper_data.xlsx'
FIG_1G_OUT = 'figs/one-gram-timeseries/'
FIG_2G_OUT = 'figs/two-gram-timeseries/'

In [5]:
## Read in Trigger Other sheet from all_paper_data.xlsx
trigger_other = pd.read_excel(ALL_PAPER_DATA_PATH, sheet_name = 'Trigger Other')

In [6]:
# Grab columns we want, clean up the missing values (NA and 0)
by_laws = trigger_other[['t_q9','Trig_date']]
by_laws = by_laws.fillna('NA')
by_laws['t_q9'] = by_laws['t_q9'].replace(0, 'NA')

# Remove punctuation and lower
for index, row in by_laws.iterrows():
    row['t_q9'] = row['t_q9'].translate(str.maketrans('', '', string.punctuation)).lower()

# Convert to list of documents for vectorization
corpus = list(by_laws['t_q9'])

## Onegram Timeseries

In [9]:
# Get count matrix of each one-gram per document, throw into a dataframe
vectorizer = CountVectorizer(ngram_range = (1,1), stop_words='english')
X = vectorizer.fit_transform(corpus)
by_laws_1g = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

In [10]:
# Take a gander at the common ones
onegram_sums = pd.DataFrame(by_laws_1g.sum().sort_values(ascending=False),columns=['count']).reset_index().head(11)
print(onegram_sums)
top10 = list(onegram_sums.loc[1:10,'index'])

        index  count
0   community   3071
1   strangers   2780
2     allowed   2564
3        sick   2087
4         000   1493
5        hand   1339
6     washing   1204
7      burial   1203
8        bush   1169
9        dead   1150
10       fine   1107


In [11]:
# Have a column thats a sum accross all rows - total # of one-grams for that row
# by_laws_1g['total_1grams'] = np.array(by_laws_1g.sum(axis=1))

In [12]:
# Create trigger date column
# by_laws_1g['Trig_date'] = by_laws['Trig_date']
# by_laws_1g = by_laws_1g.groupby('Trig_date').sum().reset_index()

In [13]:
# # Set of words to plot timeseries for
# words = top10

# # plots plots plots
# for word in words:
#     fig = go.Figure()
#     by_laws_1g[word+'_freq'] = by_laws_1g[word]/by_laws_1g['total_1grams']
#     fig.add_trace(go.Scatter(x=by_laws_1g['Trig_date'], y= by_laws_1g[word+'_freq'], name=word, opacity=1))
#     fig.update_layout(
#                   title_text='By-Laws: \"'+word+'\" as Percentage of One-grams',
#                   yaxis_range=[0,0.6],
#                   xaxis_title="Date",
#                   yaxis_title="Percentage of One-grams",
#                   plot_bgcolor = 'rgba(0,0,0,0)',showlegend=True,
#                   font=dict(family="Computer Modern",color = "#000000", size=22))

#     fig.write_image(FIG_1G_OUT+word+'.png', width=800, height=400)

In [15]:
words = top10
for word in words:
    word_in_by_law = []
    for index, row in by_laws.iterrows():
        if word in row['t_q9']:
            word_in_by_law.append(1)
        else:
            word_in_by_law.append(0)
    by_laws[word] = word_in_by_law
by_laws['total_by_laws'] = [1]*len(by_laws)

In [17]:
by_laws['Trig_month'] = by_laws['Trig_date'].dt.strftime('%Y-%m')
by_laws_grouped = by_laws[words+['Trig_month','total_by_laws']].groupby('Trig_month').sum().head()

In [25]:
for word in words:
    by_laws_grouped[word] = by_laws_grouped[word]/by_laws_grouped['total_by_laws']
by_laws_grouped.head()

Unnamed: 0_level_0,strangers,allowed,sick,000,hand,washing,burial,bush,dead,fine,total_by_laws
Trig_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2014-10,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
2014-11,0.406122,0.516327,0.25102,0.114286,0.146939,0.195918,0.157143,0.106122,0.187755,0.128571,490
2014-12,0.303413,0.272727,0.274735,0.137941,0.156008,0.151133,0.160883,0.123028,0.148839,0.13622,3487
2015-01,0.262862,0.279125,0.22738,0.154051,0.202839,0.129805,0.142224,0.143111,0.125074,0.136014,3382
2015-02,0.261278,0.233083,0.25188,0.167293,0.327068,0.133459,0.129699,0.255639,0.165414,0.12218,532


In [26]:
# plots plots plots
for word in words:
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=by_laws_grouped.index, y= by_laws_grouped[word], name=word, opacity=1))
    fig.update_layout(
                  title_text='Percentage of By-Laws Containing \"'+word+'\" per Month',
#                   yaxis_range=[0,1],
                  xaxis_title="Date",
                  yaxis_title="Percentage of By-Laws",
                  plot_bgcolor = 'rgba(0,0,0,0)',showlegend=True,
                  font=dict(family="Computer Modern",color = "#000000", size=22))
    fig.show()
    fig.write_image(FIG_1G_OUT+word+'_bylaws_per_month.png', width=800, height=400)

In [24]:
# plots plots plots
fig = go.Figure()
for word in words:
    fig.add_trace(go.Scatter(x=by_laws_grouped.index, y= by_laws_grouped[word], name=word, opacity=1))
    fig.update_layout(
                  title_text='Percentage of By-Laws Containing \"'+word+'\" per Month',
#                   yaxis_range=[0,1],
                  xaxis_title="Date",
                  yaxis_title="Percentage of By-Laws",
                  plot_bgcolor = 'rgba(0,0,0,0)',showlegend=True,
                  font=dict(family="Computer Modern",color = "#000000", size=22))
#     fig.show()
fig.write_image(FIG_1G_OUT+'top10words_bylaws_per_month.png', width=800, height=400)
# fig.write_image(FIG_1G_OUT+word+'_bylaws_per_month.png', width=800, height=400)



# # plots
# for word in words:
#     fig = go.Figure()
#     fig.add_trace(go.Scatter(x=by_laws_grouped.index, y= by_laws_grouped[word], name=word, opacity=1, 
#                              groupnorm='percent', 
# #                              stackgroup='one'
#                             ))
#     fig.update_layout(
#                   title_text='Percentage of By-Laws Containing \"'+word+'\" per Month',
# #                   yaxis_range=[0,1],
#                   xaxis_title="Date",
#                   yaxis_title="Percentage of By-Laws",
#                   plot_bgcolor = 'rgba(0,0,0,0)',showlegend=True,
#                   font=dict(family="Computer Modern",color = "#000000", size=22))
#     fig.show()

#     fig.write_image(FIG_1G_OUT+word+'_bylaws_per_month.png', width=800, height=400)

## Bigram Timeseries

In [7]:
# Get count matrix of each bi-gram per document, throw into a dataframe
vectorizer = CountVectorizer(ngram_range = (2,2), stop_words='english')
X = vectorizer.fit_transform(corpus)
by_laws_2g = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

In [8]:
# Take a gander at the common ones
by_laws_2g.sum().sort_values(ascending=False).head(10)

strangers allowed    1174
bush meat             867
hand shaking          681
allowed community     676
sick person           643
sick people           635
public gathering      623
eating bush           588
washing dead          573
500 000               513
dtype: int64

In [9]:
# Take a gander at the common ones
twogram_sums = pd.DataFrame(by_laws_2g.sum().sort_values(ascending=False),columns=['count']).reset_index().head(11)
print(twogram_sums)
top10 = list(twogram_sums.loc[0:9,'index'])

                index  count
0   strangers allowed   1174
1           bush meat    867
2        hand shaking    681
3   allowed community    676
4         sick person    643
5         sick people    635
6    public gathering    623
7         eating bush    588
8        washing dead    573
9             500 000    513
10       hand washing    488


In [10]:
words = top10
for word in words:
    word_in_by_law = []
    for index, row in by_laws.iterrows():
        word_split = word.split(' ')
        if word_split[0] in row['t_q9'] and word_split[1] in row['t_q9']:
            word_in_by_law.append(1)
        else:
            word_in_by_law.append(0)
    by_laws[word] = word_in_by_law
by_laws['total_by_laws'] = [1]*len(by_laws)

In [11]:
by_laws['Trig_month'] = by_laws['Trig_date'].dt.strftime('%Y-%m')
by_laws_grouped = by_laws[words+['Trig_month','total_by_laws']].groupby('Trig_month').sum().head()

In [12]:
for word in words:
    by_laws_grouped[word] = by_laws_grouped[word]/by_laws_grouped['total_by_laws']
by_laws_grouped.head()

Unnamed: 0_level_0,strangers allowed,bush meat,hand shaking,allowed community,sick person,sick people,public gathering,eating bush,washing dead,500 000,total_by_laws
Trig_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2014-10,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
2014-11,0.238776,0.085714,0.053061,0.306122,0.112245,0.069388,0.028571,0.07551,0.126531,0.071429,490
2014-12,0.125036,0.095498,0.07657,0.145397,0.099226,0.101807,0.087468,0.065672,0.088041,0.07743,3487
2015-01,0.115021,0.125665,0.113247,0.11916,0.095506,0.059728,0.07806,0.077469,0.059432,0.068303,3382
2015-02,0.103383,0.234962,0.167293,0.090226,0.12594,0.031955,0.077068,0.182331,0.056391,0.073308,532


In [15]:
# Set of bi-grams to plot timeseries for
words = ['strangers allowed','bush meat','hand shaking','public gathering','eating bush','washing dead']

# plots plots plots
for word in words:
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=by_laws_grouped.index, y= by_laws_grouped[word], name=word, opacity=1))
    fig.update_layout(
                  title_text='Percentage of By-Laws Containing \"'+word+'\" per Month',
#                   yaxis_range=[0,0.6],
                  xaxis_title="Date",
                  yaxis_title="Percentage of By-Laws",
                  plot_bgcolor = 'rgba(0,0,0,0)',showlegend=True,
                  font=dict(family="Computer Modern",color = "#000000", size=22))

    fig.write_image(FIG_2G_OUT+word.replace(' ','')+'.png', width=1000, height=500)