In [18]:
%load_ext autoreload
%autoreload 2
import os, json, re, sys, time, warnings, datetime, glob
from html import unescape
import spacy
import pandas as pd
import numpy as np
import altair as alt

import scattertext as st
import agefromname
from tqdm.notebook import tqdm
from IPython.core.display import HTML, display
from IPython.display import IFrame
# Stretch elements with the `.container` CSS class to 98% of the page width.
display(HTML("<style>.container { width:98% !important; }</style>"))
# Enable `progress_apply` on pandas objects (used below when parsing tweets).
tqdm.pandas()
# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Note: the visualizations only render properly in Chrome. Firefox users may see distorted word lists.

### Evolution of the meaning of the word "Woke"
- At most 5K tweets were pulled by querying for "woke" each month since Jan. 2015
- Tweets containing "woke up" are discarded
- We can look at the dynamics of the usage of "woke", and various trending topics during certain epochs



In [15]:
def read_df(file_name, query):
    """Load a tweet CSV and return the cleaned rows with epoch labels attached.

    Parameters
    ----------
    file_name
        Path of the CSV file; handed unchanged to ``pd.read_csv``.
    query
        Value written into the ``Query`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        The ten selected source columns followed by ``Query``, ``UserName``,
        ``DisplayName``, ``Month`` (first seven characters of the raw date
        string), ``Quarter``, ``Third`` and ``Half``.  ``date`` is converted
        with ``pd.to_datetime``.  The row labels of the CSV are preserved.

    Rows are dropped when ``Text`` or ``user`` is not a ``str``, when the
    evaluated ``user`` is not a ``dict``, when the ``(Month, Text)`` pair
    repeats an earlier row, when ``Text`` contains ``'woke up'``, or when
    ``Text`` equals the "account is temporarily unavailable" placeholder.
    """
    wanted_columns = [
        'url', 'date', 'renderedContent', 'user', 'Text',
        'replyCount', 'retweetCount', 'likeCount', 'quoteCount', 'place',
    ]
    placeholder_text = 'account is temporarily unavailable because it violates the twitter media policy. learn more.'

    def epoch_label(stamp, tag, months_per_epoch):
        # e.g. month 5 with 4-month epochs -> '<year>t2'
        return str(stamp.year) + tag + str(1 + ((stamp.month - 1) // months_per_epoch))

    tweets = pd.read_csv(file_name)[wanted_columns]

    # Keep only rows whose text and serialized user record are real strings.
    tweets = tweets[tweets.Text.apply(type) == str].assign(Query=query)
    tweets = tweets[tweets.user.apply(type) == str]

    # NOTE(review): `eval` executes whatever expression is stored in the
    # `user` column -- only run this on CSV files you trust.
    tweets = tweets.assign(user=tweets.user.apply(eval))
    tweets = tweets[tweets.user.apply(type) == dict]

    # `Month` is sliced from the raw date string, so it is derived in the same
    # step that replaces `date` with parsed timestamps.
    tweets = tweets.assign(
        UserName=tweets.user.apply(lambda profile: profile['username']),
        DisplayName=tweets.user.apply(lambda profile: profile['displayname']),
        Month=tweets.date.apply(lambda raw: raw[:7]),
        date=pd.to_datetime(tweets.date),
    )
    tweets = tweets.assign(
        Quarter=tweets.date.apply(lambda stamp: epoch_label(stamp, 'q', 3)),
        Third=tweets.date.apply(lambda stamp: epoch_label(stamp, 't', 4)),
        Half=tweets.date.apply(lambda stamp: epoch_label(stamp, 'h', 6)),
    )

    # First occurrence of each (Month, Text) pair, minus "woke up" tweets.
    unique_pairs = tweets[['Month', 'Text']].drop_duplicates()
    kept_labels = unique_pairs[
        unique_pairs.Text.apply(lambda text: 'woke up' not in text)
    ].index
    tweets = tweets.loc[kept_labels]

    return tweets[tweets.Text != placeholder_text]
# Load the "woke" tweets, attach a whitespace-tokenized parse of the
# HTML-unescaped text (with a progress bar), and order the rows by date.
woke_df = (
    read_df('woke_2015-2021.csv.gz', 'woke')
    .assign(
        Parse=lambda tweets: tweets.Text.apply(unescape).progress_apply(
            st.whitespace_nlp_with_sentences
        )
    )
    .sort_values(by='date')
)

HBox(children=(FloatProgress(value=0.0, max=80308.0), HTML(value='')))




### Data examination - FYI
- I may have under-collected data in 2018 and 2019

In [27]:
# Count tweets per quarter.  The column names are set explicitly through
# `rename_axis` / `reset_index(name=...)` because the names produced by
# `value_counts().reset_index()` differ between pandas 1.x ('index', 'Quarter')
# and pandas 2.x ('Quarter', 'count'); renaming by the old names silently
# yields the wrong columns on newer pandas.
quarter_counts = (
    woke_df.Quarter.value_counts()
    .rename_axis('Quarter')
    .reset_index(name='Tweet Count')
)

alt.Chart(quarter_counts).mark_bar().encode(
    x='Quarter',
    y='Tweet Count'
).properties(
    title='Collected "woke" tweets per quarter'
)

### Ingest data into a Scattertext Corpus
- Retain non-stopwords; eliminate any links, short words, numbers, or "woke" itself
- Include the 2500 words which are most associated with a single epoch (a third of a year), scored using the JS-Divergence, plus the 500 most frequent terms

In [39]:
# Build a corpus with one category per third-of-year epoch, then reduce it
# to stop-listed unigrams.
corpus = st.CorpusFromParsedDocuments(
    woke_df, category_col='Third', parsed_col='Parse'
).build().get_stoplisted_unigram_corpus()

# Matches terms that begin with one or more digits.
number_re = re.compile(r'\d+')

# Drop links, terms shorter than three characters, the query word itself plus
# two filler words, and terms starting with a digit.
corpus = corpus.remove_terms([
    term
    for term in corpus.get_terms()
    if (
        term.startswith('http')
        or len(term) < 3
        or term in ('woke', 'like', 'just')
        or number_re.match(term)
    )
])

# Compact the vocabulary with the association compactor (2500 terms scored
# by delta JS-divergence on once-per-document counts, plus the 500 most
# frequent terms).
corpus = corpus.compact(
    st.AssociationCompactor(
        2500,
        scorer=st.DeltaJSDivergence,
        term_ranker=st.OncePerDocFrequencyRanker,
        include_n_most_frequent_terms=500,
    )
)

### Diachronic Table
- Table
 - Most associated terms with each third of the year (epochs)
 - Ordered by geometric mean of $\ell_1$-penalized logistic regression score for epoch and frequency
- Scatterplot
 - Residual between the DA dispersion measure (considering each epoch as a document) and each word's expected dispersion based on frequency
 - Top half: more evenly spread out over epochs
   - More thematic
 - Bottom half: more concentrated 
   - More trendy 

In [40]:
def get_heading(corpus):
    """Build the per-document HTML heading shown above each tweet.

    Parameters
    ----------
    corpus
        Object exposing ``get_df()``; the returned frame must contain the
        columns ``DisplayName``, ``UserName``, ``url`` and ``date``.

    Returns
    -------
    pandas.Series
        One string per document of the form
        ``<display name>; @<user name>; <a href="<url>"><date></a>`` where
        ``<date>`` is the first whitespace-separated token of ``str(date)``.

    Display names, user names and URLs come from scraped data, so they are
    HTML-escaped before being spliced into markup.  Display names are first
    unescaped so that names which already carry entities are not escaped
    twice.  Non-string values (e.g. missing data) are passed through as-is.
    """
    # Local import: the notebook's import cell only brings in `unescape`.
    from html import escape, unescape

    def _escape_values(series, normalize=False):
        # Escape every str element for safe use in HTML text / attributes.
        def _escape_one(value):
            if not isinstance(value, str):
                return value
            if normalize:
                value = unescape(value)
            return escape(value, quote=True)
        return series.apply(_escape_one)

    df = corpus.get_df()
    day = df['date'].apply(str).str.split().str[0]
    return (
        _escape_values(df['DisplayName'], normalize=True)
        + '; @' + _escape_values(df['UserName'])
        + '; <a href="' + _escape_values(df['url']) + '">'
        + day + '</a>'
    )

# Render the diachronic table + dispersion plot for the "woke" corpus.
html = st.produce_scattertext_table(
    corpus,
    plot_width=1000,
    plot_height=400,
    top_terms_length=9,
    use_full_doc=True,
    metadata=get_heading(corpus),
)
fn = 'woke_diachronic.html'
# Write the page exactly once, as UTF-8 bytes, and close the handle before
# the IFrame loads the file.  (Only `html` is defined in this notebook; no
# other variable holds the rendered table.)
with open(fn, 'wb') as out_file:
    out_file.write(('<h2>Evolution of "woke" on Twitter</h2>' + html).encode('utf-8'))
IFrame(src=fn, width=1300, height=700)


### We can perform an analogous study on "BIPOC" 
- We use very similar preprocessing and can see the evolution of left-wing discourse on Twitter since 2018
- Since we are dealing with a shorter time horizon, quarters are used for epochs

In [41]:
# Load the "BIPOC" tweets and attach a whitespace-tokenized parse of the
# HTML-unescaped text.  `read_df` already parses `date` and derives the
# `Quarter`, `Third` and `Half` labels, so they are not recomputed here
# (recomputing `Half` with a 't' prefix made its labels collide with `Third`).
bipoc_df = read_df(
    'bipoc_post_2018q1.csv.gz', 'bipoc'
).reset_index(drop=True).assign(
    Parse = lambda df: df.Text.apply(unescape).progress_apply(st.whitespace_nlp),
).sort_values(by='date')

HBox(children=(FloatProgress(value=0.0, max=119487.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=119487.0), HTML(value='')))




In [43]:
# Build a corpus with one category per quarter, then reduce it to
# stop-listed unigrams.
corpus = st.CorpusFromParsedDocuments(
    bipoc_df, category_col='Quarter', parsed_col='Parse'
).build().get_stoplisted_unigram_corpus()

# Matches terms that begin with one or more digits.
number_re = re.compile(r'\d+')

# Drop links, terms shorter than three characters, the query word itself plus
# two filler words, and terms starting with a digit.
corpus = corpus.remove_terms([
    term
    for term in corpus.get_terms()
    if (
        term.startswith('http')
        or len(term) < 3
        or term in ('bipoc', 'like', 'just')
        or number_re.match(term)
    )
])

# Compact the vocabulary with the association compactor (2500 terms scored
# by delta JS-divergence on once-per-document counts, plus the 500 most
# frequent terms).
corpus = corpus.compact(
    st.AssociationCompactor(
        2500,
        scorer=st.DeltaJSDivergence,
        term_ranker=st.OncePerDocFrequencyRanker,
        include_n_most_frequent_terms=500,
    )
)

In [44]:
def get_heading(corpus):
    """Build the per-document HTML heading shown above each tweet.

    Parameters
    ----------
    corpus
        Object exposing ``get_df()``; the returned frame must contain the
        columns ``DisplayName``, ``UserName``, ``url`` and ``date``.

    Returns
    -------
    pandas.Series
        One string per document of the form
        ``<display name>; @<user name>; <a href="<url>"><date></a>`` where
        ``<date>`` is the first whitespace-separated token of ``str(date)``.

    Display names, user names and URLs come from scraped data, so they are
    HTML-escaped before being spliced into markup.  Display names are first
    unescaped so that names which already carry entities are not escaped
    twice.  Non-string values (e.g. missing data) are passed through as-is.
    """
    # Local import: the notebook's import cell only brings in `unescape`.
    from html import escape, unescape

    def _escape_values(series, normalize=False):
        # Escape every str element for safe use in HTML text / attributes.
        def _escape_one(value):
            if not isinstance(value, str):
                return value
            if normalize:
                value = unescape(value)
            return escape(value, quote=True)
        return series.apply(_escape_one)

    df = corpus.get_df()
    day = df['date'].apply(str).str.split().str[0]
    return (
        _escape_values(df['DisplayName'], normalize=True)
        + '; @' + _escape_values(df['UserName'])
        + '; <a href="' + _escape_values(df['url']) + '">'
        + day + '</a>'
    )

# Render the diachronic table + dispersion plot for the "BIPOC" corpus.
html = st.produce_scattertext_table(
    corpus,
    plot_width=1000,
    plot_height=400,
    top_terms_length=9,
    use_full_doc=True,
    metadata=get_heading(corpus),
)
fn = 'bipoc_diachronic.html'
# Write the page exactly once, as UTF-8 bytes, and close the handle before
# the IFrame loads the file.  (Only `html` is defined in this notebook; no
# other variable holds the rendered table.)
with open(fn, 'wb') as out_file:
    out_file.write(('<h2>Evolution of "BIPOC" on Twitter</h2>' + html).encode('utf-8'))
IFrame(src=fn, width=1300, height=700)
