In [33]:
from spacy.lang.en.stop_words import STOP_WORDS
from pprint import pprint

import json
import nltk
import os
import spacy
import string
import re

from gensim import corpora
from gensim import models
from tqdm import tqdm
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

FILTER_TABLE_OF_CONTENTS = re.compile('\.{4,}')

def preprocess(text):
    text = FILTER_TABLE_OF_CONTENTS.sub('...', text)
    return text

nlp = spacy.load("en_core_web_trf")
nlp.max_length = 16000000

author = 'sunstein'
in_corpus = 'lapdf'

if author == 'sunstein':
    years = list(range(1987, 2006))
    out_file = open(f'results/{author}_{in_corpus}_ldaseq_analysis.txt', 'w')
else:
    years = list(range(2004, 2015))
    out_file = open(f'results/{author}_ldaseq_analysis.txt', 'w')

print('Reading text')
words = []
if author == 'sunstein':
    for file in tqdm(os.listdir('sunstein/processed/')):
        if file.endswith(f'_{in_corpus}.txt'):
            in_text = open('sunstein/processed/' + file).read()
            in_text = preprocess(in_text)
            for token in tokenizer.tokenize(in_text):
                new_token = tokenizer.convert_tokens_to_string(token)
                if new_token not in STOP_WORDS and new_token not in string.punctuation:
                    words.append(new_token)
else:
    with open('becker-posner.json') as f:
        data = json.load(f)
        for article in tqdm(data):
            for token in tokenizer.tokenize(article['text']):
                new_token = tokenizer.convert_tokens_to_string(token)
                if new_token not in STOP_WORDS and new_token not in string.punctuation:
                    words.append(new_token)

all_year_sents = []
for year in years:
    year_sents = []
    text = ''
    if author == 'sunstein':
        for file in tqdm(os.listdir('sunstein/processed/')):
            if file[:4] == str(year) and file.endswith(f'_{in_corpus}.txt'):
                in_text = open('sunstein/processed/' + file).read()
                in_text = preprocess(in_text)
                for token in tokenizer.tokenize(in_text):
                    new_token = tokenizer.convert_tokens_to_string(token)
                    if new_token not in STOP_WORDS and new_token not in string.punctuation:
                        year_sents.append(new_token)

    else:
        with open('becker-posner.json') as f:
            data = json.load(f)
            for article in tqdm(data):
                if article['date'][-4:] == str(year):
                    in_text = tokenizer.tokenize(article['text'])
                    in_text = preprocess(in_text)
                    for token in tokenizer.tokenize(in_text):
                        new_token = tokenizer.convert_tokens_to_string(token)
                        if new_token not in STOP_WORDS and new_token not in string.punctuation:
                            year_sents.append(new_token)
    all_year_sents.append(year_sents)

Reading text


  0%|          | 0/1432 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (7631 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1432/1432 [00:13<00:00, 109.88it/s]
100%|██████████| 1432/1432 [00:00<00:00, 4967.75it/s]
100%|██████████| 1432/1432 [00:00<00:00, 7737.12it/s]
100%|██████████| 1432/1432 [00:00<00:00, 6164.66it/s]
100%|██████████| 1432/1432 [00:00<00:00, 4902.44it/s]
100%|██████████| 1432/1432 [00:00<00:00, 3964.80it/s]
100%|██████████| 1432/1432 [00:00<00:00, 3589.31it/s]
100%|██████████| 1432/1432 [00:00<00:00, 2881.13it/s]
100%|██████████| 1432/1432 [00:00<00:00, 2193.45it/s]
100%|██████████| 1432/1432 [00:00<00:00, 1981.08it/s]
100%|██████████| 1432/1432 [00:01<00:00, 992.27it/s]
100%|██████████| 1432/1432 [00:00<00:00, 2898.75it/s]
100%|██████████| 1432/1432 [00:00<00:00, 2157.19it/s]
100%|██████████| 1432/1432 [00:01<00:00, 988.54it/s]
100%|██████████| 1432/14

In [None]:
from gensim.models import ldaseqmodel
# Convert sentences to bags of words
print('Creating dictionary')
full_dict = corpora.Dictionary([words])
print('Converting to bag of words')
bow = [full_dict.doc2bow(year) for year in all_year_sents]
print('Creating LdaSeqModel')

time_slice = [1]*len(years)
ldaseq = ldaseqmodel.LdaSeqModel(corpus=bow, id2word=full_dict, time_slice=time_slice, num_topics=5)

Creating dictionary
Converting to bag of words
Creating LdaSeqModel


In [17]:
ldaseq.print_topic_times(topic=0)

[[('.', 0.05185395646260935),
  ('Ġof', 0.035908535805602236),
  (',', 0.03451389386695718),
  ('Ċ', 0.030984589120270273),
  ('Ġand', 0.024991255954984765),
  ('/', 0.02442428455756526),
  (':', 0.022812417759789212),
  ('ĠLaw', 0.018498162200059683),
  ('Ġthe', 0.014141482862053039),
  ('ĠSun', 0.013928882271134183),
  ('ĠR', 0.013909658484096726),
  ('Ġbe', 0.013867692834365931),
  ('stein', 0.013850194902929101),
  ('Ġis', 0.01265531653009322),
  ('or', 0.012585479154714673),
  ('2', 0.012274212459282265),
  ('-', 0.011193790090756559),
  ('Ġby', 0.009519442528811655),
  ('Sand', 0.009229857917197123),
  ('Co', 0.009151827624721026)],
 [('.', 0.052277995808665385),
  ('Ġof', 0.03533595627290811),
  (',', 0.034648366845968066),
  ('Ċ', 0.031225497592052794),
  ('Ġand', 0.02509350297262424),
  ('/', 0.02472921268359554),
  (':', 0.023079110775626584),
  ('ĠLaw', 0.01859872025942108),
  ('Ġthe', 0.01413946158128513),
  ('ĠSun', 0.013973448128527506),
  ('ĠR', 0.013953212828135605),
  