In [115]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
import math
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
import pandas as pd
import scipy

import little_mallet_wrapper as lmw

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Clean Data

In [118]:
# 2018

# for each paper -- run once, see which words are not in the dictionary,
# add valid missing words to dictionary
# run again to confirm caught appropriate missing words
# set finish_processing=True to write the cleaned file
def clean_paper_2018(papername, paper_path, finish_processing=False):
    """Clean the extracted text of one 2018 paper in place.

    The file ``<paper_path>/2018/<papername>`` is read, stripped of front
    matter, acknowledgements, references, URLs, punctuation and digits, and
    written back. Every token whose lower-cased form is not in the
    module-level ``dictionary`` set is reported once on stdout.

    Intended workflow: run once, review the printed words, add the valid ones
    to the dictionary, run again, then call with ``finish_processing=True``.

    Args:
        papername: File name of the paper inside the ``2018`` folder.
        paper_path: Root directory that contains the per-year folders.
        finish_processing: When True, the file is finally overwritten with the
            space-joined tokens that are in the dictionary; otherwise only the
            intermediate cleaned text is written.

    Returns:
        None. Nothing happens when the file does not exist.
    """
    full_filename = os.path.join(paper_path, "2018", papername)
    if not os.path.isfile(full_filename):
        return

    # Same encoding as the one used to load dictionary.txt.
    with open(full_filename, 'r', encoding='utf-8') as f:
        cleaned_paper = f.read()

    # Drop authors/editors/proceedings info: everything from the first
    # "Proceedings" through the first "ABSTRACT" that follows it. The title
    # (text before "Proceedings") is kept. partition() cuts only at the first
    # marker, so later mentions of either word do not truncate the body.
    # TODO make sure case insensitive
    if "Proceedings" in cleaned_paper and "ABSTRACT" in cleaned_paper:
        title, _, after_title = cleaned_paper.partition("Proceedings")
        _, abstract_marker, body = after_title.partition("ABSTRACT")
        if abstract_marker:
            cleaned_paper = title + body

    # Remove acknowledgements and everything after them; the first spelling
    # found (in this priority order) wins.
    for heading in ('Acknowledgments', 'Acknowledgements',
                    'ACKNOWLEDGMENTS', 'ACKNOWLEDGEMENTS'):
        if heading in cleaned_paper:
            cleaned_paper = cleaned_paper.split(heading)[0]
            break

    # Remove references, if removing the acknowledgements did not already.
    for heading in ('References', 'REFERENCES'):
        if heading in cleaned_paper:
            cleaned_paper = cleaned_paper.split(heading)[0]
            break

    # Rejoin words hyphenated across a line break.
    cleaned_paper = cleaned_paper.replace("-\n", "")

    # Remove URLs; this is a quick and dirty way to do it.
    cleaned_paper = re.sub(r'http\S+', '', cleaned_paper)

    # Remove punctuation/math symbols (word characters, whitespace, '-' and
    # '/' survive), then digits.
    cleaned_paper = re.sub(r'[^\w\n\s\-\/]+', '', cleaned_paper)
    cleaned_paper = re.sub(r'\d', '', cleaned_paper)

    # Dictionary filter: report each unknown word once, but drop every
    # occurrence of it from the final token list.
    kept_tokens = []
    missed_words = set()
    print("Words removed from: " + papername)
    for token in cleaned_paper.split():
        lowered = token.lower()
        if lowered in dictionary:
            kept_tokens.append(token)
        elif lowered not in missed_words:
            missed_words.add(lowered)
            print(lowered)

    # Always persist the intermediate text so manual edits can start from it.
    print("Writing intermediate out file: {}".format(papername))
    with open(full_filename, 'w', encoding='utf-8') as f:
        f.write(cleaned_paper)

    # Set to true after completing manual cleaning.
    if finish_processing:
        print("Writing final cleaned file: {}\n".format(papername))
        with open(full_filename, 'w', encoding='utf-8') as f:
            f.write(' '.join(kept_tokens))


In [229]:
# 2019 -- copying and modifying above; refactoring this would just be more annoying

# for each paper -- run once, see which words are not in the dictionary,
# add valid missing words to dictionary
# run again to confirm caught appropriate missing words
# set finish_processing=True to write the cleaned file

# ACM permission notice. clean_paper_2019 removes it by exact substring match,
# so the wording and line breaks must match the paper text character for
# character (presumably copied verbatim from an extracted 2019 paper -- confirm).
license1 = """Permission to make digital or hard copies of all or part of this work for personal or
classroom use is granted without fee provided that copies are not made or distributed
for profit or commercial advantage and that copies bear this notice and the full citation
on the first page. Copyrights for components of this work owned by others than ACM
must be honored. Abstracting with credit is permitted. To copy otherwise, or republish,
to post on servers or to redistribute to lists, requires prior specific permission and/or a
fee. Request permissions from permissions@acm.org."""

# Conference footer block, likewise removed by exact substring match in
# clean_paper_2019.
proceedings = """FAT* ’19, January 29–31, 2019, Atlanta, GA, USA
© 2019 Association for Computing Machinery.
ACM ISBN 978-1-4503-6125-5/19/01. . . $15.00"""

def clean_paper_2019(papername, paper_path, finish_processing=False):
    """Clean the extracted text of one 2019 paper in place.

    The file ``<paper_path>/2019/<papername>`` is read, stripped of the
    "ABSTRACT" heading, the ACM licence (``license1``) and proceedings footer
    (``proceedings``), acknowledgements, references, "CCS", URLs, punctuation,
    digits and the "ACM Reference Format" block, and written back. Every token
    whose lower-cased form is not in the module-level ``dictionary`` set is
    reported once on stdout.

    Intended workflow: run once, review the printed words, add the valid ones
    to the dictionary, run again, then call with ``finish_processing=True``.

    Args:
        papername: File name of the paper inside the ``2019`` folder.
        paper_path: Root directory that contains the per-year folders.
        finish_processing: When True, the file is finally overwritten with the
            space-joined tokens that are in the dictionary; otherwise only the
            intermediate cleaned text is written.

    Returns:
        None. Nothing happens when the file does not exist.
    """
    full_filename = os.path.join(paper_path, "2019", papername)
    if not os.path.isfile(full_filename):
        return

    # Same encoding as the one used to load dictionary.txt.
    with open(full_filename, 'r', encoding='utf-8') as f:
        cleaned_paper = f.read()

    # Remove the first "ABSTRACT" heading. partition() cuts only at the first
    # occurrence, so text after a later occurrence is not lost.
    # TODO make sure case insensitive
    if "ABSTRACT" in cleaned_paper:
        before, _, after = cleaned_paper.partition("ABSTRACT")
        cleaned_paper = before + " " + after

    # Remove the licence notice and the proceedings footer (exact matches).
    for boilerplate in (license1, proceedings):
        if boilerplate in cleaned_paper:
            before, _, after = cleaned_paper.partition(boilerplate)
            cleaned_paper = before + after

    # Remove acknowledgements and everything after them; the first spelling
    # found (in this priority order) wins.
    for heading in ('Acknowledgments', 'Acknowledgements',
                    'ACKNOWLEDGMENTS', 'ACKNOWLEDGEMENTS'):
        if heading in cleaned_paper:
            cleaned_paper = cleaned_paper.split(heading)[0]
            break

    # Remove references, if removing the acknowledgements did not already.
    for heading in ('References', 'REFERENCES'):
        if heading in cleaned_paper:
            cleaned_paper = cleaned_paper.split(heading)[0]
            break

    # Rejoin words hyphenated across a line break.
    cleaned_paper = cleaned_paper.replace("-\n", "")
    # Remove ccs.
    cleaned_paper = cleaned_paper.replace("CCS", "")

    # Remove URLs; this is a quick and dirty way to do it.
    cleaned_paper = re.sub(r'http\S+', '', cleaned_paper)

    # Remove punctuation/math symbols (word characters, whitespace, '-' and
    # '/' survive), then digits.
    cleaned_paper = re.sub(r'[^\w\n\s\-\/]+', '', cleaned_paper)
    cleaned_paper = re.sub(r'\d', '', cleaned_paper)

    # Remove the "ACM Reference Format" citation block. Its end marker is
    # spelled the way it looks after punctuation and digits were stripped
    # above. When the end marker is missing the text is left untouched
    # rather than failing, so it can be cleaned up by hand.
    if "ACM Reference Format" in cleaned_paper:
        before, _, after = cleaned_paper.partition("ACM Reference Format")
        _, end_marker, rest = after.partition("""ACM New York NY USA pages""")
        if end_marker:
            cleaned_paper = before + " " + rest

    # Dictionary filter: report each unknown word once, but drop every
    # occurrence of it from the final token list.
    kept_tokens = []
    missed_words = set()
    print("Words removed from: " + papername)
    for token in cleaned_paper.split():
        lowered = token.lower()
        if lowered in dictionary:
            kept_tokens.append(token)
        elif lowered not in missed_words:
            missed_words.add(lowered)
            print(lowered)

    # Always persist the intermediate text so manual edits can start from it.
    print("Writing intermediate out file: {}".format(papername))
    with open(full_filename, 'w', encoding='utf-8') as f:
        f.write(cleaned_paper)

    # Set to true after completing manual cleaning.
    if finish_processing:
        print("Writing final cleaned file: {}\n".format(papername))
        with open(full_filename, 'w', encoding='utf-8') as f:
            f.write(' '.join(kept_tokens))


In [134]:
# Load the word list that the clean_paper_* functions check tokens against.
papers_path_root = '../../Data/TXTs/CleanedPapers_LDA/'

dictionary = set()

with open(papers_path_root + "dictionary.txt", encoding="utf-8") as f:
    # One entry per line: blank lines and any line containing '#' are skipped.
    stripped_lines = (raw_line.strip() for raw_line in f)
    dictionary.update(entry for entry in stripped_lines if entry and "#" not in entry)

In [228]:
# We need to clean one paper at a time to make sure that we are spot-checking everything, sadly. I will do about 30 a day,
# using this cell as a tracker, but I will also keep a CSV.
# When we are done, I will remove this, having updated the code above to clean everything appropriately,
# and will implement a for-loop for reproducibility.

# this is imperfect; we end up with some math stuff and weird spacing from the tables. But I think overall
# it does a pretty good job

# Flip to True to re-run the final cleaning pass on every paper already done.
restart = False

# Tracker of papers that have been cleaned and spot-checked (numbered in order).
# 2018: papers 1-15
finished_2018 = (
    "binns18a",         # 1
    "barabas18a",       # 2
    "buolamwini18a",    # 3
    "burke18a",         # 4
    "chouldechova18a",  # 5
    "datta18a",         # 6
    "dwork18a",         # 7
    "ekstrand18a",      # 8
    "ekstrand18b",      # 9
    "ensign18a",        # 10
    "kamishima18a",     # 11
    "madaan18a",        # 12
    "menon18a",         # 13
    "phillips18a",      # 14
    "speicher18a",      # 15
)
# 2019
# manually remove author list for each 2019 paper
finished_2019 = (
    "3287560.3287561.txt",  # 16
)

if restart:
    for finished_paper in finished_2018:
        clean_paper_2018(finished_paper, papers_path_root, True)
    for finished_paper in finished_2019:
        clean_paper_2019(finished_paper, papers_path_root, True)

# Paper currently being worked on (no final write yet).
clean_paper_2019("3287560.3287562.txt", papers_path_root) # 17

#fns = os.listdir(papers_path_root + "2019")
#fns.sort()
#for fn in fns:
#    print(papers_path_root + "2019/" + fn)
    

Words removed from: 3287560.3287562.txt
gradient-based
two-layer
relu
learning-theoretic
highlights
gradients
complementing
heuristics
reconstructing
query-efficient
computingmethodologiesmachine
saliency
f
wx
vectorw
parametersw
transitions
wreluax
rhd
reluu
maxu
coordinatewise
doi
/
smitha
milli
ludwig
schmidt
anca
dragan
moritz
non-linear
recovers
fromoh
logh
theoh
dh
h
ωdh
enjoys
anti-concentration
exacerbating
twolayer
mnist
cifar
depth-
k-layer
withh
okh
ˆf
hidden-layer
activations
maxai
themodel
andw
useai
i-th
ofa
rowsai
andaj
j
aj
reparameterizations
scalingw
δ
notifies
inodh
дxiwia
дx
iax
separating
hyperplanes
cells
wiai
amatrixz
thatzpi
orzpi
maxzx
learnmodelh
ϵ
recoverz
recoversz
recoverzh
uv
zi
binarysearchtl
tm
xl
tlv
xm
tmv
xr
trv
rdh
xh
rankzx
recovering
дxiwiai
hyperplane
isolates
recoverwiai
betweenulv
andulv
thematrixz
probabilityf
towiai
rowswiai
tiv
zpi
k
tkh
binarysearch
wkiaki
orwkiaki
tki
minj
terminates
tkj
eitherwkiaki
rfx
lv
andv
recovered
recoverswiai
tkl
-

<br><br>

# Load data

In [None]:
# NOTE(review): this module-level `year` is not read by any cell visible here;
# make_paper_df below receives its own `year` argument. Confirm before removing.
year = "2018"

def make_paper_df(year, root_dir_path):
    """Load every paper of one year into a DataFrame.

    Args:
        year: Name of the per-year sub-folder, e.g. ``"2018"``; also stored in
            the ``Year`` column and used as the prefix of each paper ID.
        root_dir_path: Directory that contains the per-year folders.

    Returns:
        A ``pandas.DataFrame`` with one row per regular file and the columns
        ``ID`` (``"<year>-<n>"``, n counted from 1), ``Year``, ``FileName``
        and ``Text`` (full file contents). Sub-directories are skipped.
    """
    dir_name = os.path.join(root_dir_path, year)
    data = []
    paper_number = 1
    # os.listdir returns names in arbitrary order; sorting makes the numbering
    # in the ID column reproducible across runs and machines.
    for filename in sorted(os.listdir(dir_name)):
        full_filename = os.path.join(dir_name, filename)
        # only regular files are papers
        if not os.path.isfile(full_filename):
            continue
        with open(full_filename, 'r', encoding='utf-8') as f:
            file_contents = f.read()
        paper_id = "{}-{}".format(year, paper_number)
        data.append([paper_id, year, filename, file_contents])
        paper_number += 1

    return pd.DataFrame(data, columns=['ID', 'Year', 'FileName', 'Text'])


# One DataFrame per conference year.
df_2018 = make_paper_df('2018', papers_path_root)
df_2019 = make_paper_df('2019', papers_path_root)
df_2020 = make_paper_df('2020', papers_path_root)
df_2021 = make_paper_df('2021', papers_path_root)

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# stacks the frames and renumbers the rows 0..n-1 in one step.
df_all_papers = pd.concat([df_2018, df_2019, df_2020, df_2021], ignore_index=True)

print("2018 total papers: {}".format(len(df_2018.index)))
print("2019 total papers: {}".format(len(df_2019.index)))
print("2020 total papers: {}".format(len(df_2020.index)))
print("2021 total papers: {}".format(len(df_2021.index)))
print("Total papers: {}".format(len(df_all_papers.index)))
    
        

In [None]:
# Peek at three randomly chosen rows as a sanity check of the loaded papers.
# No random_state is passed, so the rows shown differ from run to run.
df_all_papers.sample(3)

<br><br>

# Try topic modeling

Process each paper into chunks and keep track of the year for each chunk.

In [None]:
# Parallel lists, one entry per chunk that is kept for training:
training_texts = []   # pre-processed chunk text fed to the topic model
original_texts = []   # the raw chunk that each training text was derived from
training_years = []   # 'Year' value of the paper the chunk came from
# If want to do topic breakdown / distribution per paper, then add training_ids list
chunk_size = 200              # whitespace-separated tokens per raw chunk
min_leftover_chunk_size = 20  # minimum tokens left after pre-processing to keep a chunk

for _, r in df_all_papers.iterrows():
    # Tokenize once per paper instead of once per chunk.
    _tokens = r['Text'].split()
    _chunks = [' '.join(_tokens[start:start + chunk_size])
               for start in range(0, len(_tokens), chunk_size)]

    for _chunk in _chunks:
        # TODO -- consider custom stop words?
        _processed = lmw.process_string(_chunk, remove_stop_words=True, remove_short_words=False).strip()
        if len(_processed.split()) < min_leftover_chunk_size:
            continue
        # Append the raw and processed chunk together so the three lists stay
        # aligned even when a chunk in the middle of a paper is filtered out.
        training_texts.append(_processed)
        original_texts.append(_chunk)
        training_years.append(r['Year'])

assert len(training_texts) == len(original_texts) == len(training_years)

len(training_texts), len(training_years)

In [None]:
# Topic-model settings passed to lmw.quick_train_topic_model below.
num_topics = 20  
output_directory_path = 'output' 
# Alternative MALLET location kept for reference (external drive):
#path_to_mallet = '/Volumes/Passport-1/packages/mallet-2.0.8/bin/mallet'
# NOTE(review): Python itself does not expand "~"; presumably the wrapper runs
# MALLET through a shell that does -- confirm.
path_to_mallet = "~/mallet-2.0.8/bin/mallet"


In [None]:
# Train the topic model on the chunked texts; the call returns the per-topic
# key words and one topic distribution per training text (checked below).
topic_keys, topic_distributions = lmw.quick_train_topic_model(
    path_to_mallet,
    output_directory_path,
    num_topics,
    training_texts,
)

In [None]:
# Sanity check: exactly one topic distribution per training text.
assert len(topic_distributions) == len(training_texts)

In [None]:
# List each topic index followed by (up to) its first 20 key words.
for _topic_index, _key_words in enumerate(topic_keys):
    _leading_words = _key_words[:20]
    print(_topic_index, '\t', ' '.join(_leading_words))

In [None]:
# For every topic: a banner with its first five key words, then the three
# top-ranked original chunks returned by lmw.get_top_docs with their scores.
_banner_rule = '---------------------------------------------------------------------------'
for _topic in range(num_topics):
    print(_banner_rule)
    print('TOPIC {}: {}'.format(_topic, ' '.join(topic_keys[_topic][:5])))
    print(_banner_rule)
    print()
    _top_docs = lmw.get_top_docs(original_texts, topic_distributions, topic_index=_topic, n=3)
    for _score, _doc_text in _top_docs:
        print(round(_score, 4), _doc_text)
        print()