In [115]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
import math
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
import pandas as pd
import scipy

import little_mallet_wrapper as lmw

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Clean Data

In [702]:
# 2018

# for each paper -- run once, see which words are not in the dictionary,
# add valid missing words to dictionary
# run again to confirm caught appropriate missing words
# set finish_processing=True to write the cleaned file
def clean_paper_2018(papername, paper_path, finish_processing=False):
    """Clean one 2018 paper's text file in place and report unknown words.

    Intended workflow, per paper: run once to see which words are not in the
    module-level ``dictionary``, add the valid ones to the dictionary file,
    run again to confirm, then call with ``finish_processing=True``.

    Args:
        papername: File name of the paper inside the ``2018`` folder.
        paper_path: Root prefix (including the trailing separator) of the
            folder that contains ``2018``; ``paper_path + "2018"`` is used.
        finish_processing: When True, the file is additionally overwritten
            with only the tokens found in ``dictionary``, joined by single
            spaces.

    Side effects:
        Overwrites the paper file and prints each distinct removed word once.
        ``dictionary`` must be defined before this function is called.
    """
    full_filename = os.path.join(paper_path + "2018", papername)
    if not os.path.isfile(full_filename):
        print("Skipping {}: not a file".format(full_filename))
        return

    with open(full_filename, 'r', encoding="utf-8") as f:
        cleaned_paper = f.read()

    # Every step below is a no-op on already-cleaned text, so the function
    # can safely be re-run on its own output.

    # Remove proceedings info: everything from the first "Proceedings" through
    # the first "ABSTRACT" that follows it, keeping the title in front.
    # partition() keeps all text after the marker, unlike split()[1].
    # TODO make sure case insensitive
    title, found_proceedings, after_proceedings = cleaned_paper.partition("Proceedings")
    if found_proceedings:
        _, found_abstract, body = after_proceedings.partition("ABSTRACT")
        if found_abstract:
            cleaned_paper = title + body

    # Remove acknowledgements (and everything after them); first spelling
    # that is present wins.
    for marker in ('Acknowledgments', 'Acknowledgements',
                   'ACKNOWLEDGMENTS', 'ACKNOWLEDGEMENTS'):
        if marker in cleaned_paper:
            cleaned_paper = cleaned_paper.partition(marker)[0]
            break

    # Remove references, if the above didn't already cut them off.
    for marker in ('References', 'REFERENCES'):
        if marker in cleaned_paper:
            cleaned_paper = cleaned_paper.partition(marker)[0]
            break

    # Rejoin words hyphenated across line breaks, i.e. drop "-\n".
    cleaned_paper = cleaned_paper.replace("-\n", "")

    # Remove URLs; this is a quick and dirty way to do it.
    cleaned_paper = re.sub(r'http\S+', '', cleaned_paper)

    # Strip everything that is not a word character, whitespace, '-' or '/'
    # (takes care of most math), then strip digits.
    cleaned_paper = re.sub(r'[^\w\n\s\-\/]+', '', cleaned_paper)
    cleaned_paper = re.sub(r'\d', '', cleaned_paper)

    # Keep dictionary tokens; drop EVERY occurrence of an unknown token but
    # print each distinct unknown word only once.
    print("Words removed from: " + papername)
    missed_words = set()
    kept_tokens = []
    for token in cleaned_paper.split():
        word = token.lower()
        if word in dictionary:
            kept_tokens.append(token)
        elif word not in missed_words:
            missed_words.add(word)
            print(word)

    # Intermediate result: cleaned text before dictionary filtering.
    print("Writing intermediate out file: {}".format(papername))
    with open(full_filename, 'w', encoding="utf-8") as f:
        f.write(cleaned_paper)

    # Set to True after completing manual cleaning.
    if finish_processing:
        print("Writing final cleaned file: {}\n".format(papername))
        with open(full_filename, 'w', encoding="utf-8") as f:
            f.write(' '.join(kept_tokens))


In [1162]:
# 2019 -- copying and modifying above; refactoring this would just be more annoying

# for each paper -- run once, see which words are not in the dictionary,
# add valid missing words to dictionary
# run again to confirm caught appropriate missing words
# set finish_processing=True to write the cleaned file

# ACM permission notice. clean_paper_2019 / clean_paper_2020 remove it only
# when it occurs in a paper verbatim (same wording AND same line breaks), so
# this literal must not be reformatted.
license1 = """Permission to make digital or hard copies of all or part of this work for personal or
classroom use is granted without fee provided that copies are not made or distributed
for profit or commercial advantage and that copies bear this notice and the full citation
on the first page. Copyrights for components of this work owned by others than ACM
must be honored. Abstracting with credit is permitted. To copy otherwise, or republish,
to post on servers or to redistribute to lists, requires prior specific permission and/or a
fee. Request permissions from permissions@acm.org."""

# FAT* '19 proceedings footer, also matched verbatim. The text is specific to
# the 2019 conference, although clean_paper_2020 checks for it as well.
proceedings = """FAT* ’19, January 29–31, 2019, Atlanta, GA, USA
© 2019 Association for Computing Machinery.
ACM ISBN 978-1-4503-6125-5/19/01. . . $15.00"""

def clean_paper_2019(papername, paper_path, finish_processing=False):
    """Clean one 2019 paper's text file in place and report unknown words.

    Intended workflow, per paper: run once to see which words are not in the
    module-level ``dictionary``, add the valid ones to the dictionary file,
    run again to confirm, then call with ``finish_processing=True``.

    Args:
        papername: File name of the paper inside the ``2019`` folder.
        paper_path: Root prefix (including the trailing separator) of the
            folder that contains ``2019``; ``paper_path + "2019"`` is used.
        finish_processing: When True, the file is additionally overwritten
            with only the accepted tokens, joined by single spaces.

    Side effects:
        Overwrites the paper file and prints each distinct removed word once.
        Reads the module-level ``dictionary``, ``messed_up_words``,
        ``garbage``, ``license1`` and ``proceedings``; all must be defined
        before this function is called.
    """
    full_filename = os.path.join(paper_path + "2019", papername)
    if not os.path.isfile(full_filename):
        print("Skipping {}: not a file".format(full_filename))
        return

    with open(full_filename, 'r', encoding="utf-8") as f:
        cleaned_paper = f.read()

    # Every step below is a no-op on already-cleaned text, so the function
    # can safely be re-run on its own output.

    # Drop the first "ABSTRACT" heading, the license text and the proceedings
    # footer. replace(..., 1) removes only the first occurrence and, unlike
    # split()[0] + split()[1], keeps all text after any later occurrence.
    # The license/proceedings match barely works; remove leftovers manually.
    cleaned_paper = cleaned_paper.replace("ABSTRACT", " ", 1)  # TODO make sure case insensitive
    cleaned_paper = cleaned_paper.replace(license1, "", 1)
    cleaned_paper = cleaned_paper.replace(proceedings, "", 1)

    # Remove acknowledgements (and everything after them); first spelling
    # that is present wins.
    for marker in ('Acknowledgments', 'Acknowledgements',
                   'ACKNOWLEDGMENTS', 'ACKNOWLEDGEMENTS'):
        if marker in cleaned_paper:
            cleaned_paper = cleaned_paper.partition(marker)[0]
            break

    # Remove references, if the above didn't already cut them off.
    for marker in ('References', 'REFERENCES'):
        if marker in cleaned_paper:
            cleaned_paper = cleaned_paper.partition(marker)[0]
            break

    # Rejoin words hyphenated across line breaks, i.e. drop "-\n".
    cleaned_paper = cleaned_paper.replace("-\n", "")
    # Remove "CCS" (concepts heading).
    cleaned_paper = cleaned_paper.replace("CCS", "")

    # Remove URLs; this is a quick and dirty way to do it.
    cleaned_paper = re.sub(r'http\S+', '', cleaned_paper)

    # Strip everything that is not a word character, whitespace, '-' or '/'
    # (takes care of most math), then strip digits.
    cleaned_paper = re.sub(r'[^\w\n\s\-\/]+', '', cleaned_paper)
    cleaned_paper = re.sub(r'\d', '', cleaned_paper)

    # Remove the "ACM Reference Format" heading. The citation text that
    # follows it has inconsistent spacing and must be removed manually.
    cleaned_paper = cleaned_paper.replace("ACM Reference Format", " ", 1)

    # Pass 1: repair words that the PDF extraction rendered without their
    # f / fi / ff / fl glyphs.
    tokens = cleaned_paper.split()
    manually_editing = False
    for idx, token in enumerate(tokens):
        word = token.lower()
        if word not in dictionary and word in messed_up_words:
            tokens[idx] = messed_up_words[word]
            manually_editing = True

    if manually_editing:
        print("Editing mis-rendered words")

    # Pass 2: keep dictionary tokens; drop EVERY occurrence of anything else.
    # Known garbage is dropped silently, other unknown words are printed once.
    print("Words removed from: " + papername)
    missed_words = set()
    kept_tokens = []
    for token in tokens:
        word = token.lower()
        if word in dictionary:
            kept_tokens.append(token)
        elif word not in garbage and word not in missed_words:
            missed_words.add(word)
            print(word)

    # Intermediate result: cleaned text before token repair and filtering.
    print("Writing intermediate out file: {}".format(papername))
    with open(full_filename, 'w', encoding="utf-8") as f:
        f.write(cleaned_paper)

    # Set to True after completing manual cleaning.
    if finish_processing:
        print("Writing final cleaned file: {}\n".format(papername))
        with open(full_filename, 'w', encoding="utf-8") as f:
            f.write(' '.join(kept_tokens))



In [1188]:
# 2020 -- copying and modifying above; refactoring this would just be more annoying

# for each paper -- run once, see which words are not in the dictionary,
# add valid missing words to dictionary
# run again to confirm caught appropriate missing words
# set finish_processing=True to write the cleaned file

def clean_paper_2020(papername, paper_path, finish_processing=False):
    """Clean one 2020 paper's text file in place and report unknown words.

    Intended workflow, per paper: run once to see which words are not in the
    module-level ``dictionary``, add the valid ones to the dictionary file,
    run again to confirm, then call with ``finish_processing=True``.

    Args:
        papername: File name of the paper inside the ``2020`` folder.
        paper_path: Root prefix (including the trailing separator) of the
            folder that contains ``2020``; ``paper_path + "2020"`` is used.
        finish_processing: When True, the file is additionally overwritten
            with only the accepted tokens, joined by single spaces.

    Side effects:
        Overwrites the paper file and prints each distinct removed word once.
        Reads the module-level ``dictionary``, ``messed_up_words``,
        ``garbage``, ``license1`` and ``proceedings``; all must be defined
        before this function is called.
    """
    full_filename = os.path.join(paper_path + "2020", papername)
    if not os.path.isfile(full_filename):
        print("Skipping {}: not a file".format(full_filename))
        return

    with open(full_filename, 'r', encoding="utf-8") as f:
        cleaned_paper = f.read()

    # Every step below is a no-op on already-cleaned text, so the function
    # can safely be re-run on its own output.

    # Drop the first "ABSTRACT" heading, the license text and the proceedings
    # footer. replace(..., 1) removes only the first occurrence and, unlike
    # split()[0] + split()[1], keeps all text after any later occurrence.
    # The license/proceedings match barely works; remove leftovers manually.
    cleaned_paper = cleaned_paper.replace("ABSTRACT", " ", 1)  # TODO make sure case insensitive
    cleaned_paper = cleaned_paper.replace(license1, "", 1)
    cleaned_paper = cleaned_paper.replace(proceedings, "", 1)

    # Remove acknowledgements (and everything after them); first spelling
    # that is present wins.
    for marker in ('Acknowledgments', 'Acknowledgements',
                   'ACKNOWLEDGMENTS', 'ACKNOWLEDGEMENTS'):
        if marker in cleaned_paper:
            cleaned_paper = cleaned_paper.partition(marker)[0]
            break

    # Remove references, if the above didn't already cut them off.
    for marker in ('References', 'REFERENCES'):
        if marker in cleaned_paper:
            cleaned_paper = cleaned_paper.partition(marker)[0]
            break

    # Rejoin words hyphenated across line breaks, i.e. drop "-\n".
    cleaned_paper = cleaned_paper.replace("-\n", "")
    # Remove "CCS" (concepts heading).
    cleaned_paper = cleaned_paper.replace("CCS", "")

    # Remove URLs; this is a quick and dirty way to do it.
    cleaned_paper = re.sub(r'http\S+', '', cleaned_paper)

    # Strip everything that is not a word character, whitespace, '-' or '/'
    # (takes care of most math), then strip digits.
    cleaned_paper = re.sub(r'[^\w\n\s\-\/]+', '', cleaned_paper)
    cleaned_paper = re.sub(r'\d', '', cleaned_paper)

    # Pass 1: repair words that the PDF extraction rendered without their
    # f / fi / ff / fl glyphs.
    tokens = cleaned_paper.split()
    manually_editing = False
    for idx, token in enumerate(tokens):
        word = token.lower()
        if word not in dictionary and word in messed_up_words:
            tokens[idx] = messed_up_words[word]
            manually_editing = True

    if manually_editing:
        print("Editing mis-rendered words")

    # Pass 2: keep dictionary tokens; drop EVERY occurrence of anything else.
    # Known garbage is dropped silently, other unknown words are printed once.
    print("Words removed from: " + papername)
    missed_words = set()
    kept_tokens = []
    for token in tokens:
        word = token.lower()
        if word in dictionary:
            kept_tokens.append(token)
        elif word not in garbage and word not in missed_words:
            missed_words.add(word)
            print(word)

    # Intermediate result: cleaned text before token repair and filtering.
    print("Writing intermediate out file: {}".format(papername))
    with open(full_filename, 'w', encoding="utf-8") as f:
        f.write(cleaned_paper)

    # Set to True after completing manual cleaning.
    if finish_processing:
        print("Writing final cleaned file: {}\n".format(papername))
        with open(full_filename, 'w', encoding="utf-8") as f:
            f.write(' '.join(kept_tokens))



In [1324]:
# load up dictionaries

papers_path_root = '../../Data/TXTs/CleanedPapers_LDA/'

# Accepted vocabulary: one word per line of dictionary.txt. Blank lines and
# any line containing '#' are skipped.
dictionary = set()

with open(papers_path_root + "dictionary.txt", encoding="utf-8") as vocab_file:
    stripped_lines = (raw_line.strip() for raw_line in vocab_file)
    dictionary.update(
        entry for entry in stripped_lines if entry and "#" not in entry
    )
# this sucks but some of the PDFs don't render f's and y's correctly
# this is to clean up commonly messed up words, and sometimes also i
# when it's in close proximity to f or y
# Maps a mis-rendered token (lower-cased) to the word it should have been.
# Fixes relative to the earlier version of this table:
#   * 'eective' had lost its own entry: a stray 'ecting' literal with no
#     ':' or ',' was silently concatenated with it into 'ectingeective'.
#   * the plural 'deinitions' was written as a second 'deinition' key, which
#     overrode the singular mapping.
#   * 'reect*' mapped to the non-words 'reeffect*' instead of 'reflect*'.
#   * 'nding(s)' (dropped "fi") mapped to 'ending(s)' instead of 'finding(s)'.
messed_up_words = {
    'ainity': 'affinity',
    'articial': 'artificial',
    'aect': 'affect',
    'aected': 'affected',
    'aects': 'affects',
    'aord': 'afford',
    'aords': 'affords',
    'aorded': 'afforded',
    'application-specic': 'application-specific',
    'artice': 'artifice',
    'artices': 'artifices',
    'beneted': 'benefitted',
    'benets': 'benefits',
    'benecial': 'beneficial',
    'briey': 'briefly',
    'briely': 'briefly',
    'classication': 'classification',
    'classications': 'classifications',
    'classies': 'classifies',
    'classied': 'classified',
    'classier': 'classifier',
    'classiers': 'classifiers',
    'coecient': 'coefficient',
    'coecients': 'coefficients',
    'coeicient': 'coefficient',
    'coeicients': 'coefficients',
    'conicting': 'conflicting',
    'conict': 'conflict',
    'condence': 'confidence',
    'condently': 'confidently',
    'condence-interval': 'confidence-interval',
    'conguration': 'configuration',
    'congurations': 'configurations',
    'condentiality': 'confidentiality',
    'datacation': 'datafication',
    'decient': 'deficient',
    'dene': 'define',
    'dened': 'defined',
    'deined': 'defined',
    'dening': 'defining',
    'denition': 'definition',
    'deinition': 'definition',
    'denitions': 'definitions',
    'deinitions': 'definitions',
    'dicult': 'difficult',
    'diculty': 'difficulty',
    'dier': 'differ',
    'dierence': 'difference',
    'dierences': 'differences',
    'dierent': 'different',
    'difer': 'differ',
    'diferent': 'different',
    'dierently': 'differently',
    'diferently': 'differently',
    'dierentiation': 'differentiation',
    'dierential': 'differential',
    'dierentially': 'differentially',
    'diversied': 'diversified',
    'diversication': 'diversification',
    'eect': 'effect',
    'eective': 'effective',
    'eectively': 'effectively',
    'efectiveness': 'effectiveness',
    'efectively': 'effectively',
    'ecacy': 'efficacy',
    'eects': 'effects',
    'ecient': 'efficient',
    'eicient': 'efficient',
    'eiciently': 'efficiently',
    'eciently': 'efficiently',
    'eciency': 'efficiency',
    'elds': "fields",
    'eort': 'effort',
    'eorts': 'efforts',
    'efort': 'effort',
    'eforts': 'efforts',
    'identication': 'identification',
    'identied': 'identified',
    'identiied': 'identified',
    'identies': 'identifies',
    'identiably': 'identifiably',
    'ierce': 'fierce',
    'ilter': 'filter',
    'iltered': 'filtered',
    'ilters': 'filters',
    'iltering': 'filtering',
    'lter': 'filter',
    'ltered': 'filtered',
    'lters': 'filters',
    'ltering': 'filtering',
    'ind': 'find',
    'indierent': 'indifferent',
    'inecient': 'inefficient',
    'inds': 'finds',
    'inding': 'finding',
    'indings': 'findings',
    'ine-grained': 'fine-grained',
    'inluence': 'influence',
    'innite': 'infinite',
    'insignicant': 'insignificant',
    'insucient': 'insufficient',
    'inuence': 'influence',
    'irst': 'first',
    'ixed': "fixed",
    'jasano': 'jasanoff',
    'justication': 'justification',
    'justications': 'justifications',
    'justied': 'justified',
    'modied': 'modified',
    'nancial': 'financial',
    'nding': 'finding',
    'ndings': 'findings',
    'nite': 'finite',
    'notication': 'notification',
    'oer': 'offer',
    'oers': 'offers',
    'proile': 'profile',
    'proled': 'profiled',
    'proling': 'profiling',
    'quantication': 'quantification',
    'qualied': 'qualified',
    'ramications': 'ramifications',
    'reect': 'reflect',
    'reected': 'reflected',
    'reecting': 'reflecting',
    'rectication': 'rectification',
    'reects': 'reflects',
    'reexive': 'reflexive',
    'satisies': 'satisfies',
    'satises': 'satisfies',
    'satisied': 'satisfied',
    'satised': 'satisfied',
    'scientically': 'scientifically',
    'scientic': 'scientific',
    'signicance': 'significance',
    'signicant': 'significant',
    'signies': 'signifies',
    'signicantly': 'significantly',
    'signiicantly': 'significantly',
    'simplied': 'simplified',
    'specic': 'specific',
    'speciic': 'specific',
    'specied': 'specified',
    'speciically': 'specifically',
    'specically': 'specifically',
    'specicities': 'specificities',
    'specication': 'specification',
    'specications': 'specifications',
    'sub-elds': 'sub-fields',
    'suces': 'suffices',
    'sucient': 'sufficient',
    'stratication': 'stratification',
    'trade-o': 'trade-off',
    'trade-os': 'trade-offs',
    'unied': 'unified',
    'user-specic': 'user-specific',
}

# Lower-cased tokens that clean_paper_2019 / clean_paper_2020 drop silently
# (without printing them as "missed" words). The entries look like leftovers
# of math notation, variable names and table fragments from PDF extraction.
# Some entries are repeated; harmless, since this is a set literal.
garbage = {'f', 'rt', 'ic', 'h', 'hh', '-', '/', 'φ', 'dq','js','cs','nmf','vui','nvui','µi','rui',
    'xu','yi','uiκ','κ','iu','ht','uu','ttt','sntv','sisj','τ','wv','plv','tt','ct','-p','lu','xi','xtu','k',
    'ś','σi','j','βi','wherec','cm','β','βn','w','itemsw','iew', 'aren', 'arem', 'anm','avh','hmt','hasv',
    'ci','iem','gmi','gvi','discvi','χ','discc','cici','fi','discci','α','vj','disccj','wα','vti','inwα',
    'andv','χmax','lbi','lbj','ubj','cj','gς','cti','gv','/m','δ','leastmi/','ln/δ','/αmi','leastk','thanmi/',
    'belowmi/','αmi','valuemi/','valuemi','beyondmi/','λ','λti','v/g','oti','minvti','hti','ˆλti','ˆl','ϵδ','λmin',
    'λmax','lng/δ','/ηϵ','tmax','η','ˆλi','λi','ϵ','dtv','dgv','wαd','ηϵ','pof','pofα','w/χ','/pofα','/pof',
    '/mi','µ','µg','civi/mi','vi/mi','vj/mj','mα','mm','m/m','ys','ici','rn','ey','og','jo','yc','eo','ite',
    'fil','ak','ftw','e_','iti','ia','iro','_t','dj','rio','r_','sig','pp' ,'tanh','exp','sдy','πдyy','π',
    'cu','cy','д','yy','sдyy','xri','дд','yyi','xr','ei','gru','eti','tprдy','дy','gapдy','πдy','not-q','λf','iℓx',
    'af','anduf','useuτ','bτ','toufτ','bfτ','fτ','uτ','aτ','px','ττ','tn','τn','fдl','dl','dx','clτ','cдx','κca',
    'uf','ux','cx','ℓx','ℓxℓx','uτ','groupд','bдf','groupb','lд','fд','lτ','maxcl','maxαl','gτ','bдτ','clℓxτ',
    'σ','byti','ϕ','letai','γ','дai','eqal','дa','cdд','ati','bsi','cgi','bxi','cyi','letn','letz','zz','zi',
    'meanm','rninj','σς','ϕx','φx','xt','sβi','tds','limt','pq','σσ','prt','prσ','prд','prsi','pδ','uqδ','prti',
    'sд','pqδ','pqδpqδ','qδ','pq/ur', 'eq','zj','xth','zxj','ui','θi','λθ','θ','ϵi','дxyh','functionд','xy',
    'andh','xyh','hxah','dh','zy','pxy','exy','дwy','дxy','pyy','zhd','zπ','vz','hπ','hx','zw', 'ε','τ/ε',
    'lett','bt','ε/τ','ℓj','uj','ft','εm','rp','sλx','ψi','ℓu','isλ','λrp','дλ','exℑ','ℓi','cov','τsr','τtpr',
    'τfpr','γq','γsrf','τq','xp','pn','zn','iy','ifz','prℑ','toℑ','ifℑ','qℑi','qℑ','qℑf','wrt','qi','qj',
    'ef','xx','sim','xa','xb','fxxn','eterm','ee','ψ','ψδ','δδ', 'дx','αf','дp','βf','дdp','fpdд','ℓ','hxi', 'µw',
    'ˆδw','ˆδ','µu','µa','µb', 'nw','mxi','eε','vε','mx','dy','αi','pnw','iyi','ηxk','pxi','qti','xk', 'ixi','xm',
    'çx','pxdb','ckb','bx','ppxda','cka','πt','τt','βt','дj','βg','βv','βa','βb','πa','πb','ϵat','ϵbt',
    'bϵbt','πtπt','fβt','πtt','dπt','dπ','δiπei','δi','δk','δiδi','πei','πbaa','дau/дau','/α','πta','faπt',
    'aπt','πtb','taπbπa','πbπa','faπ','lπ','ππ','faππ','faπtπ','πtπ', '𝜒','𝛼','𝑇𝐹','𝑚','𝑙𝑖','𝑙𝑗','𝑡𝑖','𝜎','𝑖-th',
    '𝑖','𝑚𝑛','𝑓','𝑄','𝑋','𝑡','𝜖','𝐸','iff','𝑅','𝐿','𝑛','φ𝑥','𝜙','𝑗','𝑎','𝑏','𝜌','𝑙','𝑥'
           
          }

In [1335]:
# We need to clean one paper at a time to make sure that we are spot-checking everything, sadly.
# I will do about 30 a day, using this cell as a tracker, but will also keep a csv.
# When we are done, I will remove this, having updated the code above to clean everything appropriately,
# and will implement a for-loop for reproducibility.

# this is imperfect; we end up with some math stuff and weird spacing from the tables. But I think overall
# it does a pretty good job

restart = False

if restart:
    # One (cleaner, file names) entry per year, in processing order. The
    # trailing numbers are the running paper index out of 185. Every paper is
    # run with finish_processing=True.
    cleaning_queue = [
        # 2018: papers 1-15 / 185
        (clean_paper_2018, [
            "binns18a", "barabas18a", "buolamwini18a", "burke18a",      # 1-4
            "chouldechova18a", "datta18a", "dwork18a", "ekstrand18a",   # 5-8
            "ekstrand18b", "ensign18a", "kamishima18a", "madaan18a",    # 9-12
            "menon18a", "phillips18a", "speicher18a",                   # 13-15
        ]),
        # 2019: papers 16-53 / 185
        # manually remove author list for each 2019 paper; ACM reference format;
        # license, sometimes page headers (search on "Atlanta")
        (clean_paper_2019, [
            "3287560.3287561.txt", "3287560.3287562.txt", "3287560.3287563.txt",  # 16-18
            "3287560.3287564.txt", "3287560.3287565.txt", "3287560.3287566.txt",  # 19-21
            "3287560.3287567.txt", "3287560.3287568.txt", "3287560.3287569.txt",  # 22-24
            "3287560.3287570.txt", "3287560.3287571.txt", "3287560.3287572.txt",  # 25-27
            "3287560.3287573.txt", "3287560.3287574.txt", "3287560.3287575.txt",  # 28-30
            "3287560.3287576.txt", "3287560.3287577.txt", "3287560.3287578.txt",  # 31-33
            "3287560.3287579.txt", "3287560.3287580.txt", "3287560.3287583.txt",  # 34-36
            "3287560.3287584.txt", "3287560.3287585.txt", "3287560.3287586.txt",  # 37-39
            "3287560.3287587.txt", "3287560.3287588.txt", "3287560.3287589.txt",  # 40-42
            "3287560.3287590.txt", "3287560.3287591.txt", "3287560.3287592.txt",  # 43-45
            "3287560.3287594.txt", "3287560.3287595.txt", "3287560.3287596.txt",  # 46-48
            "3287560.3287597.txt", "3287560.3287598.txt", "3287560.3287599.txt",  # 49-51
            "3287560.3287600.txt", "3287560.3287601.txt",                         # 52-53
        ]),
        # 2020: papers 54-114 / 185
        # manually remove author list for each 2020 paper; ACM reference format;
        # license, sometimes page headers (search on "Barcelona")
        (clean_paper_2020, [
            "3351095.3372824.txt", "3351095.3372826.txt", "3351095.3372827.txt",  # 54-56
            "3351095.3372828.txt", "3351095.3372829.txt", "3351095.3372830.txt",  # 57-59
            "3351095.3372831.txt", "3351095.3372832.txt", "3351095.3372833.txt",  # 60-62
            "3351095.3372834.txt",                                                # 63
        ]),
    ]

    for clean_paper, papernames in cleaning_queue:
        for papername in papernames:
            clean_paper(papername, papers_path_root, True)
    
#clean_paper_2020("3351095.3372824.txt", papers_path_root) #




#fns = os.listdir(papers_path_root + "2020")
#fns.sort()
#for fn in fns:
#    print(papers_path_root + "2020/" + fn)
    

Words removed from: 3351095.3372834.txt
Writing intermediate out file: 3351095.3372834.txt
Writing final cleaned file: 3351095.3372834.txt



<br><br>

# Load data

In [None]:
# NOTE(review): make_paper_df does not read this module-level value (its own
# `year` parameter shadows it); kept in case another cell relies on it.
year = "2018"

def make_paper_df(year, root_dir_path):
    """Load every paper file of one year into a DataFrame.

    Args:
        year: Name of the year sub-folder, e.g. ``"2018"``.
        root_dir_path: Root prefix (including the trailing separator) of the
            folder that contains the year sub-folders; ``root_dir_path + year``
            is the directory that gets listed.

    Returns:
        DataFrame with columns ``ID`` (``"<year>-<n>"``, n starting at 1),
        ``Year``, ``FileName`` and ``Text``; one row per regular file.
        Sub-directories are skipped.
    """
    dir_name = root_dir_path + year
    rows = []
    paper_number = 1
    # os.listdir() returns entries in arbitrary order; sort so that the
    # generated IDs are the same on every run and every machine.
    for filename in sorted(os.listdir(dir_name)):
        full_filename = os.path.join(dir_name, filename)
        if not os.path.isfile(full_filename):
            continue
        with open(full_filename, 'r', encoding="utf-8") as f:
            file_contents = f.read()
        paper_id = "{}-{}".format(year, paper_number)
        rows.append([paper_id, year, filename, file_contents])
        paper_number += 1

    return pd.DataFrame(rows, columns=['ID', 'Year', 'FileName', 'Text'])


# One frame per conference year.
df_2018 = make_paper_df('2018', papers_path_root)
df_2019 = make_paper_df('2019', papers_path_root)
df_2020 = make_paper_df('2020', papers_path_root)
df_2021 = make_paper_df('2021', papers_path_root)

# DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True stacks the frames and renumbers the rows 0..n-1, which
# is what append(...) followed by reset_index(drop=True) used to produce.
df_all_papers = pd.concat([df_2018, df_2019, df_2020, df_2021], ignore_index=True)

print("2018 total papers: {}".format(len(df_2018.index)))
print("2019 total papers: {}".format(len(df_2019.index)))
print("2020 total papers: {}".format(len(df_2020.index)))
print("2021 total papers: {}".format(len(df_2021.index)))
print("Total papers: {}".format(len(df_all_papers.index)))
    
        

In [None]:
# Spot-check three randomly chosen rows. No random_state is passed, so the
# rows shown differ from run to run.
df_all_papers.sample(3)

<br><br>

# Try topic modeling

Process each paper into chunks and keep track of the year for each chunk.

In [None]:
# Parallel lists, one entry per kept chunk:
#   training_texts[k]  - chunk after lmw.process_string
#   original_texts[k]  - the same chunk before processing
#   training_years[k]  - 'Year' of the paper the chunk came from
training_texts = []
original_texts = []
training_years = []
# If want to do topic breakdown / distribution per paper, then add training_ids list
chunk_size = 200
min_leftover_chunk_size = 20

for _, r in df_all_papers.iterrows():
    _words = r['Text'].split()
    _chunks = [' '.join(_words[start:start + chunk_size])
               for start in range(0, len(_words), chunk_size)]
    # TODO -- consider custom stop words?
    for _chunk in _chunks:
        _processed = lmw.process_string(_chunk, remove_stop_words=True, remove_short_words=False).strip()
        # Skip chunks that are too short after processing. The raw and the
        # processed chunk are filtered together so the three lists stay
        # aligned even when a chunk in the middle of a paper is dropped.
        if len(_processed.split()) < min_leftover_chunk_size:
            continue
        training_texts.append(_processed)
        original_texts.append(_chunk)
        training_years.append(r['Year'])

len(training_texts), len(training_years)

In [None]:
# Topic-model settings: MALLET launcher, output folder and number of topics.
#path_to_mallet = '/Volumes/Passport-1/packages/mallet-2.0.8/bin/mallet'
path_to_mallet = "~/mallet-2.0.8/bin/mallet"
output_directory_path = "output"
num_topics = 20


In [None]:
# Train the topic model on the processed chunks via little_mallet_wrapper.
topic_keys, topic_distributions = lmw.quick_train_topic_model(
    path_to_mallet,
    output_directory_path,
    num_topics,
    training_texts,
)

In [None]:
# Sanity check: one topic distribution per training chunk.
assert len(training_texts) == len(topic_distributions)

In [None]:
# List every topic index followed by its first 20 keys.
for topic_index, keys in enumerate(topic_keys):
    leading_keys = ' '.join(keys[:20])
    print(topic_index, '\t', leading_keys)

In [None]:
# For each topic: a banner with its first 5 keys, then the 3 original
# (unprocessed) chunks returned by lmw.get_top_docs with their scores.
for _topic in range(num_topics):
    print('---------------------------------------------------------------------------')
    print('TOPIC ' + str(_topic) + ': ' + ' '.join(topic_keys[_topic][:5]))
    print('---------------------------------------------------------------------------')
    print()
    top_docs = lmw.get_top_docs(original_texts, topic_distributions, topic_index=_topic, n=3)
    for probability, document in top_docs:
        print(round(probability, 4), document)
        print()