In [259]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
import math
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
import pandas as pd
import scipy

import little_mallet_wrapper as lmw

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Clean Data

In [326]:
papers_path_root = '../../Data/TXTs/CleanedPapers_LDA/'

# Vocabulary of accepted words: one entry per line of dictionary.txt.
# Blank lines and any line containing '#' are ignored.
dictionary = set()

with open(papers_path_root + "dictionary.txt", encoding="utf-8") as dictionary_file:
    stripped_entries = (raw_line.strip() for raw_line in dictionary_file)
    dictionary.update(
        entry for entry in stripped_entries if entry and "#" not in entry
    )

# for each paper -- run once, see which words are not in the dictionary,
# add valid missing words to dictionary
# run again to confirm caught appropriate missing words
# set finish_processing=True to write the cleaned file
def _resolve_paper_file(papername, paper_path):
    """Return the path of ``papername`` under ``paper_path``, or None if absent.

    The file is looked up directly inside ``paper_path`` first and then inside
    each immediate sub-directory (in sorted order), which covers the per-year
    layout ``<root>/<year>/<paper>``.
    """
    direct_candidate = os.path.join(paper_path, papername)
    if os.path.isfile(direct_candidate):
        return direct_candidate
    if os.path.isdir(paper_path):
        for entry in sorted(os.listdir(paper_path)):
            nested_candidate = os.path.join(paper_path, entry, papername)
            if os.path.isfile(nested_candidate):
                return nested_candidate
    return None


def _cut_at_first_marker(text, markers):
    """Truncate ``text`` at the first of ``markers`` (priority order) found in it."""
    for marker in markers:
        if marker in text:
            return text.partition(marker)[0]
    return text


def clean_paper(papername, paper_path, finish_processing=False):
    """Clean one paper's text file in place.

    Intended workflow: run once and read the printed list of words that are
    not in the module-level ``dictionary``; add the valid ones to the
    dictionary file; run again to confirm; finally call with
    ``finish_processing=True`` to write the dictionary-filtered text.

    Cleaning steps, in order:
      1. drop the span from the first "Proceedings" through the following
         "ABSTRACT" (the title before "Proceedings" is kept);
      2. drop everything from the acknowledgements heading onwards;
      3. drop everything from the references heading onwards;
      4. rejoin words hyphenated across line breaks, strip URLs, strip
         characters other than word characters, whitespace, '-' and '/',
         and strip digits.

    Args:
        papername: file name of the paper (no directory part).
        paper_path: directory containing the paper, or a root directory whose
            immediate sub-directories contain it.
        finish_processing: when True, the file is finally overwritten with
            only the tokens whose lower-cased form is in ``dictionary``,
            joined by single spaces.

    Returns:
        None. The file is rewritten in place; progress is printed.
    """
    full_filename = _resolve_paper_file(papername, paper_path)
    if full_filename is None:
        # Nothing to clean; say so instead of silently doing nothing.
        print("Paper file not found, nothing cleaned: {}".format(papername))
        return

    with open(full_filename, 'r', encoding='utf-8') as f:
        cleaned_paper = f.read()

    # Remove proceedings info: from "Proceedings" through "ABSTRACT".
    # partition() only looks at the first occurrence, so a later mention of
    # either word in the body neither truncates the paper nor raises.
    # TODO make sure case insensitive
    title, proceedings_marker, after_title = cleaned_paper.partition("Proceedings")
    if proceedings_marker and "ABSTRACT" in after_title:
        cleaned_paper = title + after_title.partition("ABSTRACT")[2]

    # Remove acknowledgements (and everything after them).
    cleaned_paper = _cut_at_first_marker(
        cleaned_paper,
        ('Acknowledgments', 'Acknowledgements', 'ACKNOWLEDGMENTS', 'ACKNOWLEDGEMENTS'),
    )

    # Remove references, if removing the acknowledgements did not already.
    cleaned_paper = _cut_at_first_marker(cleaned_paper, ('References', 'REFERENCES'))

    # Rejoin words that were hyphenated across a line break.
    cleaned_paper = cleaned_paper.replace("-\n", "")

    # Remove URLs; this is a quick and dirty way to do it.
    cleaned_paper = re.sub(r'http\S+', '', cleaned_paper)

    # Remove punctuation/math symbols, then digits.
    cleaned_paper = re.sub(r'[^\w\n\s\-\/]+', '', cleaned_paper)
    cleaned_paper = re.sub(r'\d', '', cleaned_paper)

    # Keep dictionary words only. Every occurrence of an unknown word is
    # dropped, but each unknown word is reported just once.
    missed_words = set()
    kept_tokens = []
    print("Words removed from: " + papername)
    for token in cleaned_paper.split():
        word = token.lower()
        if word in dictionary:
            kept_tokens.append(token)
        elif word not in missed_words:
            missed_words.add(word)
            print(word)

    # Overwrite with the intermediate (not yet dictionary-filtered) text so
    # that the function can be re-run while the dictionary is being curated.
    print("Writing intermediate out file: {}".format(papername))
    with open(full_filename, 'w', encoding='utf-8') as f:
        f.write(cleaned_paper)

    # Set to true after completing manual cleaning.
    if finish_processing:
        print("Writing final cleaned file: {}\n".format(papername))
        with open(full_filename, 'w', encoding='utf-8') as f:
            f.write(' '.join(kept_tokens))


In [354]:
# We need to clean one paper at a time so that every result can be spot-checked, sadly. I will do about 30 a day,
# using this cell as a tracker, but will also keep a CSV.
# When we are done, I will remove this, having updated the code above to clean everything appropriately,
# and will implement a for-loop for reproducibility.

# this is imperfect; we end up with some math stuff and weird spacing from the tables. But I think overall
# it does a pretty good job

# 2018
# Papers that are finalized: each is cleaned with finish_processing=True.
for finished_paper in (
    "binns18a",         # 1
    "barabas18a",       # 2
    "buolamwini18a",    # 3
    "burke18a",         # 4
    "chouldechova18a",  # 5
):
    clean_paper(finished_paper, papers_path_root, True)

# Still pending; these calls are disabled and omit finish_processing.
#clean_paper("datta18a", papers_path_root) # 6 TODO may have some manual math and table cleanup
#clean_paper("dwork18a", papers_path_root) #7 TODO may have some manual math and table cleanup
#clean_paper("ekstrand18a", papers_path_root) # 8 
#clean_paper("ekstrand18b", papers_path_root) # 9 TODO may have some manual math and table cleanup
#clean_paper("ensign18a", papers_path_root) # 10 TODO may have some manual math and table cleanup
#clean_paper("kamishima18a", papers_path_root) # 11 TODO may have some manual math and table cleanup
#clean_paper("madaan18a", papers_path_root) # 12 TODO may have some manual math and table cleanup
#clean_paper("menon18a", papers_path_root) # 13 TODO may have some manual math and table cleanup
#clean_paper("phillips18a", papers_path_root) # 14 TODO may have some manual math and table cleanup
#clean_paper("speicher18a", papers_path_root) # 15 TODO may have some manual math and table cleanup

Words removed from: binns18a
Missed words from binns18a: set()
Writing intermediate out file: binns18a
Writing final cleaned file: binns18a

Words removed from: barabas18a
Missed words from barabas18a: set()
Writing intermediate out file: barabas18a
Writing final cleaned file: barabas18a

Words removed from: buolamwini18a
Missed words from buolamwini18a: set()
Writing intermediate out file: buolamwini18a
Writing final cleaned file: buolamwini18a

Words removed from: burke18a
Missed words from burke18a: set()
Writing intermediate out file: burke18a
Writing final cleaned file: burke18a

Words removed from: chouldechova18a
Missed words from chouldechova18a: set()
Writing intermediate out file: chouldechova18a
Writing final cleaned file: chouldechova18a



<br><br>

# Load data

In [4]:
year = "2018"

def make_paper_df(year, root_dir_path):
    """Load every paper of one year into a DataFrame.

    Args:
        year: name of the year sub-directory (e.g. ``'2018'``); also stored in
            the ``Year`` column and used as the prefix of each paper ID.
        root_dir_path: directory that contains the per-year sub-directories.

    Returns:
        DataFrame with columns ``ID`` (``"<year>-<n>"``, n counting from 1),
        ``Year``, ``FileName`` and ``Text`` (the full file contents), one row
        per regular file in ``<root_dir_path>/<year>``.
    """
    dir_name = os.path.join(root_dir_path, year)
    data = []
    paper_number = 0
    # os.listdir() returns entries in arbitrary order; sort so that the
    # numbering in the ID column is the same on every run and machine.
    for filename in sorted(os.listdir(dir_name)):
        full_filename = os.path.join(dir_name, filename)
        # Only regular files are papers; hidden files (e.g. OS metadata) are not.
        if filename.startswith('.') or not os.path.isfile(full_filename):
            continue
        with open(full_filename, 'r', encoding='utf-8') as f:
            file_contents = f.read()
        paper_number += 1
        paper_id = "{}-{}".format(year, paper_number)
        data.append([paper_id, year, filename, file_contents])

    return pd.DataFrame(data, columns=['ID', 'Year', 'FileName', 'Text'])


# One frame per conference year, loaded from <papers_path_root>/<year>.
df_2018 = make_paper_df('2018', papers_path_root)
df_2019 = make_paper_df('2019', papers_path_root)
df_2020 = make_paper_df('2020', papers_path_root)
df_2021 = make_paper_df('2021', papers_path_root)

# DataFrame.append was removed in pandas 2.0; concat with ignore_index=True
# stacks the years and renumbers the rows 0..n-1 in a single step.
df_all_papers = pd.concat([df_2018, df_2019, df_2020, df_2021], ignore_index=True)

print("2018 total papers: {}".format(len(df_2018.index)))
print("2019 total papers: {}".format(len(df_2019.index)))
print("2020 total papers: {}".format(len(df_2020.index)))
print("2021 total papers: {}".format(len(df_2021.index)))
print("Total papers: {}".format(len(df_all_papers.index)))
    
        

2018 total papers: 15
2019 total papers: 38
2020 total papers: 61
2021 total papers: 71
Total papers: 185


In [37]:
# Spot-check three randomly selected rows of the combined frame.
df_all_papers.sample(3)

Unnamed: 0,ID,Year,FileName,Text
105,2020-53,2020,3351095.3372865.txt,Data in New Delhi‚Äôs Predictive Policing System...
92,2020-40,2020,3351095.3372852.txt,Effect of Confidence and Explanation on Accura...
130,2021-17,2021,3442188.3445916,Censorship of Online Encyclopedias: Implicatio...


<br><br>

# Try topic modeling

Process each paper into chunks and keep track of the year for each chunk.

In [40]:
# Parallel lists, one entry per retained chunk:
#   training_texts -- chunk after lmw.process_string (what the model is trained on)
#   original_texts -- the same chunk before processing (for display)
#   training_years -- 'Year' of the paper the chunk came from
training_texts = []
original_texts = []
training_years = []
# If want to do topic breakdown / distribution per paper, then add training_ids list
chunk_size = 200              # whitespace-separated tokens per chunk
min_leftover_chunk_size = 20  # drop chunks with fewer processed tokens than this

for _, paper in df_all_papers.iterrows():
    # Split once per paper instead of once per chunk.
    paper_tokens = paper['Text'].split()
    for start in range(0, len(paper_tokens), chunk_size):
        original_chunk = ' '.join(paper_tokens[start:start + chunk_size])
        # TODO -- consider custom stop words?
        processed_chunk = lmw.process_string(original_chunk,
                                             remove_stop_words=True,
                                             remove_short_words=False).strip()
        if len(processed_chunk.split()) < min_leftover_chunk_size:
            continue
        # Append the processed chunk together with the original it was made
        # from, so the two lists stay aligned even when chunks are dropped.
        training_texts.append(processed_chunk)
        original_texts.append(original_chunk)
        training_years.append(paper['Year'])

assert len(training_texts) == len(original_texts) == len(training_years)
len(training_texts), len(training_years)

(9790, 9790)

In [41]:
# Settings passed to lmw.quick_train_topic_model in the training cell.
num_topics = 20  
# Relative to the notebook's working directory.
output_directory_path = 'output' 
#path_to_mallet = '/Volumes/Passport-1/packages/mallet-2.0.8/bin/mallet'
# NOTE(review): Python does not expand '~' itself; presumably the wrapper
# launches MALLET through a shell that does -- confirm.
path_to_mallet = "~/mallet-2.0.8/bin/mallet"


In [46]:
# Create the output directory up front so that training also works on a
# fresh checkout where it does not exist yet.
os.makedirs(output_directory_path, exist_ok=True)

# Returns the key words of every topic and the per-document topic distributions.
topic_keys, topic_distributions = lmw.quick_train_topic_model(path_to_mallet,
                                                              output_directory_path,
                                                              num_topics,
                                                              training_texts)

Importing data...
Complete
Training topic model...
Complete


In [47]:
# Sanity check: one topic distribution per training chunk.
assert len(topic_distributions) == len(training_texts)

In [48]:
# List each topic index followed by its first 20 key words.
for topic_number, key_words in enumerate(topic_keys):
    leading_words = ' '.join(key_words[:20])
    print(f"{topic_number} \t {leading_words}")

0 	 NUM dataset figure bias data results datasets accuracy two set group used model gender participants number table using groups different
1 	 NUM users user news recommendation items content recommendations recommender item search diversity articles ranking amazon alt based media online information
2 	 NUM virtual event march facct canada table figure taken value lue average –¥NUM another mean shows way got estimate total
3 	 data research NUM public collection sharing community media health india use privacy technology access participants researchers open communities information mental
4 	 NUM https doi org/NUM www journal NUM/NUM proceedings press new conference acm social data review science human research arxiv university
5 	 word gender latexit bias embeddings sentiment language biases shaNUM words baseNUM wikipedia female male text positive negative social embedding texts
6 	 trust human risk model insurance NUM/NUM trustworthy trustworthiness high technologies low user public 

In [45]:
# For every topic, print a header with its first five key words, followed by
# the three chunks returned by lmw.get_top_docs and their rounded probabilities.
_rule = '---------------------------------------------------------------------------'

for _topic in range(num_topics):
    _headline = 'TOPIC {}: {}'.format(_topic, ' '.join(topic_keys[_topic][:5]))
    print(_rule)
    print(_headline)
    print(_rule)
    print()
    _top_docs = lmw.get_top_docs(original_texts, topic_distributions, topic_index=_topic, n=3)
    for probability, chunk_text in _top_docs:
        print(round(probability, 4), chunk_text)
        print()

---------------------------------------------------------------------------
TOPIC 0: NUM model explanations explanation models
---------------------------------------------------------------------------

0.7959 How can I choose an explainer? An Application-grounded Evaluation of Post-hoc Explanations S√©rgio Jesus Feedzai, DCC-FCUP Universidade do Porto Catarina Bel√©m Feedzai catarina.belem@feedzai.com Vladimir Balayan Feedzai vladimir.balayan@feedzai.com Jo√£o Bento Feedzai joao.bento@feedzai.com Pedro Saleiro Feedzai pedro.saleiro@feedzai.com Pedro Bizarro Feedzai pedro.bizarro@feedzai.com Jo√£o Gama LIAAD, INESCTEC Universidade do Porto ABSTRACT There have been several research works proposing new Explainable AI (XAI) methods designed to generate model explanations having specific properties, or desiderata, such as fidelity, robustness, or human-interpretability. However, explanations are seldom evalu- ated based on their true practical impact on decision-making tasks. Without that