# Filtering papers by abstract contents

With the abstracts in hand, we can start looking for keywords that pertain to AI ethics, algorithmic fairness, data privacy, and related topics. To do this, we built three keyword lists, each covering a different theme. A paper passes the filter only if its abstract contains at least one keyword from each of the three lists. To generate the lists, we ran a TF-IDF analysis on the focused dataset, took the 100 words with the highest median scores across the corpus, and reached a three-person consensus on which of them to keep or remove. Afterwards, we use the same keyword lists to filter the full dataset containing all the abstracts saved in MAG.

In [1]:
import pandas as pd
import numpy as np

In [17]:
def listAFilter(df, keywords=None):
    """Flag abstracts that contain at least one list-A keyword.

    The frame is processed in 50 contiguous row chunks so that progress can
    be reported after every tenth chunk. Each abstract is lower-cased before
    matching, and the returned frame carries the lower-cased text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Indexed Abstract' column holding the abstract text.
    keywords : iterable of str, optional
        Regular-expression fragments; a row is flagged when any one of them
        is found anywhere in the abstract. Defaults to the module-level
        ``keyA`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same rows, same index) with 'Indexed Abstract'
        lower-cased and a boolean column 'A' added. Rows whose abstract is
        missing are flagged False. ``df`` itself is not modified.
    """
    if keywords is None:
        keywords = keyA
    # The alternation is loop-invariant, so build it once.
    pattern = '|'.join(keywords)

    n_chunks = 50
    n_rows = len(df)
    parts = []
    for i in range(1, n_chunks + 1):
        # Integer boundaries give contiguous slices that cover every row
        # exactly once. Slicing with iloc avoids np.array_split, which
        # relies on the deprecated DataFrame.swapaxes.
        lo = n_rows * (i - 1) // n_chunks
        hi = n_rows * i // n_chunks
        # Work on a copy so the caller's frame is never written to.
        chunk = df.iloc[lo:hi].copy()
        # .str.lower() leaves missing values alone instead of raising.
        chunk['Indexed Abstract'] = chunk['Indexed Abstract'].str.lower()
        chunk['A'] = chunk['Indexed Abstract'].str.contains(pattern, na=False)
        parts.append(chunk)
        if i % 10 == 0:
            print('list ' + str(i) + ' checked')
    # A single concat at the end instead of re-copying the result per chunk.
    return pd.concat(parts)

def listBFilter(df, keywords=None):
    """Flag abstracts that contain at least one list-B keyword.

    The frame is processed in 50 contiguous row chunks so that progress can
    be reported after every tenth chunk. The abstract text is matched as-is
    (case-sensitive) and is not modified.
    NOTE(review): in this notebook the input comes from listAFilter, which
    has already lower-cased the abstracts - confirm before using raw text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Indexed Abstract' column holding the abstract text.
    keywords : iterable of str, optional
        Regular-expression fragments; a row is flagged when any one of them
        is found anywhere in the abstract. Defaults to the module-level
        ``keyB`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same rows, same index) with a boolean column 'B'
        added. Rows whose abstract is missing are flagged False, so the
        column can be used directly as a boolean mask. ``df`` itself is not
        modified.
    """
    if keywords is None:
        keywords = keyB
    # The alternation is loop-invariant, so build it once.
    pattern = '|'.join(keywords)

    n_chunks = 50
    n_rows = len(df)
    parts = []
    for i in range(1, n_chunks + 1):
        # Integer boundaries give contiguous slices that cover every row
        # exactly once. Slicing with iloc avoids np.array_split, which
        # relies on the deprecated DataFrame.swapaxes.
        lo = n_rows * (i - 1) // n_chunks
        hi = n_rows * i // n_chunks
        # Work on a copy so the caller's frame is never written to.
        chunk = df.iloc[lo:hi].copy()
        chunk['B'] = chunk['Indexed Abstract'].str.contains(pattern, na=False)
        parts.append(chunk)
        if i % 10 == 0:
            print('list ' + str(i) + ' checked')
    # A single concat at the end instead of re-copying the result per chunk.
    return pd.concat(parts)

def listCFilter(df, keywords=None):
    """Flag abstracts that contain at least one list-C keyword.

    The frame is processed in 50 contiguous row chunks so that progress can
    be reported after every tenth chunk. The abstract text is matched as-is
    (case-sensitive) and is not modified.
    NOTE(review): in this notebook the input descends from listAFilter,
    which has already lower-cased the abstracts - confirm before using raw
    text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Indexed Abstract' column holding the abstract text.
    keywords : iterable of str, optional
        Regular-expression fragments; a row is flagged when any one of them
        is found anywhere in the abstract. Defaults to the module-level
        ``keyC`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same rows, same index) with a boolean column 'C'
        added. Rows whose abstract is missing are flagged False, so the
        column can be used directly as a boolean mask. ``df`` itself is not
        modified.
    """
    if keywords is None:
        keywords = keyC
    # The alternation is loop-invariant, so build it once.
    pattern = '|'.join(keywords)

    n_chunks = 50
    n_rows = len(df)
    parts = []
    for i in range(1, n_chunks + 1):
        # Integer boundaries give contiguous slices that cover every row
        # exactly once. Slicing with iloc avoids np.array_split, which
        # relies on the deprecated DataFrame.swapaxes.
        lo = n_rows * (i - 1) // n_chunks
        hi = n_rows * i // n_chunks
        # Work on a copy so the caller's frame is never written to.
        chunk = df.iloc[lo:hi].copy()
        chunk['C'] = chunk['Indexed Abstract'].str.contains(pattern, na=False)
        parts.append(chunk)
        if i % 10 == 0:
            print('list ' + str(i) + ' checked')
    # A single concat at the end instead of re-copying the result per chunk.
    return pd.concat(parts)

## Using keywords to filter full set of abstracts

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the inverted-index abstracts from a tab-separated text file.
# Column names are supplied here, and the paper id is stored as an
# unsigned 32-bit integer.
abstract = pd.read_csv(
    '/bgfs/mrfrank/datawarehouse/MAG/NEW_MAG/PaperAbstractsInvertedIndex.txt',
    names=['Paper ID', 'Indexed Abstract'],
    dtype={'Paper ID': np.uint32},
    sep='\t',
)

In [19]:
# Preview the loaded frame.
# NOTE(review): the output saved under this cell shows columns 'A' and 'B',
# which do not exist right after loading - it looks like the cell was last
# run out of order; confirm and re-run from a fresh kernel.
abstract

Unnamed: 0,Paper ID,Indexed Abstract,A,B
28,1963479539,"{""indexlength"":141,""invertedindex"":{""over"":[0]...",True,True
193,1963479783,"{""indexlength"":214,""invertedindex"":{""managing""...",True,True
542,2095588949,"{""indexlength"":141,""invertedindex"":{""education...",True,True
549,1963480270,"{""indexlength"":171,""invertedindex"":{""abstract""...",True,True
827,2335780457,"{""indexlength"":133,""invertedindex"":{""the"":[0],...",True,True
...,...,...,...,...
86501793,2172300558,"{""indexlength"":143,""invertedindex"":{""with"":[0]...",True,True
86502047,27485236,"{""indexlength"":356,""invertedindex"":{""the"":[0,1...",True,True
86502113,2095588424,"{""indexlength"":298,""invertedindex"":{""the"":[0,3...",True,True
86502117,1608941198,"{""indexlength"":607,""invertedindex"":{""abstract\...",True,True


In [5]:
# Keyword lists used by the three filters. Every entry is a regular-expression
# fragment; the filters join a list with '|' and search for it anywhere in the
# abstract text, so stems such as 'ethic' also hit 'ethics' and 'ethical'.

# List A: ethics / values vocabulary.
keyA = [
    'ethic',
    'ethos',
    'moral',
    'value',
    'code',
    'principle',
    'bias',
    'rational',
    'honest',
    'integrity',
    'philosoph',
    'logic',
]

# List B: computing / AI vocabulary.
keyB = [
    'comput',
    'artificial',
    'intelligen',
    'machine learning',
    'algorithm',
    'tech',
    'robot',
    'cyber',
    'information',
    'system',
    # Quoted so that only the token "ai" matches, not every word containing
    # the letters 'ai'. Presumably this relies on the abstracts being stored
    # as JSON with double-quoted tokens - verify against the data.
    '"ai"',
    # Literal "(ai)". Written as a raw string: '\(' is not a valid escape
    # sequence in a normal string literal and triggers a warning.
    r'\(ai\)',
]

# List C: society / law / governance vocabulary.
keyC = [
    'social',
    'society',
    'responsib',
    'law',
    'legal',
    'priva',
    'meaningful',
    'impartial',
    'safe',
    'trust',
    'fair',
    'just',
    'govern',
    'regulat',
    'rule',
]

In [7]:
# Cut the full table into 10 contiguous row slices that together cover every
# row exactly once. Plain iloc slicing is used instead of np.array_split,
# which on a DataFrame goes through the deprecated DataFrame.swapaxes.
n_abstract = len(abstract)
abl = [
    abstract.iloc[n_abstract * k // 10:n_abstract * (k + 1) // 10]
    for k in range(10)
]

In [8]:
# Run the list-A filter over each slice, reporting progress per slice.
resA = []
for i, a in enumerate(abl, start=1):
    flagged = listAFilter(a)
    resA.append(flagged)
    print('dataframe ' + str(i) + ' read')

list A checked
dataframe 1 read
list A checked
dataframe 2 read
list A checked
dataframe 3 read
list A checked
dataframe 4 read
list A checked
dataframe 5 read
list A checked
dataframe 6 read
list A checked
dataframe 7 read
list A checked
dataframe 8 read
list A checked
dataframe 9 read
list A checked
dataframe 10 read


In [9]:
# Stitch the per-slice results back into one frame; it holds the same rows as
# the input plus the boolean 'A' column added by listAFilter.
filtA = pd.concat(resA)

In [10]:
# Inspect the flagged frame: 'A' is True where the abstract matched a list-A keyword.
filtA

Unnamed: 0,Paper ID,Indexed Abstract,A
0,2514067917,"{""indexlength"":138,""invertedindex"":{""a"":[0],""s...",False
1,2196491992,"{""indexlength"":7,""invertedindex"":{""早期発見と治療成績の向...",False
2,1963479517,"{""indexlength"":141,""invertedindex"":{""this"":[0]...",False
3,1963479518,"{""indexlength"":147,""invertedindex"":{""the"":[0,7...",False
4,1963479519,"{""indexlength"":242,""invertedindex"":{""cleavage""...",False
...,...,...,...
86502167,2172300924,"{""indexlength"":148,""invertedindex"":{""curing"":[...",False
86502168,2394483106,"{""indexlength"":161,""invertedindex"":{""objective...",False
86502169,2781472873,"{""indexlength"":197,""invertedindex"":{""an"":[0],""...",False
86502170,350727719,"{""indexlength"":160,""invertedindex"":{""the"":[0],...",False


We retain just under a quarter of the full set of abstracts after the first list.

In [11]:
# Keep only the rows whose abstract matched at least one list-A keyword.
Apass = filtA[filtA['A']]

In [12]:
# Inspect the papers that passed list A.
Apass

Unnamed: 0,Paper ID,Indexed Abstract,A
10,1819117828,"{""indexlength"":207,""invertedindex"":{""in"":[0,45...",True
18,2347208820,"{""indexlength"":82,""invertedindex"":{""agricultur...",True
22,2347208823,"{""indexlength"":39,""invertedindex"":{""susceptibi...",True
25,2342385460,"{""indexlength"":312,""invertedindex"":{""objective...",True
27,2095588557,"{""indexlength"":181,""invertedindex"":{""the"":[0,3...",True
...,...,...,...
86502133,2095588458,"{""indexlength"":91,""invertedindex"":{""summary"":[...",True
86502135,2273582401,"{""indexlength"":149,""invertedindex"":{""neutrophi...",True
86502137,2172300891,"{""indexlength"":181,""invertedindex"":{""filtering...",True
86502143,2419678452,"{""indexlength"":214,""invertedindex"":{""disclosed...",True


In [13]:
# Cut the list-A survivors into 10 contiguous row slices that together cover
# every row exactly once. Plain iloc slicing is used instead of
# np.array_split, which on a DataFrame goes through the deprecated
# DataFrame.swapaxes.
n_apass = len(Apass)
Alist = [
    Apass.iloc[n_apass * k // 10:n_apass * (k + 1) // 10]
    for k in range(10)
]

In [14]:
# Run the list-B filter over each slice of list-A survivors, reporting
# progress per slice.
resB = []
for i, a in enumerate(Alist, start=1):
    flagged = listBFilter(a)
    resB.append(flagged)
    print('dataframe ' + str(i) + ' read')

list B checked
dataframe 1 read
list B checked
dataframe 2 read
list B checked
dataframe 3 read
list B checked
dataframe 4 read
list B checked
dataframe 5 read
list B checked
dataframe 6 read
list B checked
dataframe 7 read
list B checked
dataframe 8 read
list B checked
dataframe 9 read
list B checked
dataframe 10 read


In [15]:
# Stitch the per-slice results back into one frame (list-A survivors plus the
# boolean 'B' column added by listBFilter) and display it.
filtB = pd.concat(resB)
filtB

Unnamed: 0,Paper ID,Indexed Abstract,A,B
10,1819117828,"{""indexlength"":207,""invertedindex"":{""in"":[0,45...",True,True
18,2347208820,"{""indexlength"":82,""invertedindex"":{""agricultur...",True,False
22,2347208823,"{""indexlength"":39,""invertedindex"":{""susceptibi...",True,False
25,2342385460,"{""indexlength"":312,""invertedindex"":{""objective...",True,False
27,2095588557,"{""indexlength"":181,""invertedindex"":{""the"":[0,3...",True,True
...,...,...,...,...
86502133,2095588458,"{""indexlength"":91,""invertedindex"":{""summary"":[...",True,False
86502135,2273582401,"{""indexlength"":149,""invertedindex"":{""neutrophi...",True,False
86502137,2172300891,"{""indexlength"":181,""invertedindex"":{""filtering...",True,True
86502143,2419678452,"{""indexlength"":214,""invertedindex"":{""disclosed...",True,True


In [16]:
# Keep only the rows whose abstract also matched at least one list-B keyword.
Bpass = filtB[filtB['B']]

In [17]:
# Inspect the papers that passed lists A and B.
Bpass

Unnamed: 0,Paper ID,Indexed Abstract,A,B
10,1819117828,"{""indexlength"":207,""invertedindex"":{""in"":[0,45...",True,True
27,2095588557,"{""indexlength"":181,""invertedindex"":{""the"":[0,3...",True,True
28,1963479539,"{""indexlength"":141,""invertedindex"":{""over"":[0]...",True,True
42,2095588566,"{""indexlength"":151,""invertedindex"":{""based"":[0...",True,True
48,2252279066,"{""indexlength"":542,""invertedindex"":{""today"":[0...",True,True
...,...,...,...,...
86502113,2095588424,"{""indexlength"":298,""invertedindex"":{""the"":[0,3...",True,True
86502122,2250173118,"{""indexlength"":44,""invertedindex"":{""ve"":[0],""s...",True,True
86502132,129570569,"{""indexlength"":55,""invertedindex"":{""in"":[0],""t...",True,True
86502137,2172300891,"{""indexlength"":181,""invertedindex"":{""filtering...",True,True


In [18]:
# Cut the list-B survivors into 10 contiguous row slices that together cover
# every row exactly once. Plain iloc slicing is used instead of
# np.array_split, which on a DataFrame goes through the deprecated
# DataFrame.swapaxes.
n_bpass = len(Bpass)
Blist = [
    Bpass.iloc[n_bpass * k // 10:n_bpass * (k + 1) // 10]
    for k in range(10)
]

In [19]:
# Run the list-C filter over each slice of list-B survivors, reporting
# progress per slice.
resC = []
for i, a in enumerate(Blist, start=1):
    flagged = listCFilter(a)
    resC.append(flagged)
    print('dataframe ' + str(i) + ' read')

list C checked
dataframe 1 read
list C checked
dataframe 2 read
list C checked
dataframe 3 read
list C checked
dataframe 4 read
list C checked
dataframe 5 read
list C checked
dataframe 6 read
list C checked
dataframe 7 read
list C checked
dataframe 8 read
list C checked
dataframe 9 read
list C checked
dataframe 10 read


In [20]:
# Stitch the per-slice results back into one frame (list-B survivors plus the
# boolean 'C' column added by listCFilter) and display it.
absfC = pd.concat(resC)
absfC

Unnamed: 0,Paper ID,Indexed Abstract,A,B,C
10,1819117828,"{""indexlength"":207,""invertedindex"":{""in"":[0,45...",True,True,False
27,2095588557,"{""indexlength"":181,""invertedindex"":{""the"":[0,3...",True,True,False
28,1963479539,"{""indexlength"":141,""invertedindex"":{""over"":[0]...",True,True,False
42,2095588566,"{""indexlength"":151,""invertedindex"":{""based"":[0...",True,True,True
48,2252279066,"{""indexlength"":542,""invertedindex"":{""today"":[0...",True,True,True
...,...,...,...,...,...
86502113,2095588424,"{""indexlength"":298,""invertedindex"":{""the"":[0,3...",True,True,False
86502122,2250173118,"{""indexlength"":44,""invertedindex"":{""ve"":[0],""s...",True,True,False
86502132,129570569,"{""indexlength"":55,""invertedindex"":{""in"":[0],""t...",True,True,False
86502137,2172300891,"{""indexlength"":181,""invertedindex"":{""filtering...",True,True,True


In [21]:
# Papers that matched all three lists, indexed by their paper id.
passed_c = absfC['C']
Cpass = absfC.loc[passed_c].set_index('Paper ID')

In [22]:
# Inspect the final selection: papers that passed lists A, B and C.
Cpass

Unnamed: 0_level_0,Indexed Abstract,A,B,C
Paper ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2095588566,"{""indexlength"":151,""invertedindex"":{""based"":[0...",True,True,True
2252279066,"{""indexlength"":542,""invertedindex"":{""today"":[0...",True,True,True
2505312253,"{""indexlength"":1508,""invertedindex"":{""issn:"":[...",True,True,True
2552604832,"{""indexlength"":240,""invertedindex"":{""summary"":...",True,True,True
1963480011,"{""indexlength"":149,""invertedindex"":{""street"":[...",True,True,True
...,...,...,...,...
2394482995,"{""indexlength"":199,""invertedindex"":{""an"":[0],""...",True,True,True
2240119898,"{""indexlength"":605,""invertedindex"":{""introduct...",True,True,True
27485236,"{""indexlength"":356,""invertedindex"":{""the"":[0,1...",True,True,True
2232604002,"{""indexlength"":253,""invertedindex"":{""accurate""...",True,True,True


In [23]:
# Save the final selection; 'Paper ID' is the frame's index here, so it is
# written out as the first CSV column.
Cpass.to_csv('/bgfs/mrfrank/nolan/threelistfilterAllPapers.csv')