In [1]:
import re             # regular expressions
import string         # string operations
import os             # access directories
import pandas as pd   # dataframes
from tqdm import tqdm # create progress bar (for i in tqdm(list))
tqdm.pandas()         # use progress_apply() instead of apply() for progress bar on pandas operations
import spacy          # Natural Language Processing library
from spaczz.matcher import FuzzyMatcher # fuzzy keyword matching

# Make the data folder the working directory so the bare filenames used by the
# read_csv / to_csv / to_json calls below resolve.
# NOTE: the path is relative to the current working directory, so this cell is
# not idempotent - re-running it without restarting the kernel resolves
# '../Data/UoE' against the directory that was already changed into.
os.chdir('../Data/UoE')

# Load and prepare data

In [5]:
# Load the staff publications/theses table and derive a normalised copy of each
# abstract for string matching.
df = pd.read_csv('UoE_staff_publications_theses.csv')

# re.escape() is required: string.punctuation contains `\`, `]`, `^` and `-`,
# all of which are metacharacters inside a regex character class. Unescaped,
# the `\]` pair is parsed as an escaped `]`, so literal backslashes were never
# replaced (and `-` only worked because `,-.` happens to be a valid range).
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))

# Every punctuation character becomes a space, the text is lower-cased, and one
# space is added on each side (presumably so space-delimited phrases can also
# match at the very start/end of an abstract). Missing abstracts stay missing.
df['abstracts_matching'] = ' ' + df.abstracts.str.replace(punctuation_pattern, ' ', regex = True).str.lower() + ' '
df.head()

Unnamed: 0,id,college,department,name,role,profile,namelast,namefirst,identifier1,identifier2,publications,authors,dates,titles,abstracts,urls,fields,types,abstracts_matching
0,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Fezzi, Carlo', 'Bateman, IJ'",2015-02-04,The Impact of Climate Change on Agriculture: N...,Ricardian (hedonic) analyses of the impact of ...,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article,ricardian hedonic analyses of the impact of...
1,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Bateman, IJ', 'Harwood, Amii R.', 'Mace, Geor...",2013-10-25,Ecosystem services: response,,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article,
2,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Bateman, IJ', 'Agarwala, M', ""Bad'ura, T""",2014-01-08,Pollinator declines: Avoid pitfalls of consens...,,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article,
3,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Bateman, IJ', 'Harwood, Amii R.', 'Mace, Geor...",2013-07-05,Bringing ecosystem services into economic deci...,Landscapes generate a wide range of valuable e...,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article,landscapes generate a wide range of valuable ...
4,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Bateman, IJ', 'Agarwala, M', 'Binner, A', 'Co...",2016-06-22,Spatially explicit integrated modeling and eco...,We present an integrated model of the direct c...,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article,we present an integrated model of the direct ...


In [6]:
# Load the list of research methods and build the normalised matching key.
methods = pd.read_csv('methods.csv')

# re.escape() is required: string.punctuation contains `\`, `]`, `^` and `-`,
# all of which are metacharacters inside a regex character class. Unescaped,
# the `\]` pair is parsed as an escaped `]`, so literal backslashes were never
# replaced (and `-` only worked because `,-.` happens to be a valid range).
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))

# Punctuation becomes a space and the result is padded with one space on each
# side, e.g. 'a/b test' -> ' a b test '.
methods['method_matching'] = ' ' + methods.method_lower.str.replace(punctuation_pattern, ' ', regex = True) + ' '

# Keep one row per distinct matching key; the index is reset so that rows are
# numbered 0..n-1 (the matcher set-up below looks rows up by that number).
m = methods[['method', 'method_lower', 'method_matching']].drop_duplicates('method_matching').reset_index(drop = True)
m.head()

Unnamed: 0,method,method_lower,method_matching
0,Abduction,abduction,abduction
1,A/b test,a/b test,a b test
2,Accelerated longitudinal design,accelerated longitudinal design,accelerated longitudinal design
3,Action research,action research,action research
4,Activity theory,activity theory,activity theory


# spaczz: FuzzyMatcher

In [9]:
# Build the fuzzy matcher: one pattern per de-duplicated method.
# The display name (`method`) is used as the match label and the lower-cased
# name (`method_lower`) as the text to search for.
nlp = spacy.blank("en")
# min_r2=90 is forwarded to spaczz as a matcher default; see the spaczz docs for
# the exact meaning of this threshold in the installed version.
matcher = FuzzyMatcher(nlp.vocab, min_r2=90)
for label, pattern_text in zip(m.method, m.method_lower):
    matcher.add(label, [nlp(pattern_text)])

In [11]:
# Run every abstract through the spaCy pipeline so the matcher can work on Doc
# objects. (Originally annotated as taking ~1 hour; the progress bar saved with
# this cell shows about 53 seconds for 31,142 rows.)
def _to_doc(abstract):
    """Return `nlp(abstract)`, or the value unchanged when the abstract is missing."""
    if pd.notnull(abstract):
        return nlp(abstract)
    return abstract

df['abstract_nlp'] = df.abstracts.progress_apply(_to_doc)

100%|████████████████████████████████████| 31142/31142 [00:53<00:00, 580.29it/s]


In [12]:
# Run the fuzzy matcher over every pre-processed abstract.
# WARNING: this is the expensive step - the saved progress bar shows roughly
# 19 hours 21 minutes for 31,142 rows (about 2.2 s per abstract).
def _match_methods(doc):
    """Return the matcher output for `doc`, or the value unchanged when it is missing."""
    if pd.notnull(doc):
        return matcher(doc)
    return doc

df['matches'] = df.abstract_nlp.progress_apply(_match_methods)

100%|██████████████████████████████████| 31142/31142 [19:21:07<00:00,  2.24s/it]


In [13]:
# Unpack the raw spaczz results into three parallel list columns.
# Each match is read positionally: [0] is the label given to matcher.add(),
# [1] and [2] are the start/end offsets used to slice the Doc, and [3] is the
# match ratio. Positional indexing (rather than tuple unpacking) is kept so the
# code does not depend on the exact tuple length.
def _is_match_list(value):
    """Return True when `value` is matcher output rather than a missing-value placeholder."""
    return isinstance(value, list)

# Labels of the matched methods; rows without matcher output are passed through.
df['methods'] = df.matches.progress_apply(
    lambda hits: [hit[0] for hit in hits] if _is_match_list(hits) else hits
)

# The matched span of the abstract itself (needs both the Doc and the matches,
# hence the row-wise apply).
df['methods_tokens'] = df.progress_apply(
    lambda row: [row['abstract_nlp'][hit[1]:hit[2]] for hit in row['matches']]
    if _is_match_list(row['matches']) else row['matches'],
    axis = 1,
)

# Similarity score of each match, in the same order as `methods`.
df['match_ratios'] = df.matches.progress_apply(
    lambda hits: [hit[3] for hit in hits] if _is_match_list(hits) else hits
)

100%|█████████████████████████████████| 31142/31142 [00:00<00:00, 121430.94it/s]
100%|██████████████████████████████████| 31142/31142 [00:02<00:00, 11542.85it/s]
100%|█████████████████████████████████| 31142/31142 [00:00<00:00, 254706.34it/s]


In [14]:
# Save the data
# Both files are written relative to the current working directory and contain
# every column of df, including the spaCy/spaczz object columns created above.
# to_csv() is called with its defaults, so the DataFrame index is written as an
# extra leading column.
df.to_csv('UoE_staff_publications_theses_spaczz_matches.csv')
# default_handler=str makes pandas fall back to str() for values it cannot
# serialise to JSON natively.
df.to_json("UoE_staff_publications_theses_spaczz_matches.json", default_handler=str) # Saving as json handles the nested lists better

# fuzzysearch: find_near_matches

In [15]:
# Disabled experiment: search every normalised abstract for the first method
# string with fuzzysearch, allowing a Levenshtein distance of at most 1, and
# collect the row positions and matches that were found.
# NOTE(review): before re-enabling, `find_near_matches` must be imported (it is
# not in the import cell at the top of this notebook), `%time` should be the
# `%%time` cell magic to time the whole cell, and the bare `except: pass` should
# be narrowed so real errors are not silently dropped.
# %time
# index_list = []
# match_list = []
# my_string = methods.method_matching[0]
# for i, abstract in enumerate(tqdm(df.abstracts_matching)):
#     try:
#         match = find_near_matches(my_string, abstract, max_l_dist=1)
#         if match != []:
#             index_list.append(i)
#             match_list.append(match)
#     except:
#         pass
# #         match_list.append(None)

In [16]:
# index_list

In [17]:
# match_list

In [18]:
# Disabled experiment: the same fuzzysearch lookup expressed as a pandas apply
# over the non-null normalised abstracts.
# NOTE(review): depends on `my_string` and `find_near_matches`, which are only
# defined/used inside commented-out code in this notebook.
# %time
# df['match'] = df[df.abstracts_matching.isnull() == False].abstracts_matching.apply(lambda x: find_near_matches(my_string, x, max_l_dist=1))