Go to the [KAUST Repository](https://repository.kaust.edu.sa) and make sure to log in if the author has more than 50 publications; otherwise, you will not be able to download all of the publications unless you manually select them with the checkboxes. Once logged in, go to [this page](https://repository.kaust.edu.sa/search-filter?field=author) and select an author. You will be directed to the author's search page. Click on export and select the desired format, making sure that the number of exported publications matches the total number of search results. For Bernard Ghanem, [this](https://repository.kaust.edu.sa/discover/export?format=csv&view=list&rpp=10&etal=0&group_by=none&page=1&filtertype_0=author&filter_relational_operator_0=authority&filter_0=975843ef3274b4392b6a3b5f8d4beb62) is the download link.

In [2]:
# Path of the BibTeX file exported from the KAUST repository; it is read by
# parse_formated_bib() further down.
KAUST_REPO_BIB_FILE = 'Bernard_2019_12_24.bib'
# Title similarity that a BibTeX entry must strictly exceed to be accepted as
# the match of a publication.
TITLE_SIMILARITY_THRESHOLD = 0.95  # [0, 1)
# When True, ask for interactive confirmation of every accepted match whose
# similarity is not exactly 1.
CAREFUL = False  # ask if the match is not perfect

In [3]:
import json
from difflib import SequenceMatcher

from publib import Publication, PUBLIB as publib


def similarity(a, b):
    """Return the case-insensitive similarity ratio of two strings.

    Both strings are lower-cased and compared with
    ``difflib.SequenceMatcher``; the result is a float between 0 and 1,
    where 1 means the lower-cased strings are identical.
    """
    first = a.lower()
    second = b.lower()
    matcher = SequenceMatcher(None, first, second)
    return matcher.ratio()

def parse_formated_bib(bibfile):
    """Parse a rigidly formatted BibTeX file into a list of dicts.

    The file is expected to follow these conventions:

    * consecutive entries are separated by a line holding only ``}``;
    * every field sits on a single line as ``key = {value}``;
    * authors are written ``Last, First`` and separated by ``' and '``.

    Args:
        bibfile: path of the BibTeX file to read (decoded as UTF-8).

    Returns:
        One dict per entry, mapping field names to values.  ``year`` is
        converted to ``int`` and ``author`` to a list of ``'First Last'``
        strings; every other field is the raw text between the outermost
        braces of its line.  Lines that are not ``key = {value}`` fields
        are ignored, and chunks without any field (such as the empty text
        after the final closing brace) produce no entry.

    Raises:
        ValueError: if the value of a ``year`` field is not an integer.
    """
    out = []
    with open(bibfile, 'r', encoding='utf-8') as f:
        entries = f.read().split('\n}\n')
    for entry in entries:
        fields = {}
        for line in entry.split('\n'):
            # Split on the first '=' only: values such as titles, abstracts
            # or URLs may contain '=' themselves and must not be dropped.
            key, sep, raw = line.partition('=')
            if not sep:
                continue  # entry header, closing brace or blank line
            start = raw.find('{') + 1
            end = raw.rfind('}')
            if start == 0 or end < start:
                continue  # no '{...}' value on this line
            key = key.strip()
            value = raw[start:end]
            if key == 'year':
                value = int(value)
            elif key == 'author':
                # 'Last, First' -> 'First Last'
                value = [' '.join(reversed(a.split(', ')))
                         for a in value.split(' and ')]
            fields[key] = value
        # Skip field-less chunks so callers never receive an empty entry.
        if fields:
            out.append(fields)
    return out

# Copy the abstract and the repository handle from the KAUST BibTeX export
# onto every publication whose title matches one of its entries.
count = 0  # number of publications that received repository metadata
# Entries without a title cannot be compared, so they are filtered out here
# instead of raising KeyError in the similarity computation below.
biblib = [b for b in parse_formated_bib(KAUST_REPO_BIB_FILE) if 'title' in b]
for p in publib:
    sim = [similarity(b['title'], p.title) for b in biblib]
    # Best-scoring entry; the default keeps an empty library from raising
    # and can never exceed the threshold.
    arg, best = max(enumerate(sim), key=lambda x: x[1],
                    default=(None, float('-inf')))
    if best <= TITLE_SIMILARITY_THRESHOLD:
        continue
    b = biblib[arg]
    if CAREFUL and best != 1:
        print('(publib)', p.title)
        print('(biblib)', b['title'])
        # Compare against the two exact answers: a substring test against
        # 'yn' would also accept an empty line and treat it as "yes".
        answer = ''
        while answer not in ('y', 'n'):
            answer = input('match? (y/n): ').strip().lower()
        if answer == 'n':
            print('skipped')
            continue
    # Keep the current abstract when the entry carries no 'note' field.
    p.abstract = b.get('note', p.abstract)
    # The handle is whatever follows the last 't/' in the URL
    # (presumably URLs look like '.../hdl.handle.net/<handle>' -- confirm).
    p.handle = b['url'][b['url'].rfind('t/') + 2:]
    count += 1
count

30

In [6]:
# Author identifiers, in the order used to turn an author into an index.
authors = [*Publication.AUTHORS.keys()]

# Lookup tables written to the exported file; publications refer to their
# entries by position.
lib = dict(
    venue=[*Publication.VENUES.keys()],
    full_venue=[*Publication.VENUES.values()],
    author=[*Publication.AUTHORS.values()],
    # An empty string is placed first, ahead of the declared coauthor entries.
    coauthor=['', *Publication.COAUTHORS],
    distinction=[*Publication.DISTINCTIONS],
    link=[*Publication.LINKS],
    # Names of the per-publication fields, in export order.
    keys=[
        'handle', 'theme', 'year', 'venue', 'thumbnail', 'paper',
        'title', 'authors', 'coauthors', 'distinctions', 'links',
        'abstract',
    ],
)

def get(p, key, lib=lib, authors=authors):
    """Encode one field of publication ``p`` for the exported table.

    Args:
        p: publication object whose attributes are read.
        key: name of the field to encode.
        lib: lookup tables; ``lib['venue']``, ``lib['distinction']`` and
            ``lib['link']`` are consulted.
        authors: sequence of author identifiers used to index the authors
            of ``p``.

    Returns:
        Depending on ``key``:

        * ``theme``, ``thumbnail``, ``paper``, ``title``, ``abstract``:
          the attribute of ``p`` unchanged;
        * ``handle``: ``p.handle``, or 0 when it is falsy;
        * ``year``: ``p.year`` minus 2000;
        * ``venue``: position of ``p.venue`` in ``lib['venue']``;
        * ``authors``: flat list with the position in ``authors`` of every
          member of every group in ``p.authors``;
        * ``coauthors``: flat list parallel to ``authors``; members of the
          n-th group that has more than one member get ``n`` (counting
          from 1), members of single-member groups get 0;
        * ``distinctions``: positions of ``p.distinctions`` in
          ``lib['distinction']``;
        * ``links``: for every name in ``lib['link']``, ``p.links[name]``
          or 0 when ``p.links`` has no such name.

    Raises:
        ValueError: if ``key`` is not one of the names above.
    """
    plain_attributes = {'theme', 'thumbnail', 'paper', 'title', 'abstract'}
    if key in plain_attributes:
        return getattr(p, key)

    if key == 'handle':
        return p.handle or 0

    if key == 'year':
        return p.year - 2000

    if key == 'venue':
        return lib['venue'].index(p.venue)

    if key == 'authors':
        positions = []
        for group in p.authors:
            for member in group:
                positions.append(authors.index(member))
        return positions

    if key == 'coauthors':
        tags = []
        shared_groups = 0  # groups with more than one member seen so far
        for group in p.authors:
            if len(group) > 1:
                shared_groups += 1
                tag = shared_groups
            else:
                tag = 0
            # Repeat the tag once per member to stay parallel to 'authors'.
            tags.extend(tag for _ in group)
        return tags

    if key == 'distinctions':
        known = lib['distinction']
        return [known.index(item) for item in p.distinctions]

    if key == 'links':
        return [p.links[name] if name in p.links else 0
                for name in lib['link']]

    raise ValueError(f'unknown key: {key}')

# One row per publication, with its fields in the order given by lib['keys'].
lib['lib'] = [[get(p, k) for k in lib['keys']] for p in publib]
# Serialize with json instead of repr(): JSON is valid JavaScript, whereas
# Python's repr emits None/True/False and tuple syntax that JavaScript cannot
# parse.  Serializing before opening the file also keeps an existing
# publib.js intact if some value turns out not to be serializable.
payload = json.dumps(lib)
with open('publib.js', 'w', encoding='utf-8') as f:
    f.write('const publib = ')
    f.write(payload)
    f.write(';')