# This notebook supports calculating non-self citations based on [scholarly](https://github.com/scholarly-python-package/scholarly)

* You will probably need a SCRAPER_API_KEY, because Google Scholar limits the query traffic from each end user
* By default, the scholarly API returns only partial information for each publication, so we need to call its `fill` function to add the additional information necessary for the citation calculation

In [None]:
from scholarly import scholarly, ProxyGenerator
from tqdm import tqdm
import pandas as pd
import os
import json
from collections import defaultdict

In [None]:
# Optional but probably you'll need it: Google Scholar limits direct query
# traffic, so requests can be routed through ScraperAPI when a key is given.
SCRAPER_API_KEY = ''
pg = ProxyGenerator()
success = False
if SCRAPER_API_KEY:
    # Only route scholarly through the proxy when its setup reported success.
    success = pg.ScraperAPI(SCRAPER_API_KEY)
    if success:
        scholarly.use_proxy(pg)
    else:
        print('ScraperAPI proxy setup failed; continuing without a proxy')
else:
    # An empty key cannot authenticate, so skip the proxy entirely.
    print('No SCRAPER_API_KEY set; continuing without a proxy')

In [None]:
# Directory holding the JSON files in which publication and citation data are
# saved; replace the placeholder with a real local path before running.
BASE_DIR = '/Users/XXX/XXX/GreenCard/citations' # Path for saving publication information

In [None]:
# Locations of the JSON files backing each store (all under BASE_DIR)
DB_PUBS_PATH = os.path.join(BASE_DIR, 'DB_PUBS.json')
DB_MY_PUBS_PATH = os.path.join(BASE_DIR, 'DB_MY_PUBS.json')
DB_CITE_PATH = os.path.join(BASE_DIR, 'DB_CITE.json')

# In-memory stores
# All publications: my own papers plus the papers that cite them
DB_PUBS = defaultdict(dict)
# My own publications only
DB_MY_PUBS = defaultdict(dict)
# Citation relationship: key is a publication, value is the papers citing it
DB_CITE = defaultdict(dict)

# Utilities

In [None]:
def get_all_pubs(author_id=''):
    """Return every publication of one author, each passed through scholarly.fill.

    author_id: id handed to ``scholarly.search_author_id``.
    Returns a list with one filled entry per item of the author's
    'publications' list.
    """
    author = scholarly.fill(scholarly.search_author_id(author_id))
    filled_pubs = []
    # One fill call per publication; tqdm shows the progress.
    for stub in tqdm(author['publications']):
        filled_pubs.append(scholarly.fill(stub))
    return filled_pubs


def save_pubs_to_db(pubs, db=None):
    """Insert publications into a title-keyed store, keeping existing entries.

    pubs: iterable of publication dicts exposing ``pub['bib']['title']``.
    db:   mapping to insert into. When omitted, the module-level DB_PUBS is
          looked up at call time, so a store that was re-created after this
          function was defined is still the one that gets updated.

    A publication whose title is already present in ``db`` is skipped; the
    first entry stored under a title wins.
    """
    if db is None:
        db = DB_PUBS
    for pub in pubs:
        # The duplicate check must look at the target store itself; checking
        # DB_PUBS here would wrongly skip every insert into another store
        # (e.g. DB_MY_PUBS) whose titles were already saved to DB_PUBS.
        db.setdefault(pub['bib']['title'], pub)

        
def dump_db(db=None, path=None):
    """Write one store to disk as JSON.

    db:   mapping to serialize; defaults to the current module-level DB_PUBS.
    path: destination file; defaults to DB_PUBS_PATH.

    Both defaults are resolved at call time rather than at definition time,
    so a store that was re-created later is still the one that gets written.
    Raises whatever ``json.dumps`` or ``open`` raise.
    """
    if db is None:
        db = DB_PUBS
    if path is None:
        path = DB_PUBS_PATH
    # Serialize first: opening with 'w' truncates the file, so a serialization
    # error after opening would destroy the previously saved data.
    payload = json.dumps(db)
    with open(path, 'w') as f:
        f.write(payload)


def dump_all_db():
    """Write DB_PUBS, DB_MY_PUBS and DB_CITE to their JSON files.

    All three stores are serialized before any file is opened: opening with
    'w' truncates the file, so a serialization error part-way through would
    otherwise leave empty or missing data on disk.
    """
    targets = (
        (DB_PUBS_PATH, DB_PUBS),
        (DB_MY_PUBS_PATH, DB_MY_PUBS),
        (DB_CITE_PATH, DB_CITE),
    )
    payloads = [(path, json.dumps(db)) for path, db in targets]
    for path, payload in payloads:
        with open(path, 'w') as f:
            f.write(payload)


def load_all_db():
    """Reload DB_PUBS, DB_MY_PUBS and DB_CITE from their JSON files, if present.

    Each store is updated in place instead of being rebound to a new dict.
    Rebinding would leave every earlier reference to the old object (such as a
    default argument captured when a function was defined) pointing at stale
    data, and would replace the stores with plain dicts. A store whose file
    does not exist is left untouched.
    """
    stores = (
        (DB_PUBS_PATH, DB_PUBS),
        (DB_MY_PUBS_PATH, DB_MY_PUBS),
        (DB_CITE_PATH, DB_CITE),
    )
    for path, db in stores:
        if not os.path.exists(path):
            continue
        with open(path, 'r') as f:
            loaded = json.load(f)
        # Parse first, then swap the contents, so a corrupt file cannot leave
        # the store half-cleared.
        db.clear()
        db.update(loaded)


def fill_db(db):
    """Replace every entry of ``db`` whose 'filled' flag is falsy with scholarly.fill(entry).

    db: mapping from title to publication dict; it is modified in place.
    """
    # Snapshot the keys up front because entries are reassigned while looping.
    for title in tqdm(list(db.keys())):
        entry = db[title]
        if entry['filled']:
            continue
        db[title] = scholarly.fill(entry)


def get_authors(pub) -> list:
    """Return the normalized author names of a publication.

    Reads ``pub['bib']['author']`` as a single string in which the names are
    separated by ' and ', and normalizes each name with get_normalized_name.
    """
    raw_names = pub['bib']['author'].split(' and ')
    return list(map(get_normalized_name, raw_names))


def get_normalized_name(name):
    """Reverse the comma-separated parts of a name and join them with spaces.

    Example: 'Smith, John' becomes 'John Smith'. A name without a comma is
    returned with surrounding whitespace stripped.
    """
    pieces = name.split(',')
    return ' '.join(piece.strip() for piece in pieces[::-1])


def get_common_authors(authors1, authors2) -> list:
    """Return the names present in both author collections, without duplicates.

    The comparison is exact string equality; the order of the result is the
    iteration order of the intersecting set.
    """
    first, second = set(authors1), set(authors2)
    return list(first.intersection(second))


def get_publish_info(bib):
    """Return the first available venue-like field of a bib dict, or 'NA'.

    Fields are tried in this order: 'citation', 'journal', 'conference',
    'booktitle', 'venue'.
    """
    for field in ('citation', 'journal', 'conference', 'booktitle', 'venue'):
        if field in bib:
            return bib[field]
    return 'NA'


def analyze_non_self_citation(pub):
    """Count the citations of ``pub`` that share no author with it.

    pub: publication dict exposing ``pub['bib']['title']``.

    Returns ``(count, df)``: ``count`` is the number of citing papers with no
    author in common with ``pub``; ``df`` has one row per citing paper found
    in DB_PUBS, with columns 'title', 'authors', 'citation', 'year',
    'self-citation' (1 or 0) and 'common_authors'.

    Author names are compared by exact equality of their normalized strings.
    """
    title = pub['bib']['title']
    # Use .get so a publication without a recorded entry (e.g. no citations
    # saved, stores reloaded from JSON as plain dicts) yields an empty result
    # instead of a KeyError, and so no empty entries are auto-created.
    filled_pub = DB_PUBS.get(title, pub)
    citations = DB_CITE.get(title, [])
    citations_title = [c['bib']['title'] for c in citations]
    filled_citations = [DB_PUBS[t] for t in citations_title if t in DB_PUBS]

    columns = ['title', 'authors', 'citation', 'year', 'self-citation', 'common_authors']
    rows = []
    count_valid_citations = 0

    pub_authors = get_authors(filled_pub)
    for citation in tqdm(filled_citations):
        bib = citation['bib']
        citation_authors = get_authors(citation)
        common_authors = get_common_authors(pub_authors, citation_authors)
        self_citation = 1 if common_authors else 0
        if not self_citation:
            count_valid_citations += 1
        rows.append([bib['title'],
                     ';'.join(citation_authors),
                     get_publish_info(bib),
                     # The year may be absent; use the same 'NA' placeholder
                     # as get_publish_info rather than failing.
                     bib.get('pub_year', 'NA'),
                     self_citation,
                     ','.join(common_authors)])

    # Build the frame once instead of growing it one row at a time.
    df = pd.DataFrame(rows, columns=columns)
    return count_valid_citations, df


# Download Publication and Citation Data

In [None]:
# Restore any previously saved stores from disk, then show how many of my
# publications are already available locally.
load_all_db()
print(len(DB_MY_PUBS))

In [None]:
# 1. Get all my pubs and save to DB
gscholar_id = ''
if not DB_MY_PUBS:
    # Nothing stored yet: download my publications and persist them
    print('Empty DB -> Create My Pubs)')
    my_pubs = get_all_pubs(gscholar_id)
    save_pubs_to_db(my_pubs, DB_PUBS)
    save_pubs_to_db(my_pubs, DB_MY_PUBS)
    dump_all_db()
else:
    # Reuse the publications loaded from disk
    print('Load Data')
    my_pubs = list(DB_MY_PUBS.values())

In [None]:
# 2. Get all citations and save citations to DB
for pub in tqdm(my_pubs):
    # Publications without a 'citedby_url' have nothing to download
    if 'citedby_url' not in pub:
        continue
    citations = list(scholarly.citedby(pub))
    # Pass DB_PUBS explicitly: a default argument is bound when the function
    # is defined, so it would not follow a store that was re-created later
    # (e.g. by loading from disk), and the citations would not be dumped.
    save_pubs_to_db(citations, DB_PUBS)
    title = pub['bib']['title']
    DB_CITE[title] = citations
    # Checkpoint after every publication so a failed request later in the
    # loop does not discard the citations downloaded so far.
    dump_all_db()

dump_all_db()

In [None]:
# 3. Fill all papers with details
DB_PUBS_filled = [1 if pub['filled'] else 0 for pub in DB_PUBS.values()]
print('Currently, %d papers are filled among %d all papers' % (sum(DB_PUBS_filled), len(DB_PUBS)))

try:
    fill_db(DB_PUBS)
finally:
    # fill_db updates DB_PUBS entry by entry and skips entries already marked
    # as filled, so saving here keeps partial progress when a request fails
    # part-way; a later run only has to fill the remaining entries.
    dump_db(DB_PUBS, DB_PUBS_PATH)

# Compute Non-Self Citation

In [None]:
# Accumulate the non-self citation counts over all of my publications
cnt_sum = 0
for my_pub in my_pubs:
    cnt_cite, df = analyze_non_self_citation(my_pub)
    cnt_sum = cnt_sum + cnt_cite

In [None]:
# Report the non-self citation count summed over all of my publications
print('Total Number of Non-Self Citations :',cnt_sum)