Source: 

Gomez, Charles, 2022, "Replication Data for: Leading countries in global science increasingly receive more citations than other countries doing similar research.", https://doi.org/10.7910/DVN/WCOINR, Harvard Dataverse.

In [None]:
import bz2
import itertools
import os
import pickle
import re
from ast import literal_eval

import pandas as pd
from tqdm.notebook import tqdm

from Python_Class_LLDA import LLDA

# Base folder: the input CSV files are read from here and the per-field model
# files are written below it.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DIR = '/scratch/fl1092/followup-editors/race_citation_well/'

In [None]:
# Inclusive first/last year iterated by the export loop that pickles the
# per-year models.
NLLDA_Min_Year = 2001 # 1980 (presumably an earlier lower bound -- confirm)
NLLDA_Max_Year = 2017

In [None]:
def clean_ngrams(x):
    """Normalise one n-gram string before it is used as a model token.

    Steps, in order:
      1. remove every character that is neither a word character nor
         whitespace (i.e. punctuation);
      2. strip leading/trailing whitespace;
      3. return '' when what remains consists only of digits;
      4. drop one trailing 's' from tokens longer than one character
         (a crude singularisation -- 'class' becomes 'clas').

    Parameters
    ----------
    x : str
        Raw n-gram.

    Returns
    -------
    str
        The cleaned n-gram, possibly the empty string.
    """
    # Strip BEFORE the digit test: removing punctuation can leave padding
    # behind (e.g. '( 42' -> ' 42'), and ' 42'.isdigit() is False, which
    # previously let purely numeric tokens slip through.
    x = re.sub(r'[^\w\s]', '', x).strip()

    if x.isdigit():
        return ''

    if len(x) > 1 and x.endswith('s'):
        x = x[:-1]
    return x

In [None]:
def loadCorpusLabel(data_dir=None):
    """Load the four tab-separated input tables.

    Parameters
    ----------
    data_dir : str, optional
        Folder containing the input files. When omitted, the notebook-level
        ``DIR`` constant is used, which matches the previous hard-coded
        behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(paperRace, paperYear, paperField, corpus)``:

        * ``paperRace``  -- ``PaperRace.csv`` with ``Race`` renamed to
          ``labels`` and ``PaperId`` to ``paperid``;
        * ``paperYear``  -- ``PaperYear.csv`` with ``Year`` renamed to
          ``year`` and ``PaperId`` to ``paperid``;
        * ``paperField`` -- ``PaperField.csv`` with ``PaperId`` renamed to
          ``paperid``;
        * ``corpus``     -- ``PaperAbstractProcessed.csv`` reduced to the
          columns ``paperid`` and ``Abstract`` (the former ``RakeAbstract``
          column, parsed from its textual form with ``literal_eval``).
    """
    if data_dir is None:
        data_dir = DIR

    def _read(filename, renames, **read_kwargs):
        # Every input shares the same separator and the same rename step.
        path = os.path.join(data_dir, filename)
        return pd.read_csv(path, sep='\t', **read_kwargs).rename(columns=renames)

    paperRace = _read('PaperRace.csv', {'Race': 'labels', 'PaperId': 'paperid'})
    paperYear = _read('PaperYear.csv', {'Year': 'year', 'PaperId': 'paperid'})
    paperField = _read('PaperField.csv', {'PaperId': 'paperid'})

    # literal_eval turns the stored textual list back into a Python object
    # while the file is being parsed.
    corpus = _read(
        'PaperAbstractProcessed.csv',
        {'RakeAbstract': 'Abstract', 'PaperId': 'paperid'},
        converters={'RakeAbstract': literal_eval},
    )[["paperid", "Abstract"]]

    return paperRace, paperYear, paperField, corpus

In [None]:
def filterDiscipline(disp, paperField, paperRace, corpus, paperYear):
    """Keep only the rows that belong to papers of one field of study.

    Papers are selected from ``paperField`` where ``FieldOfStudyId`` equals
    ``disp``; each of the other three tables is then reduced to the rows
    whose ``paperid`` appears in that selection.

    Returns
    -------
    tuple
        ``(paperRace, paperYear, corpus)`` -- note the order differs from
        the order of the parameters.
    """
    field_paper_ids = paperField.query("FieldOfStudyId == @disp").paperid

    def _only_field_papers(frame):
        return frame[frame.paperid.isin(field_paper_ids)]

    return (
        _only_field_papers(paperRace),
        _only_field_papers(paperYear),
        _only_field_papers(corpus),
    )

In [None]:
def constructLLDA(Yearly_Dict_of_Corpora, Yearly_Dict_of_Labels, beta=0.1, alpha=0.1,
                  min_year=2001, max_year=2017):
    """Build one LLDA model per year.

    Parameters
    ----------
    Yearly_Dict_of_Corpora : dict
        ``{year: {paperid: list_of_terms}}``.
    Yearly_Dict_of_Labels : dict
        ``{year: {paperid: "label1 label2 ..."}}`` -- labels of one paper are
        joined by single spaces. For a given year the two inner dicts are
        expected to list the same papers in the same order, because documents
        and label lists are handed to the model positionally.
    beta, alpha : float
        Passed to ``LLDA`` as ``LLDA(K, alpha, beta)``.
    min_year, max_year : int
        Inclusive range of years to model (defaults 2001 and 2017).

    Returns
    -------
    dict
        ``{year: LLDA model}`` for every year in range that is present in
        BOTH input dictionaries; other years are omitted.
    """
    # A year is only usable when both its documents and its labels exist.
    # (Testing membership in the union of the key sets, as was done before,
    # let a year through that is missing from one input and then failed with
    # KeyError on the lookup.) Computed once instead of on every iteration.
    usable_years = Yearly_Dict_of_Labels.keys() & Yearly_Dict_of_Corpora.keys()

    Dictionary_of_NLLDA = {}

    for year_ in range(min_year, max_year + 1):

        if year_ not in usable_years:
            continue

        labels_list = [label_.split(" ") for label_ in Yearly_Dict_of_Labels[year_].values()]
        # Sorted so the label order is the same on every run; plain set
        # iteration order for strings changes with hash randomisation.
        labels_set = sorted(set(itertools.chain.from_iterable(labels_list)))

        K = len(labels_set)  # number of distinct labels this year

        NLLDA_Model_ = LLDA(K, alpha, beta)
        NLLDA_Model_.set_corpus(labels_set, list(Yearly_Dict_of_Corpora[year_].values()), labels_list)

        # NOTE(review): inference() is called exactly once; an earlier
        # revision carried a commented-out ``for ite in range(100)`` loop
        # here -- confirm a single call is what is intended.
        NLLDA_Model_.inference()

        Dictionary_of_NLLDA[year_] = NLLDA_Model_

    return Dictionary_of_NLLDA

In [None]:
%%time
# Read the label, year, field and abstract tables once; the per-field loop
# further down filters these same frames for every field.
paperRaceDf, paperYear, paperField, corpusDf = loadCorpusLabel()

In [None]:
# Sanity check: (rows, columns) of each loaded table.
paperRaceDf.shape, paperYear.shape, paperField.shape, corpusDf.shape

In [None]:
# Preview the first rows of the field table.
paperField.head()

In [None]:
# Distinct FieldOfStudyId values; the loop below processes one field at a time.
fields = paperField.FieldOfStudyId.unique()

In [None]:
%%time
# Create the output folder up front: without it the BZ2File open further down
# would fail only AFTER the models for a field had already been fitted.
os.makedirs(DIR + 'llda_inf/', exist_ok=True)

for disp in tqdm(fields):

    # Restrict the label, year and abstract tables to papers of this field.
    paperRace, df_year_censored, corpus = filterDiscipline(disp, paperField, paperRaceDf, corpusDf, paperYear)
    print(disp, paperRace.shape, df_year_censored.shape, corpus.shape)

    # paperid -> abstract terms
    corpus = corpus.set_index("paperid").to_dict()["Abstract"]

    # paperid -> all labels of that paper joined by single spaces
    df_labels = paperRace.groupby('paperid')["labels"].apply(lambda x: " ".join(x)).reset_index(name="Labels")
    labels_dict = pd.Series(df_labels.Labels.values,index=df_labels.paperid).to_dict()

    # paperid -> year
    year_dict = pd.Series(df_year_censored.year.values, index=df_year_censored.paperid).to_dict()
    list_of_years = list(set(year_dict.values()))

    # One (initially empty) bucket per year that occurs for this field.
    Yearly_Dict_of_Corpora = {years_:{} for years_ in list_of_years}
    Yearly_Dict_of_Labels = {years_:{} for years_ in list_of_years}

    for paperid_, yearid_ in year_dict.items():

        # A paper is used only if it has BOTH a non-empty abstract and a
        # non-empty label string. Both buckets are filled together so that
        # documents and labels stay aligned by position within a year.
        if paperid_ not in corpus or paperid_ not in labels_dict:
            continue
        if corpus[paperid_]!=[] and labels_dict[paperid_]!="":
            Yearly_Dict_of_Corpora[yearid_][paperid_] = corpus[paperid_]
            Yearly_Dict_of_Labels[yearid_][paperid_] = labels_dict[paperid_]

    # Clean every term and, in the same pass, drop cleaned terms of length
    # <= 1 (this also removes the '' that clean_ngrams returns for numbers).
    Yearly_Dict_of_Corpora = {
        years_: {
            paperid_: [
                cleaned_
                for cleaned_ in (clean_ngrams(str(term_)) for term_ in abstract_)
                if len(cleaned_)>1
            ]
            for paperid_, abstract_ in year_corpora.items()
        } for years_,year_corpora in Yearly_Dict_of_Corpora.items()
    }

    Yearly_NLLDA_Dict_Filename = DIR + f'llda_inf/{disp}.pbz2'

    LLDA_dict = constructLLDA(Yearly_Dict_of_Corpora, Yearly_Dict_of_Labels)

    # File layout: one pickle record per year, written back to back in year
    # order, so a reader has to call pickle.load once per year. A year
    # without a model is represented by an empty dict so positions line up.
    with bz2.BZ2File(Yearly_NLLDA_Dict_Filename, 'w') as f:

        for year in range(NLLDA_Min_Year,NLLDA_Max_Year+1):
            # Serialise to bytes first: if anything goes wrong, nothing has
            # been written for this year yet and the placeholder cannot end
            # up behind a half-written record.
            try:
                payload = pickle.dumps(LLDA_dict[year], protocol=2)
            except Exception as e:
                print('ERROR', disp, year, e)
                payload = pickle.dumps({}, protocol=2)
            f.write(payload)