In [None]:
import glob
import sys
import imp
sys.path.append('/home/fl1092/editor-followup-analysis/src')
import loader
import utils
import os

from tqdm.notebook import tqdm
import pandas as pd

# Output directory: every CSV this notebook writes goes here.
DIR = '/scratch/fl1092/followup-editors/race_citation_well/'
# Root of the MAG snapshot used for the fields-of-study and paper-year tables.
MAGDIR = '/scratch/fl1092/MAG/2021-12-06/'
# Directory holding the stop-word input CSVs read in the "Remove stop words and Rake" section.
REPDIR = '/scratch/fl1092/leading_country_replication/'

In [None]:
%%time
# Paper metadata restricted to rows whose journal is PNAS.
info = pd.read_csv(
    '/scratch/fl1092/followup-editors/PaperInfoGathered.csv',
    sep='\t',
    usecols=['PaperId','Publisher','Journal'],
).loc[lambda df: df['Journal'] == "PNAS"]
print(info.shape)

In [None]:
%%time
# Keep the rows tagged iso == "US", then discard the country bookkeeping
# columns so that only the remaining columns travel downstream.
paperCountry = (
    loader.loadPaperCountry()
    .loc[lambda df: df['iso'] == "US"]
    .drop(columns=['iso', 'CountryCount', 'Count', 'Percentage'])
)
print(paperCountry.shape)

In [None]:
# Preview the first rows of the US-only paper table.
paperCountry.head()

In [None]:
# Working set of papers: distinct PNAS paper ids that also appear in the
# US-only country table (inner join on 'PaperId').
papers = pd.merge(
    info[['PaperId']].drop_duplicates(),
    paperCountry,
    on='PaperId',
)
print(papers.shape)

In [None]:
Original: use all papers in MAG that are US-based
%%time
papers = (
    pd.read_csv('/scratch/fl1092/followup-editors/PaperCountryAll.csv', sep='\t',
                usecols=['PaperId','iso','Percentage'])
    .query('iso == "US"')
    .drop('Percentage', axis=1)
)

print(papers.shape)

# Paper race

In [None]:
%%time
# Race table from the project loader; it is merged on 'AuthorId' below, where
# its 'RaceScore' column is dropped.
race = loader.loadRace()

In [None]:
%%time
# Paper-author table from the project loader; it is joined to `papers` on
# 'PaperId' and to `race` on 'AuthorId' below.
papAu = loader.loadPaperAuthor()

In [None]:
%%time
# papers -> authors -> race, keeping one row per distinct combination of the
# remaining columns once the author id and race score are removed.
paperRace = (
    pd.merge(papers, papAu, on='PaperId')
    .merge(race, on='AuthorId')
    .drop(columns=['AuthorId', 'RaceScore'])  # iso
    .drop_duplicates()
)
print(paperRace.shape)

In [None]:
# Number of distinct papers that survived the paper -> author -> race joins.
paperRace.PaperId.nunique()

In [None]:
# Persist the paper-race table as a tab-separated file without the index.
paperRace.to_csv(DIR + 'PaperRace.csv', sep='\t', index=False)

# Field

In [None]:
%%time
# The file is read with explicit column names (no header row is consumed);
# only the field id and its hierarchy level are kept.
fields = pd.read_csv(
    MAGDIR + "advanced/FieldsOfStudy.txt",
    sep="\t",
    header=None,
    names=[
        "FieldOfStudyId", "Rank", "NormalizedName", "DisplayName",
        "MainType", "Level", "PaperCount", "PaperFamilyCount",
        "CitationCount", "CreatedDate",
    ],
    usecols=['FieldOfStudyId', 'Level'],
)
print(fields.shape)

In [None]:
# Level-1 field ids must be unique, otherwise the merge on 'FieldOfStudyId'
# further down would duplicate paper rows.
assert fields.loc[fields['Level'] == 1, 'FieldOfStudyId'].is_unique
fields.loc[fields['Level'] == 1].shape

In [None]:
%%time
# Paper -> field-of-study links; the 'Score' column is named but not loaded.
paper_field = pd.read_csv(
    MAGDIR + "advanced/PaperFieldsOfStudy.txt",
    sep='\t',
    header=None,
    names=['PaperId', "FieldOfStudyId", 'Score'],
    usecols=['PaperId', "FieldOfStudyId"],
    dtype={'PaperId': int, "FieldOfStudyId": int, "Score": float},
)
print(paper_field.shape)

In [None]:
%%time
# Attach to every paper its fields of study, keeping only level-1 fields.
paperField = (
    pd.merge(papers, paper_field, on='PaperId')
    .merge(fields.loc[fields['Level'] == 1], on='FieldOfStudyId')
    .drop(columns='Level')
)
print(paperField.shape, paperField.PaperId.nunique())

In [None]:
# paperField = paperField.drop('iso', axis=1)
# paperField.shape

In [None]:
# Persist the paper-field table as a tab-separated file without the index.
paperField.to_csv(DIR + 'PaperField.csv', sep='\t', index=False)

# Paper year

In [None]:
%%time
# Load the full tab-separated paper-year table; it is joined to `papers` on
# 'PaperId' in the next cell.
paper_year = pd.read_csv(MAGDIR+'derived/PaperYear.csv',sep='\t')
print(paper_year.shape) # 269806629 # 262235509

In [None]:
%%time
# Inner join: keep the year rows of the papers in the working set only.
paperYear = pd.merge(paper_year, papers, on='PaperId')  # .drop('iso', axis=1)
print(paperYear.shape) # 172062

In [None]:
# Earliest publication year among the selected papers.
paperYear.Year.min()

In [None]:
# Persist the paper-year table as a tab-separated file without the index.
paperYear.to_csv(DIR + 'PaperYear.csv', sep='\t', index=False)

# Corpus

In [None]:
%%time
# NOTE(review): the abstracts are read from the MAG_July_2021 directory while
# MAGDIR points at the 2021-12-06 snapshot -- confirm that mixing the two is
# intended.
# Sort the shard list so the row order of the result (and of the CSV written
# from it) is the same on every run; glob returns files in arbitrary order.
abstract_files = sorted(glob.glob('/scratch/fl1092/MAG_July_2021/nlp/PaperAbstractsInvertedIndex.*'))
if not abstract_files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        'No PaperAbstractsInvertedIndex.* files found under /scratch/fl1092/MAG_July_2021/nlp/'
    )

# Each shard is filtered to the working set of papers as soon as it is read,
# so only the matching rows are kept in memory. ignore_index gives the
# combined frame one clean 0..n-1 index instead of per-shard repeated labels.
paper_abstract = pd.concat(
    [
        pd.read_csv(file, sep='\t', names=['PaperId', 'IndexedAbstract']).merge(papers, on='PaperId')  # .drop('iso',axis=1)
        for file in tqdm(abstract_files)
    ],
    ignore_index=True,
)

In [None]:
# (rows, columns) of the abstracts matched to the working set of papers.
paper_abstract.shape

In [None]:
# Persist the still-encoded (inverted-index) abstracts as a tab-separated file
# without the index, before any cleaning is applied.
paper_abstract.to_csv(DIR + 'RawPaperAbstract.csv', sep='\t', index=False)

### Clean abstract

In [None]:
import json
import string

def create_text_and_langauge(indexedabstract_input, min_words=30):
    """Rebuild a normalised plain-text abstract from an inverted index.

    Parameters
    ----------
    indexedabstract_input : str
        JSON document with an "InvertedIndex" object that maps each token to
        the list of positions at which it occurs.
    min_words : int, optional
        Abstracts with fewer than this many filled positions are discarded
        (default 30).

    Returns
    -------
    str
        The tokens in position order. Each token is stripped of surrounding
        whitespace, lower-cased, cleared of ASCII punctuation and followed by
        a single space. An empty string is returned for abstracts shorter
        than ``min_words``.

    Raises
    ------
    json.JSONDecodeError, KeyError
        If the input is not valid JSON or has no "InvertedIndex" entry.
    """
    inverted_index = json.loads(indexedabstract_input)["InvertedIndex"]

    # Build the translation table once instead of once per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # position -> cleaned token
    index = {}
    for token, positions in inverted_index.items():
        cleaned = str(token).strip().lower().translate(strip_punctuation)
        for position in positions:
            index[position] = cleaned

    if len(index) < min_words:
        return ""

    # Walk the positions that actually exist. Iterating range(len(index)) and
    # swallowing the KeyError silently lost the tail of any abstract whose
    # positions have gaps.
    return "".join(index[position] + " " for position in sorted(index))

In [None]:
%%time
# Decode every inverted index into plain text, then discard the raw JSON column.
df_abstract = (
    paper_abstract
    .assign(Abstract=paper_abstract.IndexedAbstract.apply(create_text_and_langauge))
    .drop(columns='IndexedAbstract')
)

In [None]:
# Preview the decoded abstracts.
df_abstract.head()

### Detect language

In [None]:
#### custom English detector ####
def detect(s):
    """Return True when `s` contains at least one ASCII letter (a-z or A-Z).

    This is a deliberately crude heuristic: a text with a single Latin letter
    is treated as English. Empty strings and strings made only of digits,
    symbols or non-Latin characters return False.
    """
    # Test the two letter ranges explicitly. The single span ord 65..122 also
    # covers the six symbols [ \ ] ^ _ ` that sit between 'Z' and 'a'.
    return any('a' <= c <= 'z' or 'A' <= c <= 'Z' for c in s)

assert detect('abc') == True
assert detect('abc123') == True
assert detect('123') == False
assert detect('[_]') == False
assert detect('') == False

In [None]:
%%time
# Flag every abstract with the result of detect().
df_abstract = df_abstract.assign(
    Is_English=df_abstract.Abstract.apply(detect)
)

In [None]:
# Spot-check the abstracts that were flagged as non-English.
# The sample size is capped at the number of flagged rows, because sampling
# more rows than exist raises a ValueError. The seed is fixed so the same
# rows are shown on every run.
(
    df_abstract[df_abstract.Is_English == False]
    .pipe(lambda flagged: flagged.sample(min(5, len(flagged)), random_state=0))
)

### Remove stop words and Rake

In [None]:
import itertools
from nltk.corpus import stopwords

In [None]:
# Project-specific stop words: every cell of the header-less CSV, flattened
# row by row.
stopword_list_extended = [
    word
    for row in pd.read_csv(REPDIR + "INPUT_Stopword_List.csv", header=None).values.tolist()
    for word in row
]

# NLTK English stop words, the single punctuation characters, the empty token
# and a hand-picked set of filler words. Duplicates are collapsed by set().
stopword_list = stopwords.words('english') + list(string.punctuation)
stopword_list += [
    '', 'this', 'come', 'make', 'among', 'toward', 'put', 'use', 'during',
    'since', 'from', 'with', 'article', 'has', 'find', 'argue', 'also',
    'elsevi', 'ltd', 'abstract', 'paper', 'describe', 'described',
]
stopword_list += stopword_list_extended
stopword_list = list(set(stopword_list))

# Source | Academic Word List Coxhead (2000) and list of action verbs.
# http://www.uefap.com/vocab/select/awl.htm
academic_stopwords = [
    stop_.strip()
    for stop_ in pd.read_csv(REPDIR + "INPUT_R_Academic_Stopwords.csv")["ACADEMIC_STOP_WORDS"].values.tolist()
]
stopword_list += academic_stopwords
stopword_list = list(set(stopword_list))

In [None]:
from rake_nltk import Rake

def translate_English(x):
    """Extract key phrases of one to three words from the text `x` with RAKE.

    Phrases are delimited by the notebook-level `stopword_list` and by the
    characters in `string.punctuation`. The return value is whatever
    `Rake.get_ranked_phrases()` yields for this text.
    """
    extractor = Rake(
        stopwords=stopword_list,
        punctuations=string.punctuation,
        min_length=1,
        max_length=3,
    )
    extractor.extract_keywords_from_text(x)
    ranked_phrases = extractor.get_ranked_phrases()
    return ranked_phrases

In [None]:
%%time
# Run the RAKE extraction on the abstracts flagged as English only.
df_abstract_en = (
    df_abstract
    .loc[lambda df: df.Is_English == True]
    .assign(RakeAbstract=lambda df: df.Abstract.apply(translate_English))
)

In [None]:
# Preview the English abstracts together with their extracted key phrases.
df_abstract_en.head()

In [None]:
# (rows, columns) of the English-only abstract table.
df_abstract_en.shape

In [None]:
# Write only the paper id and its key phrases, tab-separated and without the
# index; the cleaned abstract text and the language flag are not saved.
# NOTE(review): RakeAbstract holds the return value of translate_English, so
# it is written in pandas' text form of that object -- parse it back on load.
df_abstract_en.to_csv(DIR + 'PaperAbstractProcessed.csv', sep='\t', index=False, columns=['PaperId','RakeAbstract'])