In [None]:
import os
import numpy as np
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import pickle

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
#Download the NLTK data packages this notebook relies on for the tokenizing,
#POS-tagging and WordNet-lemmatizing calls in the lemmatization cell below
#NOTE(review): newer NLTK releases may expect differently named resources
#(e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') - confirm against the installed version
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

In [2]:
#Mount Google Drive at /content/drive so the project folder is reachable as a normal path
drive.mount('/content/drive')
#Project folder on the mounted drive; the pickles in this notebook are read from and written to it
DRIVE_PATH = "/content/drive/MyDrive/REU evolution of scientific fields"

Mounted at /content/drive


# **Data Processing and Cleaning**

See the Summary Statistics and Exploration notebook for discussion.

In [3]:
#Read the full, unfiltered dataset from the project folder on Drive
#NOTE: pickle.load can execute arbitrary code - only load pickles from a trusted source
fields_path = os.path.join(DRIVE_PATH, 'fields_df.pkl')
with open(fields_path, 'rb') as pkl_file:
    fields_df = pickle.load(pkl_file)

In [4]:
#Filtering thresholds, named so the cut-offs are easy to find and adjust
MAX_PAPERS = 200000         #number of leading rows of fields_df to keep
MIN_YEAR = 1950             #earliest publication year ('Y') to keep
MIN_ABSTRACT_WORDS = 30     #smallest abstract length (IA['IndexLength']) to keep
MAX_ABSTRACT_WORDS = 500    #largest abstract length (IA['IndexLength']) to keep

#Keep only the first MAX_PAPERS papers; later papers are less recent and have lower log probability
new_fields_df = fields_df[:MAX_PAPERS]

#Drop papers that are missing abstracts/references
new_fields_df = new_fields_df.dropna(subset=['RId', 'IA']).reset_index(drop=True)

#Remove papers published prior to MIN_YEAR
new_fields_df = new_fields_df[new_fields_df['Y'] >= MIN_YEAR]

#Remove papers with abstracts shorter than MIN_ABSTRACT_WORDS or longer than MAX_ABSTRACT_WORDS.
#The lengths are extracted once and checked with between(), which is inclusive on both ends,
#so this keeps exactly the rows that pass ">= min" and "<= max".
abstract_lengths = new_fields_df['IA'].apply(lambda ia: ia.get('IndexLength'))
new_fields_df = new_fields_df[abstract_lengths.between(MIN_ABSTRACT_WORDS, MAX_ABSTRACT_WORDS)]

len(new_fields_df) #Leaves 59969 papers

59969

In [5]:
def _rebuild_abstract(inverted_abstract):
    """Turn one inverted-index abstract back into plain text.

    Reads 'IndexLength' (number of token slots) and 'InvertedIndex'
    (mapping of word -> list of positions) from the given dict, places every
    word at each of its positions, and joins the slots with single spaces.
    Slots that no word claims stay as empty strings.
    """
    slots = [""] * inverted_abstract['IndexLength']
    for token, positions in inverted_abstract['InvertedIndex'].items():
        for position in positions:
            slots[position] = token
    return " ".join(slots)

#Obtain vector of abstracts, one string per row of the dataframe
abstracts = [_rebuild_abstract(inverted) for inverted in new_fields_df.IA]

#Add (un-inverted) abstracts as column A of the dataframe
new_fields_df = new_fields_df.assign(A=abstracts)

In [6]:
#For simplicity, drop every paper whose abstract also appears on another paper
#(keep=False marks all copies of a repeated abstract, so none of them survive)
duplicates = new_fields_df['A'].duplicated(keep=False)
new_fields_df = new_fields_df.loc[~duplicates]

len(new_fields_df) #Leaves 59384 papers

59384

In [9]:
#Functions to lemmatize the words in a piece of text
#Adapted from: https://gaurav5430.medium.com/using-nltk-for-lemmatizing-sentences-c1bfff963258
#Single lemmatizer instance, created once and used by lemmatize_sentence below
lemmatizer = WordNetLemmatizer()
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the first character of the tag is inspected:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.

    Returns None for every other tag (including an empty string), which the
    caller treats as "no WordNet tag available".
    """
    initial = nltk_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'N':
        return wordnet.NOUN
    if initial == 'R':
        return wordnet.ADV
    #no WordNet counterpart is mapped for any other tag
    return None
def lemmatize_sentence(sentence):
    """Return `sentence` with each token replaced by its lemma.

    The text is tokenized with nltk.word_tokenize and tagged with nltk.pos_tag.
    Each tag is converted with nltk_tag_to_wordnet_tag; tokens that get a
    WordNet tag are lemmatized with the module-level `lemmatizer`, and tokens
    without one are kept unchanged. The resulting tokens are joined with
    single spaces, so the original spacing around punctuation is not preserved.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        #tokens without a WordNet tag pass through as-is
        lemmas.append(token if wordnet_pos is None else lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

## Lemmatize abstracts and add to dataframe as column LA
#Use assign() (as is done for column A) instead of item assignment: new_fields_df is the
#result of boolean filtering, and setting a column on such a frame in place can trigger
#pandas' SettingWithCopyWarning. assign() returns a new frame with the extra column.
new_fields_df = new_fields_df.assign(LA=[lemmatize_sentence(text) for text in new_fields_df.A])

In [None]:
#Write the filtered dataframe (with the A and LA columns) back to the project folder
filtered_path = os.path.join(DRIVE_PATH, 'filtered_new_fields_df.pkl')
with open(filtered_path, 'wb') as out_file:
    pickle.dump(new_fields_df, out_file)