In [49]:
#%pip install datasets transformers
#%pip install -r requirements.txt
#%pip install stanfordcorenlp
#%pip install stemming
#%pip install nltk


In [50]:
# Data imports
import pandas as pd
from datasets import load_dataset

# Page Rank imports
import nltk
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
import networkx as nx

from position_rank import position_rank
from tokenizer import StanfordCoreNlpTokenizer


In [51]:
dataset = load_dataset("midas/duc2001", "raw")
data = dataset['test']

In [52]:
df = data.data.to_pandas()

print(df.info())
print('----')
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      308 non-null    object
 1   document                308 non-null    object
 2   doc_bio_tags            308 non-null    object
 3   extractive_keyphrases   308 non-null    object
 4   abstractive_keyphrases  308 non-null    object
 5   other_metadata          308 non-null    object
dtypes: object(6)
memory usage: 14.6+ KB
None
----
              id                                           document  \
0  AP881222-0089  [Here, ,, at, a, glance, ,, are, developments,...   
1  AP880331-0140  [Rumbling, spring, thunderstorms, have, announ...   
2  AP880629-0159  [Two, U.S., Air, Force, F-16, fighter, jets, c...   
3  AP881216-0017  [A, recommended, halt, to, the, government, 's...   
4  AP880801-0195  [A, split, ,, charred, tree, stump, is, a, clu...   

  

In [55]:
# POS tag words, find synset of word and create new list

synset_documents = [] # list of the list of synsets for each document
for doc in df['document']:
    doc_sets = [] # for this document, the list of synsets

    for word in doc:
        #print(word)
        pos_tag = nltk.pos_tag([word])[0][1]
        #doc_tags.append(pos_tag)
        #print(pos_tag)
        
        if pos_tag.__contains__('NN'):
            part = wn.NOUN
        elif pos_tag.__contains__('V'):
            part = wn.VERB  
        elif pos_tag.__contains__('JJ'):
            part = wn.ADJ
        elif pos_tag.__contains__('RB'):
            part = wn.ADV
        else:
            continue
        synset = wn.synsets(word, pos=part)
        #print(synset)
        
        if len(synset) > 0:
            doc_sets.append(synset[0]) # append the synset for this word

    synset_documents.append(doc_sets)
    

#df.insert(6, "POS", tags)
    

In [92]:
# Construct weighted graph of synsets

#print(one.path_similarity(two))

doc = 1

G = nx.Graph()

for i in range(len(synset_documents[doc])):
    for j in range(len(synset_documents[doc])):
        one = synset_documents[doc][i]
        two = synset_documents[doc][j]
        G.add_edge(one, two, weight=one.path_similarity(two))

print(G)

Graph with 178 nodes and 15931 edges


In [93]:
# Apply PageRank algorithm on the graph
rank = nx.pagerank(G)

key_words = list(rank.keys())[:10]

for i in range(len(key_words)):
    key_words[i] = key_words[i].name().split(".")[0]

print(key_words)

['rumble', 'spring', 'thunderstorm', 'have', 'announce', 'get_down', 'unofficial', 'tornado', 'season', 'run']


In [94]:
# Compare pagerank results to extractive keyphrases
answers = " ".join(df['extractive_keyphrases'][doc])

hit_rate = 0

for word in key_words:
    if word in answers:
        hit_rate += 1

precision = hit_rate / len(key_words)
recall = hit_rate / len(df['extractive_keyphrases'][doc])
f1 = 2/((1/precision) + (1/recall))

print(hit_rate)
print(precision)
print(recall)
print(f1)


4
0.4
0.5
0.4444444444444444


In [95]:
print(df['extractive_keyphrases'][doc])

['tornado season' 'spring thunderstorms' 'tornadoes' 'texas'


# Position Rank

title = "PositionRank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents"
abstract = """The large and growing amounts of online scholarly data present both challenges and
opportunities to enhance knowledge discovery. One such challenge is to automatically extract a small set of keyphrases
from a document that can accurately describe the document’s content and can facilitate fast information processing. In
this paper, we propose PositionRank, an
unsupervised model for keyphrase extraction from scholarly documents that incorporates information from all positions of a
word’s occurrences into a biased PageRank. Our model obtains remarkable improvements in performance over PageRank models that do not take into account
word positions as well as over strong baselines for this task. Specifically, on several
datasets of research papers, PositionRank
achieves improvements as high as 29.09%."""
document = df['document'][1]

tokenizer = StanfordCoreNlpTokenizer("http://localhost", port = 9000)
position_rank(title + " " + abstract, tokenizer)
