In [1]:
import numpy as np
import pandas as pd
import re
import string
import operator
import six
from six.moves import range
import math


import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
stopwords = stopwords.words('english')
nltk.download('wordnet')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
        yield subtree.leaves()

def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    #word = stemmer.stem(word)
    #word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted


def get_terms(tree):
    for leaf in leaves(tree):
        term = [ normalise(w) for w,t in leaf if acceptable_word(w) ]
        yield term

In [3]:
text = """Artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals."""

In [4]:
text.split(' ')

['Artificial',
 'intelligence',
 '(AI),',
 'sometimes',
 'called',
 'machine',
 'intelligence,',
 'is',
 'intelligence',
 'demonstrated',
 'by',
 'machines,',
 'unlike',
 'the',
 'natural',
 'intelligence',
 'displayed',
 'by',
 'humans',
 'and',
 'animals.']

In [5]:
# Used when tokenizing words
sentence_re = r'''(?x)      # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

#Taken from Su Nam Kim Paper
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""

toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)
chunker = nltk.RegexpParser(grammar)
tree = chunker.parse(postoks)
terms = get_terms(tree)

In [6]:
words = []
for term in terms:
  term1 = ' '.join([str(elem) for elem in term])
  words.append(term1)

In [7]:
words

['artificial intelligence',
 'ai',
 'machine intelligence',
 'intelligence',
 'machines',
 'natural intelligence',
 'humans',
 'animals']

In [8]:
word_embed = []
for term in terms:
  term1 = ' '.join([str(elem) for elem in term])
  print(term1)
  word_embed.append(get_w2v(term1,model))

In [None]:
# embed_dict = dict({})

# for key,term in enumerate(terms):
#     term1 = ' '.join([str(elem) for elem in term])
#     embed_dict[key] = get_w2v(term1,model)

In [None]:
#embed_dict

## GloVe Embeddings

In [9]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2020-11-16 15:19:03--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2020-11-16 15:19:03--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2020-11-16 15:19:04--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [10]:
!unzip glove*.zip

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [11]:
def load_glove_model(glove_file):
  model = {}
  f = open(glove_file)
  for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
    model[word] = coefs
  return model
  

In [12]:
model = load_glove_model('/content/glove.840B.300d.txt')

In [13]:
def get_w2v(sentence, model):
    """
    :param sentence: inputs a single sentences whose word embedding is to be extracted.
    :param model: inputs glove model.
    :return: returns numpy array containing word embedding of all words    in input sentence.
    """
    return np.array([model.get(val, np.zeros(100)) for val in sentence.split()], dtype=np.float64)

In [14]:
ai_kw = get_w2v("artificial intelligence", model)

In [15]:
ai_kw.shape

(2, 300)

In [16]:
for word in words:
  result = [i+1 for i,w in enumerate(text.split()) if w.lower() == word]
  print(word,result)  

artificial intelligence []
ai []
machine intelligence []
intelligence [2, 9, 16]
machines []
natural intelligence []
humans [19]
animals []


### TF-IDF

In [17]:
text ="""Keyphrases are capable of providing semantic metadata characterizing documents and producing an overview of the content of a document. Since keyphrase extraction is able to facilitate the management, categorization, and retrieval of information, it has received much attention in recent years. There are three approaches to address keyphrase extraction: (i) traditional two-step ranking method, (ii) sequence labeling and (iii) generation using neural networks. Two-step ranking approach is based on feature engineering, which is labor intensive and domain dependent. Sequence labeling is not able to tackle overlapping phrases. Generation methods (i.e., Sequence-to-sequence neural network models) overcome those shortcomings, so they have been widely studied and gain state-of-the-art performance. However, generation methods can not utilize context information effectively. In this paper, we propose a novelty Span Keyphrase Extraction model that extracts span-based feature representation of keyphrase directly from all the content tokens. In this way, our model obtains representation for each keyphrase and further learns to capture the interaction between keyphrases in one document to get better ranking results. In addition, with the help of tokens, our model is able to extract overlapped keyphrases. Experimental results on the benchmark datasets show that our proposed model outperforms the existing methods by a large margin"""

In [18]:
keywords = re.findall(r'[a-zA-Z]\w+',text)
len(keywords)   

204

In [19]:
stopwords.extend(['there'])

In [20]:
keywords = [word.lower() for word in keywords if not word in stopwords]

In [21]:
len(keywords)

137

In [22]:
df = pd.DataFrame(list(set(keywords)),columns=['keywords'])

In [23]:
def weightage(word,text,number_of_documents=1):
  try:
    word_list = re.findall(word,text)
    number_of_times_word_appeared =len(word_list)
    tf = number_of_times_word_appeared/float(len(text))
    idf = np.log((number_of_documents)/float(number_of_times_word_appeared))
    tf_idf = tf*idf
    return number_of_times_word_appeared,tf,idf ,tf_idf
  except ZeroDivisionError:
    return '0000'

In [24]:
df['number_of_times_word_appeared'] = df['keywords'].apply(lambda x: weightage(x,text)[0])
df['tf'] = df['keywords'].apply(lambda x: weightage(x,text)[1])
df['idf'] = df['keywords'].apply(lambda x: weightage(x,text)[2])
df['tf_idf'] = df['keywords'].apply(lambda x: weightage(x,text)[3])

In [25]:
df.tf = df.tf.astype(int)
df.idf = df.idf.astype(int)
df.tf_idf = df.tf_idf.astype(int)

In [26]:
df = df.sort_values('tf_idf',ascending=True)

In [27]:
df

Unnamed: 0,keywords,number_of_times_word_appeared,tf,idf,tf_idf
0,show,1,0,0,0
72,two,3,0,-1,0
71,phrases,4,0,-1,0
70,producing,1,0,0,0
69,there,0,0,0,0
...,...,...,...,...,...
28,representation,2,0,0,0
27,effectively,1,0,0,0
26,tokens,2,0,0,0
36,proposed,1,0,0,0


In [28]:
from gensim.summarization import keywords

In [29]:
text = lemmatizer.lemmatize(text)

In [30]:
values = keywords(text=text,split='\n',scores=True)

In [31]:
values

[('models', 0.3001112501765992),
 ('model', 0.3001112501765992),
 ('keyphrases', 0.27404194456014663),
 ('ranking', 0.27283931746305884),
 ('keyphrase extraction', 0.22615620697286487),
 ('overlapping', 0.1867938208832368),
 ('span', 0.1866519651190099),
 ('extract overlapped', 0.18253214513440996),
 ('sequence', 0.18057995267706373),
 ('extracts', 0.17827046938558314),
 ('method', 0.16435153680784498),
 ('methods', 0.16435153680784498),
 ('generation', 0.1619082838963352),
 ('feature', 0.16104941841724807),
 ('context', 0.15210999120751884),
 ('information', 0.15210999120751875)]

In [32]:
data = pd.DataFrame(values,columns=['keyword','score'])
data = data.sort_values('score',ascending=False)

In [33]:
data

Unnamed: 0,keyword,score
0,models,0.300111
1,model,0.300111
2,keyphrases,0.274042
3,ranking,0.272839
4,keyphrase extraction,0.226156
5,overlapping,0.186794
6,span,0.186652
7,extract overlapped,0.182532
8,sequence,0.18058
9,extracts,0.17827


## Keyphrases based on Frequency

In [34]:
# Required functions for RAKE
def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that are have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split in to sentences.
    """
    sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
    sentences = sentence_delimiters.split(text)
    return sentences


def build_stop_word_regex(stop_word_file_path):
    stop_word_list = load_stop_words(stop_word_file_path)
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = '\\b' + word + '\\b'
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern


def generate_candidate_keywords(sentence_list, stopword_pattern, min_char_length=1, max_words_length=5):
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length):
                phrase_list.append(phrase)
    return phrase_list


def is_acceptable(phrase, min_char_length, max_words_length):

    # a phrase must have a min length in characters
    if len(phrase) < min_char_length:
        return 0

    # a phrase must have a max number of words
    words = phrase.split()
    if len(words) > max_words_length:
        return 0

    digits = 0
    alpha = 0
    for i in range(0, len(phrase)):
        if phrase[i].isdigit():
            digits += 1
        elif phrase[i].isalpha():
            alpha += 1

    # a phrase must have at least one alpha character
    if alpha == 0:
        return 0

    # a phrase must have more alpha than digits characters
    if digits > alpha:
        return 0
    return 1


def calculate_word_scores(phraseList):
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        word_list_length = len(word_list)
        word_list_degree = word_list_length - 1
        # if word_list_degree > 3: word_list_degree = 3 #exp.
        for word in word_list:
            word_frequency.setdefault(word, 0)
            word_frequency[word] += 1
            word_degree.setdefault(word, 0)
            word_degree[word] += word_list_degree  # orig.
            # word_degree[word] += 1/(word_list_length*1.0) #exp.
    for item in word_frequency:
        word_degree[item] = word_degree[item] + word_frequency[item]

    # Calculate Word scores = deg(w)/freq(w)
    word_score = {}
    for item in word_frequency:
        word_score.setdefault(item, 0)
        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  #orig.
    # word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
    return word_score


def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=2):
    keyword_candidates = {}

    for phrase in phrase_list:
        if min_keyword_frequency > 2:
            if phrase_list.count(phrase) < min_keyword_frequency:
                continue
        keyword_candidates.setdefault(phrase, 0)
        word_list = separate_words(phrase, 0)
        candidate_score = 0
        for word in word_list:
            candidate_score += word_score[word]
        keyword_candidates[phrase] = candidate_score
    return keyword_candidates


In [35]:
def build_stop_word_regex(stop_word_file_path):
    stop_word_list = load_stop_words(stop_word_file_path)
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = '\\b' + word + '\\b'
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern

In [36]:
stopword_pattern = build_stop_word_regex('/content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/Project/sklearn_stopwords.txt')

In [37]:
def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that are have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words

In [38]:
def generate_candidate_keywords(sentence_list, stopword_pattern, min_char_length=1, max_words_length=5):
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length):
                phrase_list.append(phrase)
    return phrase_list

In [39]:
text = """A challenging problem faced by researchers and developers
of distributed real-time and embedded (DRE) systems is 
devising and implementing effective adaptive resource 
management strategies that can meet end-to-end quality of service
(QoS) requirements in varying operational conditions. This
paper presents two contributions to research in adaptive 
resource management for DRE systems. First, we describe the
structure and functionality of the Hybrid Adaptive 
Resourcemanagement Middleware (HyARM), which provides 
adaptive resource management using hybrid control techniques
for adapting to workload fluctuations and resource 
availability. Second, we evaluate the adaptive behavior of HyARM
via experiments on a DRE multimedia system that distributes
video in real-time. Our results indicate that HyARM yields
predictable, stable, and high system performance, even in the
face of fluctuating workload and resource availability"""

In [40]:
word_list = separate_words(text,3)

In [41]:
phraseList = generate_candidate_keywords(sentence_list=word_list,stopword_pattern=stopword_pattern)

In [42]:
phraseList

['challenging',
 'problem',
 'faced',
 'researchers',
 'developers',
 'distributed',
 'real-time',
 'embedded',
 'systems',
 'devising',
 'implementing',
 'effective',
 'adaptive',
 'resource',
 'management',
 'strategies',
 'meet',
 'end-',
 '-end',
 'quality',
 'service',
 'requirements',
 'varying',
 'operational',
 'conditions',
 'paper',
 'presents',
 'contributions',
 'research',
 'adaptive',
 'resource',
 'management',
 'systems',
 'structure',
 'functionality',
 'hybrid',
 'adaptive',
 'resourcemanagement',
 'middleware',
 'hyarm',
 'provides',
 'adaptive',
 'resource',
 'management',
 'using',
 'hybrid',
 'control',
 'techniques',
 'adapting',
 'workload',
 'fluctuations',
 'resource',
 'availability',
 'second',
 'evaluate',
 'adaptive',
 'behavior',
 'hyarm',
 'experiments',
 'multimedia',
 'distributes',
 'video',
 'real-time',
 'results',
 'indicate',
 'hyarm',
 'yields',
 'predictable',
 'stable',
 'high',
 'performance',
 'face',
 'fluctuating',
 'workload',
 'resource',

In [43]:
word_score = calculate_word_scores(phraseList=phraseList)

In [44]:
generate_candidate_keyword_scores(phrase_list=phraseList, word_score=word_score)

{'-end': 1.0,
 'adapting': 1.0,
 'adaptive': 1.0,
 'availability': 1.0,
 'behavior': 1.0,
 'challenging': 1.0,
 'conditions': 1.0,
 'contributions': 1.0,
 'control': 1.0,
 'developers': 1.0,
 'devising': 1.0,
 'distributed': 1.0,
 'distributes': 1.0,
 'effective': 1.0,
 'embedded': 1.0,
 'end-': 1.0,
 'evaluate': 1.0,
 'experiments': 1.0,
 'face': 1.0,
 'faced': 1.0,
 'fluctuating': 1.0,
 'fluctuations': 1.0,
 'functionality': 1.0,
 'high': 1.0,
 'hyarm': 1.0,
 'hybrid': 1.0,
 'implementing': 1.0,
 'indicate': 1.0,
 'management': 1.0,
 'meet': 1.0,
 'middleware': 1.0,
 'multimedia': 1.0,
 'operational': 1.0,
 'paper': 1.0,
 'performance': 1.0,
 'predictable': 1.0,
 'presents': 1.0,
 'problem': 1.0,
 'provides': 1.0,
 'quality': 1.0,
 'real-time': 1.0,
 'requirements': 1.0,
 'research': 1.0,
 'researchers': 1.0,
 'resource': 1.0,
 'resourcemanagement': 1.0,
 'results': 1.0,
 'second': 1.0,
 'service': 1.0,
 'stable': 1.0,
 'strategies': 1.0,
 'structure': 1.0,
 'systems': 1.0,
 'techniq

In [45]:
def get_w2v(sentence, model):
  feature_vec = np.array([model.get(val, np.zeros(300)) for val in sentence.split()], dtype=np.float64)
  return feature_vec

In [46]:
get_w2v("artificial",model)

array([[-0.66058999,  0.2348    , -0.021227  , -0.32736999, -0.062493  ,
         0.27303001,  0.1733    ,  0.70947999,  0.21335   ,  1.48800004,
        -0.34101999,  0.1902    , -0.37272999,  0.33818001,  0.052431  ,
        -0.17522   , -0.30445999,  1.74199998,  0.068747  , -0.42475   ,
         0.041578  ,  0.42822999, -0.098792  ,  0.35168999, -0.34538999,
         0.04463   ,  0.078867  , -0.084303  ,  0.29864001, -0.46726999,
        -0.24345   ,  0.12309   , -0.025794  , -0.48201001,  0.81326002,
        -0.18169001,  0.44051   , -0.14026999,  0.3556    , -0.12732001,
         0.20464   , -0.019575  , -0.0212    ,  0.42548001,  0.33680999,
        -0.12704   , -0.32006001, -0.24986   ,  0.22744   ,  0.026694  ,
        -0.19501001, -0.055653  , -0.47916999,  0.20393001,  0.037399  ,
        -0.13944   , -0.12375   ,  0.35220999, -0.37195   , -0.42001   ,
        -0.15995   , -0.34676999,  0.13706   ,  0.23802   ,  0.22725999,
         0.37389001, -0.021143  , -0.75593001, -0.8

## TextRank

In [47]:
def clean(text):
    text = text.lower()
    printable = set(string.printable)
    text = filter(lambda x: x in printable, text)
    text = "".join(list(text))
    return text

Cleaned_text = clean(text)
# print(Cleaned_text)
text = word_tokenize(Cleaned_text)

print ("Tokenized Text: \n")
print (text)

Tokenized Text: 

['a', 'challenging', 'problem', 'faced', 'by', 'researchers', 'and', 'developers', 'of', 'distributed', 'real-time', 'and', 'embedded', '(', 'dre', ')', 'systems', 'is', 'devising', 'and', 'implementing', 'effective', 'adaptive', 'resource', 'management', 'strategies', 'that', 'can', 'meet', 'end-to-end', 'quality', 'of', 'service', '(', 'qos', ')', 'requirements', 'in', 'varying', 'operational', 'conditions', '.', 'this', 'paper', 'presents', 'two', 'contributions', 'to', 'research', 'in', 'adaptive', 'resource', 'management', 'for', 'dre', 'systems', '.', 'first', ',', 'we', 'describe', 'the', 'structure', 'and', 'functionality', 'of', 'the', 'hybrid', 'adaptive', 'resourcemanagement', 'middleware', '(', 'hyarm', ')', ',', 'which', 'provides', 'adaptive', 'resource', 'management', 'using', 'hybrid', 'control', 'techniques', 'for', 'adapting', 'to', 'workload', 'fluctuations', 'and', 'resource', 'availability', '.', 'second', ',', 'we', 'evaluate', 'the', 'adaptive',

In [48]:
pos_tag = nltk.pos_tag(text)

print ("Tokenized Text with POS tags: \n")
print (pos_tag)

Tokenized Text with POS tags: 

[('a', 'DT'), ('challenging', 'NN'), ('problem', 'NN'), ('faced', 'VBN'), ('by', 'IN'), ('researchers', 'NNS'), ('and', 'CC'), ('developers', 'NNS'), ('of', 'IN'), ('distributed', 'VBN'), ('real-time', 'NN'), ('and', 'CC'), ('embedded', 'VBD'), ('(', '('), ('dre', 'NN'), (')', ')'), ('systems', 'NNS'), ('is', 'VBZ'), ('devising', 'VBG'), ('and', 'CC'), ('implementing', 'VBG'), ('effective', 'JJ'), ('adaptive', 'JJ'), ('resource', 'NN'), ('management', 'NN'), ('strategies', 'NNS'), ('that', 'WDT'), ('can', 'MD'), ('meet', 'VB'), ('end-to-end', 'JJ'), ('quality', 'NN'), ('of', 'IN'), ('service', 'NN'), ('(', '('), ('qos', 'NN'), (')', ')'), ('requirements', 'NNS'), ('in', 'IN'), ('varying', 'VBG'), ('operational', 'JJ'), ('conditions', 'NNS'), ('.', '.'), ('this', 'DT'), ('paper', 'NN'), ('presents', 'VBZ'), ('two', 'CD'), ('contributions', 'NNS'), ('to', 'TO'), ('research', 'NN'), ('in', 'IN'), ('adaptive', 'JJ'), ('resource', 'NN'), ('management', 'NN'),

In [49]:


wordnet_lemmatizer = WordNetLemmatizer()

adjective_tags = ['JJ','JJR','JJS']

lemmatized_text = []

for word in pos_tag:
    if word[1] in adjective_tags:
        lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a")))
    else:
        lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun
        
print ("Text tokens after lemmatization of adjectives and nouns: \n")
print (lemmatized_text)

Text tokens after lemmatization of adjectives and nouns: 

['a', 'challenging', 'problem', 'faced', 'by', 'researcher', 'and', 'developer', 'of', 'distributed', 'real-time', 'and', 'embedded', '(', 'dre', ')', 'system', 'is', 'devising', 'and', 'implementing', 'effective', 'adaptive', 'resource', 'management', 'strategy', 'that', 'can', 'meet', 'end-to-end', 'quality', 'of', 'service', '(', 'qos', ')', 'requirement', 'in', 'varying', 'operational', 'condition', '.', 'this', 'paper', 'present', 'two', 'contribution', 'to', 'research', 'in', 'adaptive', 'resource', 'management', 'for', 'dre', 'system', '.', 'first', ',', 'we', 'describe', 'the', 'structure', 'and', 'functionality', 'of', 'the', 'hybrid', 'adaptive', 'resourcemanagement', 'middleware', '(', 'hyarm', ')', ',', 'which', 'provides', 'adaptive', 'resource', 'management', 'using', 'hybrid', 'control', 'technique', 'for', 'adapting', 'to', 'workload', 'fluctuation', 'and', 'resource', 'availability', '.', 'second', ',', 'we', '

In [50]:
pos_tag = nltk.pos_tag(lemmatized_text)

print ("Lemmatized text with POS tags: \n")
print (pos_tag)

Lemmatized text with POS tags: 

[('a', 'DT'), ('challenging', 'NN'), ('problem', 'NN'), ('faced', 'VBN'), ('by', 'IN'), ('researcher', 'NN'), ('and', 'CC'), ('developer', 'NN'), ('of', 'IN'), ('distributed', 'VBN'), ('real-time', 'NN'), ('and', 'CC'), ('embedded', 'VBD'), ('(', '('), ('dre', 'NN'), (')', ')'), ('system', 'NN'), ('is', 'VBZ'), ('devising', 'VBG'), ('and', 'CC'), ('implementing', 'VBG'), ('effective', 'JJ'), ('adaptive', 'JJ'), ('resource', 'NN'), ('management', 'NN'), ('strategy', 'NN'), ('that', 'WDT'), ('can', 'MD'), ('meet', 'VB'), ('end-to-end', 'JJ'), ('quality', 'NN'), ('of', 'IN'), ('service', 'NN'), ('(', '('), ('qos', 'NN'), (')', ')'), ('requirement', 'NN'), ('in', 'IN'), ('varying', 'VBG'), ('operational', 'JJ'), ('condition', 'NN'), ('.', '.'), ('this', 'DT'), ('paper', 'NN'), ('present', 'JJ'), ('two', 'CD'), ('contribution', 'NN'), ('to', 'TO'), ('research', 'NN'), ('in', 'IN'), ('adaptive', 'JJ'), ('resource', 'NN'), ('management', 'NN'), ('for', 'IN'), 

In [51]:
stopwords = []

wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] 

for word in pos_tag:
    if word[1] not in wanted_POS:
        stopwords.append(word[0])

punctuations = list(str(string.punctuation))

stopwords = stopwords + punctuations

In [52]:
processed_text = []
for word in lemmatized_text:
    if word not in stopwords:
        processed_text.append(word)
print (processed_text)

['challenging', 'problem', 'researcher', 'developer', 'real-time', 'dre', 'system', 'devising', 'implementing', 'effective', 'adaptive', 'resource', 'management', 'strategy', 'end-to-end', 'quality', 'service', 'qos', 'requirement', 'varying', 'operational', 'condition', 'paper', 'present', 'contribution', 'research', 'adaptive', 'resource', 'management', 'dre', 'system', 'structure', 'functionality', 'hybrid', 'adaptive', 'resourcemanagement', 'middleware', 'hyarm', 'adaptive', 'resource', 'management', 'using', 'hybrid', 'control', 'technique', 'adapting', 'fluctuation', 'resource', 'availability', 'second', 'adaptive', 'behavior', 'hyarm', 'experiment', 'dre', 'multimedia', 'system', 'video', 'real-time', 'result', 'hyarm', 'yield', 'predictable', 'stable', 'high', 'system', 'performance', 'face', 'fluctuating', 'resource', 'availability']


In [53]:
vocabulary = list(set(processed_text))
print (vocabulary)

['service', 'face', 'system', 'structure', 'fluctuating', 'resource', 'hyarm', 'strategy', 'operational', 'fluctuation', 'effective', 'technique', 'implementing', 'second', 'varying', 'functionality', 'using', 'devising', 'performance', 'resourcemanagement', 'adaptive', 'stable', 'condition', 'qos', 'research', 'behavior', 'real-time', 'middleware', 'paper', 'researcher', 'developer', 'problem', 'control', 'multimedia', 'requirement', 'result', 'high', 'challenging', 'experiment', 'dre', 'end-to-end', 'contribution', 'availability', 'yield', 'predictable', 'video', 'quality', 'adapting', 'management', 'hybrid', 'present']


In [54]:
len(vocabulary)

51

In [55]:
vocab_len = len(vocabulary)

weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)

score = np.zeros((vocab_len),dtype=np.float32)
window_size = 3
covered_coocurrences = []

for i in range(0,vocab_len):
    score[i]=1
    for j in range(0,vocab_len):
        if j==i:
            weighted_edge[i][j]=0
        else:
            for window_start in range(0,(len(processed_text)-window_size)):
                
                window_end = window_start+window_size
                
                window = processed_text[window_start:window_end]
                
                if (vocabulary[i] in window) and (vocabulary[j] in window):
                    
                    index_of_i = window_start + window.index(vocabulary[i])
                    index_of_j = window_start + window.index(vocabulary[j])
                    
                    # index_of_x is the absolute position of the xth term in the window 
                    # (counting from 0) 
                    # in the processed_text
                      
                    if [index_of_i,index_of_j] not in covered_coocurrences:
                        weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)
                        covered_coocurrences.append([index_of_i,index_of_j])

In [56]:
inout = np.zeros((vocab_len),dtype=np.float32)

for i in range(0,vocab_len):
    for j in range(0,vocab_len):
        inout[i]+=weighted_edge[i][j]

In [57]:

MAX_ITERATIONS = 50
d=0.85
threshold = 0.0001 #convergence threshold

for iter in range(0,MAX_ITERATIONS):
    prev_score = np.copy(score)
    
    for i in range(0,vocab_len):
        
        summation = 0
        for j in range(0,vocab_len):
            if weighted_edge[i][j] != 0:
                summation += (weighted_edge[i][j]/inout[j])*score[j]
                
        score[i] = (1-d) + d*(summation)
    
    if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition
        print("Converging at iteration "+str(iter)+"....")
        break

Converging at iteration 29....


In [58]:
for i in range(0,vocab_len):
    print("Score of "+vocabulary[i]+": "+str(score[i]))

Score of service: 0.92646396
Score of face: 0.76746017
Score of system: 2.5743918
Score of structure: 0.72268647
Score of fluctuating: 0.6502049
Score of resource: 2.7561345
Score of hyarm: 1.9326094
Score of strategy: 0.7752871
Score of operational: 0.9571497
Score of fluctuation: 0.7626338
Score of effective: 0.7236684
Score of technique: 0.79915214
Score of implementing: 0.74068487
Score of second: 0.71423787
Score of varying: 0.96057063
Score of functionality: 0.7243751
Score of using: 0.71277857
Score of devising: 0.73387665
Score of performance: 0.770985
Score of resourcemanagement: 0.7105836
Score of adaptive: 3.0542428
Score of stable: 0.7988939
Score of condition: 0.94644433
Score of qos: 0.9465402
Score of research: 0.77476484
Score of behavior: 0.7077215
Score of real-time: 1.3990164
Score of middleware: 0.70693004
Score of paper: 0.92629725
Score of researcher: 0.95136374
Score of developer: 0.854713
Score of problem: 0.869543
Score of control: 0.781213
Score of multimedia:

In [59]:

phrases = []

phrase = " "
for word in lemmatized_text:
    
    if word in stopwords:
        if phrase!= " ":
            phrases.append(str(phrase).strip().split())
        phrase = " "
    elif word not in stopwords:
        phrase+=str(word)
        phrase+=" "

print("Partitioned Phrases (Candidate Keyphrases): \n")
print(phrases)

Partitioned Phrases (Candidate Keyphrases): 

[['challenging', 'problem'], ['researcher'], ['developer'], ['real-time'], ['dre'], ['system'], ['devising'], ['implementing', 'effective', 'adaptive', 'resource', 'management', 'strategy'], ['end-to-end', 'quality'], ['service'], ['qos'], ['requirement'], ['varying', 'operational', 'condition'], ['paper', 'present'], ['contribution'], ['research'], ['adaptive', 'resource', 'management'], ['dre', 'system'], ['structure'], ['functionality'], ['hybrid', 'adaptive', 'resourcemanagement', 'middleware'], ['hyarm'], ['adaptive', 'resource', 'management', 'using', 'hybrid', 'control', 'technique'], ['adapting'], ['fluctuation'], ['resource', 'availability'], ['second'], ['adaptive', 'behavior'], ['hyarm'], ['experiment'], ['dre', 'multimedia', 'system'], ['video'], ['real-time'], ['result'], ['hyarm', 'yield', 'predictable'], ['stable'], ['high', 'system', 'performance'], ['face'], ['fluctuating']]


In [60]:
unique_phrases = []

for phrase in phrases:
    if phrase not in unique_phrases:
        unique_phrases.append(phrase)

print("Unique Phrases (Candidate Keyphrases): \n")
print(unique_phrases)

Unique Phrases (Candidate Keyphrases): 

[['challenging', 'problem'], ['researcher'], ['developer'], ['real-time'], ['dre'], ['system'], ['devising'], ['implementing', 'effective', 'adaptive', 'resource', 'management', 'strategy'], ['end-to-end', 'quality'], ['service'], ['qos'], ['requirement'], ['varying', 'operational', 'condition'], ['paper', 'present'], ['contribution'], ['research'], ['adaptive', 'resource', 'management'], ['dre', 'system'], ['structure'], ['functionality'], ['hybrid', 'adaptive', 'resourcemanagement', 'middleware'], ['hyarm'], ['adaptive', 'resource', 'management', 'using', 'hybrid', 'control', 'technique'], ['adapting'], ['fluctuation'], ['resource', 'availability'], ['second'], ['adaptive', 'behavior'], ['experiment'], ['dre', 'multimedia', 'system'], ['video'], ['result'], ['hyarm', 'yield', 'predictable'], ['stable'], ['high', 'system', 'performance'], ['face'], ['fluctuating']]


In [61]:

for word in vocabulary:
    #print word
    for phrase in unique_phrases:
        if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):
            #if len(phrase)>1 then the current phrase is multi-worded.
            #if the word in vocabulary is present in unique_phrases as a single-word-phrase
            # and at the same time present as a word within a multi-worded phrase,
            # then I will remove the single-word-phrase from the list.
            unique_phrases.remove([word])
            
print("Thinned Unique Phrases (Candidate Keyphrases): \n")
print(unique_phrases)

Thinned Unique Phrases (Candidate Keyphrases): 

[['challenging', 'problem'], ['researcher'], ['developer'], ['real-time'], ['devising'], ['implementing', 'effective', 'adaptive', 'resource', 'management', 'strategy'], ['end-to-end', 'quality'], ['service'], ['qos'], ['requirement'], ['varying', 'operational', 'condition'], ['paper', 'present'], ['contribution'], ['research'], ['adaptive', 'resource', 'management'], ['dre', 'system'], ['structure'], ['functionality'], ['hybrid', 'adaptive', 'resourcemanagement', 'middleware'], ['adaptive', 'resource', 'management', 'using', 'hybrid', 'control', 'technique'], ['adapting'], ['fluctuation'], ['resource', 'availability'], ['second'], ['adaptive', 'behavior'], ['experiment'], ['dre', 'multimedia', 'system'], ['video'], ['result'], ['hyarm', 'yield', 'predictable'], ['stable'], ['high', 'system', 'performance'], ['face'], ['fluctuating']]


In [62]:
phrase_scores = []
keywords = []
for phrase in unique_phrases:
    phrase_score=0
    keyword = ''
    for word in phrase:
        keyword += str(word)
        keyword += " "
        phrase_score+=score[vocabulary.index(word)]
    phrase_scores.append(phrase_score)
    keywords.append(keyword.strip())

res = {keywords[i]: phrase_scores[i] for i in range(len(keywords))} 

# i=0
# for keyword in keywords:
#     print ("Keyword: '"+str(keyword)+"', Score: "+str(phrase_scores[i]))
#     i+=1

In [63]:
res_sorted_keys = sorted(res, key=res.get, reverse=True)
for r in res_sorted_keys[0:5]:
    print(r, res[r])

adaptive resource management using hybrid control technique 11.294519245624542
implementing effective adaptive resource management strategy 9.886425733566284
adaptive resource management 7.646785378456116
hybrid adaptive resourcemanagement middleware 5.826346695423126
dre multimedia system 5.1782031655311584


In [64]:
sorted_index = np.flip(np.argsort(phrase_scores),0)

keywords_num = 10

print("Keywords:\n")

for i in range(0,keywords_num):
    print(str(keywords[sorted_index[i]])+", ", end=' ')

Keywords:

adaptive resource management using hybrid control technique,  implementing effective adaptive resource management strategy,  adaptive resource management,  hybrid adaptive resourcemanagement middleware,  dre multimedia system,  dre system,  high system performance,  adaptive behavior,  hyarm yield predictable,  resource availability,  

In [65]:
keywords

['challenging problem',
 'researcher',
 'developer',
 'real-time',
 'devising',
 'implementing effective adaptive resource management strategy',
 'end-to-end quality',
 'service',
 'qos',
 'requirement',
 'varying operational condition',
 'paper present',
 'contribution',
 'research',
 'adaptive resource management',
 'dre system',
 'structure',
 'functionality',
 'hybrid adaptive resourcemanagement middleware',
 'adaptive resource management using hybrid control technique',
 'adapting',
 'fluctuation',
 'resource availability',
 'second',
 'adaptive behavior',
 'experiment',
 'dre multimedia system',
 'video',
 'result',
 'hyarm yield predictable',
 'stable',
 'high system performance',
 'face',
 'fluctuating']

### Ground Truth

adaptive resource management,
distributed real-time embedded system,
end-to-end quality of service,
service end-to-end quality,
hybrid adaptive resourcemanagement middleware,
hybrid control technique,
real-time video distribution system,
real-time corba specification,
video encoding/decoding,
resource reservation mechanism,
dynamic environment,
streaming service,
distribute real-time embed system,
hybrid system,
quality of service,
service quality


In [66]:
ground_truth = ['adaptive resource management', 'distributed real-time embedded system', 'end-to-end quality of service', 'service end-to-end quality', 'hybrid adaptive resourcemanagement middleware', 'hybrid control technique', 'real-time video distribution system', 'real-time corba specification', 'video encoding/decoding', 'resource reservation mechanism', 'dynamic environment', 'streaming service', 'distribute real-time embed system', 'hybrid system', 'quality of service','service quality']

In [67]:
len(ground_truth)

16

In [68]:
list(set(keywords) & (set(ground_truth)))

['adaptive resource management',
 'hybrid adaptive resourcemanagement middleware']

### Applying the above on KP_20k dataset

In [69]:
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/data5L.csv")

In [70]:
data.head()

Unnamed: 0,abstract,keyword,title
0,This paper proposes using virtual reality to e...,telepresence;animation;avatars;application sha...,virtually enhancing the perception of user act...
1,This paper presents an improved architecture o...,sigma delta modulators;analog-to-digital conve...,Dynamic range improvement of multistage multib...
2,"In this paper, we discuss the motivation and t...",enterprise information integration and interop...,An ontology modelling perspective on business ...
3,An overview of the self-organizing map algorit...,self-organizing map;learning vector quantization,The self-organizing map
4,The amygdala comprises part of an extended net...,social brain;amygdala;behavior;facial expression,The Amygdala and Development of the Social Brain


In [71]:
data['abs_keyword_count'] = data.keyword.str.strip().str.split(';').apply(len)

In [72]:
data.sample(5)

Unnamed: 0,abstract,keyword,title,abs_keyword_count
78867,Forage quality in grassland-savanna ecosystems...,landscape;modelling;monitoring;ecology;resourc...,Remote sensing of forage nutrients: Combining ...,6
319784,Application of network coding in wireless two-...,channel coding;ergodic capacity;log-likelihood...,Soft Network Coding in Wireless Two-Way Relay ...,7
437631,The increasing pervasiveness of location-acqui...,network;pervasive;applications;sequential patt...,trajectory pattern mining,29
278568,Load balancing middleware is used extensively ...,middleware;patterns;scalability;corba;load bal...,Issues in the design of adaptive middleware lo...,5
170061,The concept of Just-In-Time Minus (JIT(-)) cus...,e-commerce;customer service;information system...,An Intranet based information system supportin...,5


In [73]:
max(data['abs_keyword_count']), min(data['abs_keyword_count'])

(110, 1)

In [74]:
def clean(text):
    text = text.lower()
    printable = set(string.printable)
    text = filter(lambda x: x in printable, text)
    text = "".join(list(text))
    return text

def TextScoring(text):
  cleaned_text = clean(text)
  text = word_tokenize(cleaned_text)
  pos_tag = nltk.pos_tag(text)
  wordnet_lemmatizer = WordNetLemmatizer()
  adjective_tags = ['JJ','JJR','JJS']
  lemmatized_text = []
  for word in pos_tag:
      if word[1] in adjective_tags:
          lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a")))
      else:
          lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0])))
  pos_tag = nltk.pos_tag(lemmatized_text)
  stopwords = []
  wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS'] 
  for word in pos_tag:
      if word[1] not in wanted_POS:
          stopwords.append(word[0])
  punctuations = list(str(string.punctuation))
  stopwords = stopwords + punctuations
  processed_text = []
  for word in lemmatized_text:
      if word not in stopwords:
          processed_text.append(word)
  vocabulary = list(set(processed_text))
  vocab_len = len(vocabulary)

  weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)

  score = np.zeros((vocab_len),dtype=np.float32)
  window_size = 3
  covered_coocurrences = []

  for i in range(0,vocab_len):
      score[i]=1
      for j in range(0,vocab_len):
          if j==i:
              weighted_edge[i][j]=0
          else:
              for window_start in range(0,(len(processed_text)-window_size)):
                  
                  window_end = window_start+window_size
                  
                  window = processed_text[window_start:window_end]
                  
                  if (vocabulary[i] in window) and (vocabulary[j] in window):
                      
                      index_of_i = window_start + window.index(vocabulary[i])
                      index_of_j = window_start + window.index(vocabulary[j])
                      
                      # index_of_x is the absolute position of the xth term in the window 
                      # (counting from 0) 
                      # in the processed_text
                        
                      if [index_of_i,index_of_j] not in covered_coocurrences:
                          weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)
                          covered_coocurrences.append([index_of_i,index_of_j])

  inout = np.zeros((vocab_len),dtype=np.float32)

  for i in range(0,vocab_len):
      for j in range(0,vocab_len):
          inout[i]+=weighted_edge[i][j]

  MAX_ITERATIONS = 50
  d=0.85
  threshold = 0.0001 #convergence threshold

  for iter in range(0,MAX_ITERATIONS):
      prev_score = np.copy(score)
      
      for i in range(0,vocab_len):
          
          summation = 0
          for j in range(0,vocab_len):
              if weighted_edge[i][j] != 0:
                  summation += (weighted_edge[i][j]/inout[j])*score[j]
                  
          score[i] = (1-d) + d*(summation)
      
      if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition
          #print("Converging at iteration "+str(iter)+"....")
          break
  phrases = []

  phrase = " "
  for word in lemmatized_text:
      
      if word in stopwords:
          if phrase!= " ":
              phrases.append(str(phrase).strip().split())
          phrase = " "
      elif word not in stopwords:
          phrase+=str(word)
          phrase+=" "

  unique_phrases = []
  for phrase in phrases:
      if phrase not in unique_phrases:
          unique_phrases.append(phrase)

  for word in vocabulary:
      #print word
      for phrase in unique_phrases:
          if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):
              unique_phrases.remove([word])

  phrase_scores = []
  keywords = []
  for phrase in unique_phrases:
      phrase_score=0
      keyword = ''
      for word in phrase:
          keyword += str(word)
          keyword += " "
          phrase_score+=score[vocabulary.index(word)]
      phrase_scores.append(phrase_score)
      keywords.append(keyword.strip())

  res = {keywords[i]: phrase_scores[i] for i in range(len(keywords))}
  sorted_index = np.flip(np.argsort(phrase_scores),0)
  keywords_num = len(keywords)
  final_keywords = []
  for i in range(0,keywords_num):
    final_keywords.append(str(keywords[sorted_index[i]]))
  return final_keywords


In [75]:
data_5 = data[0:5]

In [76]:
data_5['TextScoring'] = data_5['abstract'].apply(lambda x: ",".join(TextScoring(x)))

In [77]:
data_5

Unnamed: 0,abstract,keyword,title,abs_keyword_count,TextScoring
0,This paper proposes using virtual reality to e...,telepresence;animation;avatars;application sha...,virtually enhancing the perception of user act...,5,"user action,recorded action,remote synchronous..."
1,This paper presents an improved architecture o...,sigma delta modulators;analog-to-digital conve...,Dynamic range improvement of multistage multib...,5,"leakage quantization noise problem,in-band qua..."
2,"In this paper, we discuss the motivation and t...",enterprise information integration and interop...,An ontology modelling perspective on business ...,5,"business reporting language structure,extensib..."
3,An overview of the self-organizing map algorit...,self-organizing map;learning vector quantization,The self-organizing map,2,"self-organizing map algorithm,paper,issue,over..."
4,The amygdala comprises part of an extended net...,social brain;amygdala;behavior;facial expression,The Amygdala and Development of the Social Brain,4,"social cognitive network,social behavior appro..."


In [78]:
data_1000 = data[0:1000]

In [79]:
%%time
data_1000['TextScoring'] = data_1000['abstract'].apply(lambda x: ",".join(TextScoring(x)))

CPU times: user 4min 2s, sys: 85.2 ms, total: 4min 2s
Wall time: 4min 3s


In [80]:
data_1000.TextScoring.sample(5)

253    labelled petri net,equivalence land preorders,...
876    such power aware system,power aware system,pow...
475    new delay-dependent stability criterion,uncert...
885    computer science privacy research,privacy solu...
86     adjustable regulated power output,input curren...
Name: TextScoring, dtype: object

In [None]:
%%time
#data['TextScoring'] = data['abstract'].apply(lambda x: ",".join(TextScoring(x)))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


In [81]:
data_1000.sample(5)

Unnamed: 0,abstract,keyword,title,abs_keyword_count,TextScoring
28,"In this paper, a new grid generation system is...",unstructured surface mesh generation;geometry ...,Parallel generation of unstructured surface grids,5,"new grid generation system,unstructured triang..."
188,Edit distance based string similarity join is ...,string joins;probabilistic strings;approximate...,probabilistic string similarity joins,3,"probabilistic data,relational database engine,..."
957,The mass transfer between immiscible two-liqui...,bubble;immiscible two-liquid interface;ripple;...,Micro droplets generated on a rising bubble th...,5,"numerous micro water droplet,fine water drople..."
451,This paper reports the initial results of a re...,artifact mediated collaboration;hci design pat...,user requirements for a web based spreadsheet-...,4,"cooperative interaction design,scenario-based ..."
780,Achieving ultra-large-scale software systems w...,service;software engineering;development proce...,a service driven development process (sddp) mo...,4,"proposed process model,special development pro..."


In [82]:
data_1000['keyword'] = data_1000['keyword'].str.replace(';',',')

In [83]:
lst = data_1000.keyword.apply(lambda x: x.strip(',').split(','))

In [84]:
data_1000['keyword'] = data_1000.keyword.apply(lambda x: x.strip(',').split(','))
data_1000['TextScoring'] = data_1000.TextScoring.apply(lambda x: x.strip(',').split(','))

In [85]:
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return ' '.join([lower(x) for x in s]).rstrip()

In [86]:
def remove_empty(a_list):
    new_list = []
    for i in a_list:
        if len(i) > 0:
            if len(i[0]) >0:
                new_list.append(normalize_answer(i))   
    return new_list

In [87]:
def dedup(kp_list):
    dedupset = set()
    kp_list_dedup = []
    for kp in kp_list:
        if kp in dedupset:
            continue       
        kp_list_dedup.append(kp)
        dedupset.add(kp)
    return kp_list_dedup

In [88]:
dedup(lst[0])

['telepresence',
 'animation',
 'avatars',
 'application sharing',
 'collaborative virtual environments']

In [89]:
def get_score_full(candidates, references, maxDepth = 30):
    precision = []
    recall = []
    reference_set = set(dedup(references))
    candidates = dedup(candidates)
    referencelen = len(reference_set)
    true_positive = 0
    for i in range(maxDepth):
        if len(candidates) > i:
            kp_pred = candidates[i]     
            if kp_pred in reference_set:
                true_positive += 1
        precision.append(true_positive/float(i + 1))
        recall.append(true_positive/float(referencelen))
    return precision, recall

In [90]:
def evaluate(candidates, references,data):
    precision_scores, recall_scores, f1_scores = {1:[], 3:[], 5:[], 10:[], 30:[]}, {1:[], 3:[], 5:[], 10:[], 30:[]}, {1:[], 3:[], 5:[], 10:[], 30:[]}
    for url in range(len(data)):
      candidate = remove_empty(candidates[url])
      reference = remove_empty(references[url])
    p, r = get_score_full(candidate, reference) 
    for i in [1,3,5,10,30]:
        precision = p[i-1]
        recall = r[i-1]
        if precision + recall > 0:
            f1_scores[i].append((2 * (precision * recall)) / (precision + recall))
        else:
            f1_scores[i].append(0)
        precision_scores[i].append(precision)
        recall_scores[i].append(recall)
    print("########################\nMetrics")
    for i in precision_scores:
        print("@{}".format(i))
        print("F1:{}".format(np.mean(f1_scores[i])))
        print("P:{}".format(np.mean(precision_scores[i])))
        print("R:{}".format(np.mean(recall_scores[i])))
    print("#########################")

In [91]:
evaluate(data_1000['keyword'],data_1000['TextScoring'],data_1000)

########################
Metrics
@1
F1:0.0
P:0.0
R:0.0
@3
F1:0.1111111111111111
P:0.6666666666666666
R:0.06060606060606061
@5
F1:0.15789473684210525
P:0.6
R:0.09090909090909091
@10
F1:0.13953488372093023
P:0.3
R:0.09090909090909091
@30
F1:0.09523809523809525
P:0.1
R:0.09090909090909091
#########################


In [92]:
evaluate(data_1000['keyword'],data_1000['TextScoring'],data_1000)

########################
Metrics
@1
F1:0.0
P:0.0
R:0.0
@3
F1:0.1111111111111111
P:0.6666666666666666
R:0.06060606060606061
@5
F1:0.15789473684210525
P:0.6
R:0.09090909090909091
@10
F1:0.13953488372093023
P:0.3
R:0.09090909090909091
@30
F1:0.09523809523809525
P:0.1
R:0.09090909090909091
#########################


In [97]:
!unzip '/content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/Project/kp20k_new.zip'

Archive:  /content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/Project/kp20k_new.zip
  inflating: kp20k_testing.json      
  inflating: kp20k_training.json     
  inflating: kp20k_validation.json   


In [98]:
import json 
test = []
for line in open('/content/kp20k_testing.json', 'r'):
    test.append(json.loads(line))

In [99]:
import json 
validation = []
for line in open('/content/kp20k_validation.json', 'r'):
    validation.append(json.loads(line))

In [100]:
test_data = pd.DataFrame(test)

In [101]:
val_data = pd.DataFrame(validation)

In [102]:
test_data

Unnamed: 0,abstract,keyword,title
0,A feedback vertex set of a graph G is a set S ...,feedback vertex set;decycling set;2-degenerate...,A feedback vertex set of 2-degenerate graphs
1,This article proposes techniques to predict th...,performance;analytical modeling;pending hit;da...,Hybrid Analytical Modeling of Pending Cache Hi...
2,Autoimmune polyendocrinopathy candidiasis ecto...,apeced;aire;chronic mucocutaneous candidiasis;...,Autoimmune polyendocrinopathy candidiasis ecto...
3,"In this paper, we consider an enthalpy formula...",casting;thermal;conduction;convection;finite e...,Numerical solution of a three-dimensional soli...
4,"In this research, a new type of manufacturing ...",feature recognition;rib;aircraft structural pa...,Definition and recognition of rib features in ...
...,...,...,...
19995,Energy efficiency and transmission delay are v...,energy efficiency;delay;unreliable links;wirel...,Energy-delay tradeoff in wireless multihop net...
19996,This paper describes the design and implementa...,e-medical records;e-health;e-clinic;web-based;...,A Cyber Medical Center
19997,This work describes a detailed simulation-base...,wireless lan;quality of service;medium access ...,adapting wlan mac parameters to enhance voip c...
19998,This paper describes a conceptually simple but...,interior point methods;ellipsoid method;multio...,An interior point multiobjective programming a...


In [103]:
test_data.isnull().sum()

abstract    0
keyword     0
title       0
dtype: int64

In [104]:
val_data

Unnamed: 0,abstract,keyword,title
0,We investigate the problem of delay constraine...,algorithms;design;performance;sensor networks;...,Real-Time Data Aggregation in Contention-Based...
1,This paper describes a method for detecting ev...,biomedical text;machine learning;information e...,word sense disambiguation for event trigger wo...
2,The lack of architecturally-significant mechan...,architectural styles;architectural aspects;poi...,composing architectural aspects based on style...
3,This paper describes our use of pen-based elec...,computer science;present;groupware;use;technol...,using pen-based computers across the computer ...
4,We show how to connect the syntactic and the f...,operational semantics;program transformation;r...,On the syntactic and functional correspondence...
...,...,...,...
19995,This paper presents a field-programmable gate ...,boolean satisfiability (sat);field-programmabl...,FPGA PLB architecture evaluation and area opti...
19996,A simple model of inductor-coupled bistable os...,bistable oscillators;inductor coupling;pulse p...,Pulse wave propagation in a large number of co...
19997,We present a novel technique for encoding and ...,constant weight codes;encoding algorithms;diss...,A Coding Algorithm for Constant Weight Vectors...
19998,We combine multilayer perceptrons and self-org...,bankruptcy prediction;financial crisis;multila...,Bankruptcy visualization and prediction using ...


In [105]:
%%time
#test_data['TextScoring'] = test_data['abstract'].apply(lambda x: ",".join(TextScoring(x)))

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.11 µs


In [None]:
#test_data['keyword'] = test_data['keyword'].str.replace(';',',')

In [None]:
#test_data['keyword'] = test_data.keyword.apply(lambda x: x.strip(';').split(','))
#test_data['TextScoring'] = test_data.TextScoring.apply(lambda x: x.strip(',').split(','))

In [None]:
#evaluate(test_data['keyword'],test_data['TextScoring'],test_data)

########################
Metrics
@1
F1:0.0
P:0.0
R:0.0
@3
F1:0.0
P:0.0
R:0.0
@5
F1:0.0
P:0.0
R:0.0
@10
F1:0.0
P:0.0
R:0.0
@30
F1:0.0
P:0.0
R:0.0
#########################


In [106]:
!unzip '/content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/Project/all_title_abstract_keyword_clean.json.zip'

Archive:  /content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/Project/all_title_abstract_keyword_clean.json.zip
  inflating: all_title_abstract_keyword_clean.json  


In [107]:
import json 
all_data = []
for line in open('/content/all_title_abstract_keyword_clean.json', 'r'):
    all_data.append(json.loads(line))

In [108]:
all_data = all_data[0]

In [109]:
import pandas as pd

In [110]:
all_data1 = pd.DataFrame(all_data)

In [111]:
all_data1.shape

(578015, 3)

In [112]:
all_data11 = all_data1.sample(20000)

In [None]:
%%time
all_data11['TextScoring'] = all_data11['abstract'].apply(lambda x: ",".join(TextScoring(x)))

CPU times: user 1h 16min 15s, sys: 1min 8s, total: 1h 17min 24s
Wall time: 1h 17min 32s


In [None]:
all_data11['keyword'] = all_data11['keyword'].str.replace(';',',')

In [None]:
all_data11['keyword'] = all_data11.keyword.apply(lambda x: x.strip(';').split(','))
all_data11['TextScoring'] = all_data11.TextScoring.apply(lambda x: x.strip(',').split(','))

In [None]:
all_data11

Unnamed: 0,abstract,keyword,title,TextScoring
213624,Purpose - The purpose of this paper is to stud...,"[induction, electromagnetic brake, electromagn...",New DC electromagnetic wiping system for hot-d...,"[liquid zinc layer thanks, liquid zinc thanks,..."
431544,The intermolecular potential between a 18-crow...,"[ab initio fitted potential, simulation, solva...",The hydration structure of 18-crown-6/K+ compl...,"[water molecule, monte carlo simulation method..."
457042,The article formulates a dynamic mathematical ...,"[production, consumption, multilateral exchang...",The dynamics of multilateral exchange,"[many player produce, many good, third player,..."
537182,We model and solve the problems of preemptive ...,"[scheduling, parallel machines, uniform machin...",A minimum-cost network flow approach to preemp...,"[minimum-cost network flow problem, correspond..."
233154,"In deep submicron designs, the interconnect wi...","[static timing analysis, gate delay, cmos inve...",A novel model for computing the effective capa...,"[nonlinear gate output, effective capacitance ..."
...,...,...,...,...
406737,This paper describes an exploratory study to i...,"[student-student interaction, student experien...",exploring factors that influence computer scie...,"[student factor best predict intention, resear..."
355931,This study discusses preservice teachers' achi...,"[pedagogy and technology, preservice teacher e...",Understanding preservice teachers' technology ...,"[technological pedagogical content knowledge, ..."
551380,"In this paper, signal processing techniques wh...","[signal processing, automatic speech recogniti...",Signal processing techniques for robust speech...,"[suitable signal processing technique, signal ..."
431264,Designing low end-to-end latency system archit...,"[virtual reality, image-based rendering, latency]",A shared-scene-graph image-warping architectur...,"[low end-to-end latency system architecture, i..."


In [None]:
all_data11.isnull().sum()

abstract       0
keyword        0
title          0
TextScoring    0
dtype: int64

In [None]:
def evaluate(candidates, references, data):
    precision_scores, recall_scores, f1_scores = {1:[], 3:[], 5:[], 10:[], 30:[]}, {1:[], 3:[], 5:[], 10:[], 30:[]}, {1:[], 3:[], 5:[], 10:[], 30:[]}
    for url in range(len(data)):
        candidate = remove_empty(candidates.iloc[url])
        reference = remove_empty(references.iloc[url])
        p, r = get_score_full(candidate, reference) 
        for i in [1,3,5,10,30]:
            precision = p[i-1]
            recall = r[i-1]
            if precision + recall > 0:
                f1_scores[i].append((2 * (precision * recall)) / (precision + recall))
            else:
                f1_scores[i].append(0)
            precision_scores[i].append(precision)
            recall_scores[i].append(recall)
    print("########################\nMetrics")
    for i in precision_scores:
        print("@{}".format(i))
        print("F1:{}".format(np.mean(f1_scores[i])))
        print("P:{}".format(np.mean(precision_scores[i])))
        print("R:{}".format(np.mean(recall_scores[i])))
    print("#########################")

In [None]:
evaluate(all_data11['TextScoring'],all_data11['keyword'],all_data11)

########################
Metrics
@1
F1:0.012993202211297291
P:0.0357
R:0.008161493688695167
@3
F1:0.03757045856962576
P:0.0482
R:0.03253647716411565
@5
F1:0.05067614304862105
P:0.04989
R:0.0552404362214484
@10
F1:0.06080353974223194
P:0.04614
R:0.09869307090647352
@30
F1:0.04601481791677741
P:0.027956666666666668
R:0.16356677237698772
#########################


In [None]:
data_1000 = all_data1.sample(2000)

In [None]:
%%time
data_1000['TextScoring'] = data_1000['abstract'].apply(lambda x: ",".join(TextScoring(x)))

CPU times: user 7min 37s, sys: 282 ms, total: 7min 37s
Wall time: 7min 39s


In [None]:
data_1000['keyword'] = data_1000['keyword'].str.replace(';',',')

In [None]:
data_1000['keyword'] = data_1000.keyword.apply(lambda x: x.strip(';').split(','))
data_1000['TextScoring'] = data_1000.TextScoring.apply(lambda x: x.strip(',').split(','))

In [None]:
evaluate(data_1000['TextScoring'],data_1000['keyword'],data_1000)

########################
Metrics
@1
F1:0.011723412698412698
P:0.031
R:0.007378787878787879
@3
F1:0.040255180974939034
P:0.049999999999999996
R:0.035182545232545236
@5
F1:0.05223056415548676
P:0.0506
R:0.057313060238718135
@10
F1:0.058552584509061714
P:0.04395
R:0.09593110681171035
@30
F1:0.044498287357803576
P:0.0269
R:0.1597765103678985
#########################


In [None]:
########################
Metrics
@1
F1:0.059900282565187916
P:0.45
R:0.03244000175888385
@3
F1:0.12487926811722572
P:0.37
R:0.07766035767202357
@5
F1:0.16587912407745475
P:0.32600000000000007
R:0.11712560332740846
@10
F1:0.2282296667787783
P:0.28900000000000003
R:0.20330807026384912
@30
F1:0.25265837540621766
P:0.1975
R:0.3895969630102131
#########################

In [None]:
import json 
all_data = [] 
for line in open('/content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/Project/KPTimes.test.jsonl', 'r'):
    all_data.append(json.loads(line))

FileNotFoundError: ignored

In [None]:
data_df = pd.DataFrame(all_data)

NameError: ignored

In [None]:
data_df

NameError: ignored

## TfIdf

In [None]:
import pandas as pd
import re
import math

from orderedset import OrderedSet

'''
keywords extraction from a document using TF-IDF from scratch (completely from scratch). 
'''

def clean(text):

	text = text.lower() # convert all words to lower case, 
	
	text = re.sub(r'\s+', ' ', text) # replace or substitute all spaces, tabs, indents (\s) with ' ' (space) 

	text = re.sub(r'\d', ' ', text) # replace all digits by ' '

	text = re.sub(r'[^a-zA-Z. ]+', '', text) # replace all non words (\W) with ' '. (note: small w is for all words. capital W is for all non-words)
	
	#print(text)

	return text

# list of all words
def get_sentence_of_words(text):

	sentence = list() # list of sentences
	words = list() # list of words in each sentence.
  sentence_list = list()
  temp = text.strip().split(". ") # temporary list of sentences. 
  for sent in temp:

		words = sent.strip().split(" ") # getting the words in sentences.

		words = [i for i in words if len(i) > 1]

		if(len(words) > 1): 
			sentence.append(words) # sentence is a list of lists. (contains a list of sentences in which each sentence is a list of words)

		sentence_list.append(sent)	

	#print(sentence, len(sentence))	
	return sentence, sentence_list


def vectorize(sentence):

	# set of unique words in the whole document.
	unique_words = OrderedSet() 

	for sent in sentence:
		for word in sent:
			
			unique_words.add(word)

	unique_words = list(unique_words) # converting the set to a list to make it easier to work with it. 

	#print(unique_words, len(unique_words))

	# a list of lists that contains the vectorized form of each sentence in the document. 
	vector = list()


	# in the vectorized representation, we consider the bag of words (unique words in the text).
	# then, we count the occurenc of each word in a sentence and represent it in a vector whose length = length(unique_words)
	# ex: sent1 = "i am a boy"
	#     sent2 = "i am a girl"
	# unique_words = ["i", "am", "a", "boy", "girl"]
	# vector representation of sent1 = [1, 1, 1, 1, 0]
	# vector representation of sent2 = [1, 1, 1, 0, 1]

	for sent in sentence: # iterate for every sentence in the document
		temp_vector = [0] * len(unique_words) # create a temporary vector to calculate the occurence of each word in that sentence. 
		
		for word in sent: # iterate for every word in the sentence. 

			temp_vector[unique_words.index(word)] += 1	

		vector.append(temp_vector) # add the temporary vector to the list of vectors for each sentence (list of lists)

	#print(vector)	


	return vector, unique_words	

# function to calculate the tf scores
def tf(vector, sentence, unique_words):

	tf = list()

	no_of_unique_words = len(unique_words) 

	for i in range(len(sentence)):

		tflist = list()
		sent = sentence[i]
		count = vector[i]

		for word in sent:
			'''
			if(count[sent.index(word)] == 0):
				count[sent.index(word)] = 1
			'''
			score = count[sent.index(word)]/ float(len(sent)) # tf = no. of occurence of a word/ total no. of words in the sentence. 

			if(score == 0):
				score = 1/ float(len(sentence))

			tflist.append(score)  

		tf.append(tflist)

	# print(tf)	
	
	return tf	


#function to calculate idf. 
def idf(vector, sentence, unique_words):

	# idf = log(no. of sentences / no. of sentences in which the word appears).

	no_of_sentences = len(sentence)

	idf = list()

	for sent in sentence:
		
		idflist = list()

		for word in sent:

			count = 0 # no. of times the word occurs in the entire text.

			for k in sentence:
				if(word in k):
					count += 1
		

			score = math.log(no_of_sentences/float(count)) # caclulating idf scores

			idflist.append(score)

		idf.append(idflist)	

	# print(idf)	

	return idf


# function to calculate the tf-idf scores.
def tf_idf(tf, idf):

	# tf-idf = tf(w) * idf(w)

	tfidf = [[0 for j in range(len(tf[i]))] for i in range(len(tf))]

	for i in range(len(tf)):
		for j in range(len(tf[i])):

			tfidf[i][j] = tf[i][j] * float(idf[i][j])

	# print(tfidf)		

	return tfidf	


def extract_keywords(tfidf, processed_text):
	
	mapping = {}

	for i in range(len(tfidf)):
		for j in range(len(tfidf[i])):

			mapping[processed_text[i][j]] = tfidf[i][j]

	#print(mapping)

	word_scores = sorted(mapping.values(), reverse = True)
	words = []

	scores_to_word = {}

	for i in range(len(tfidf)):
		for j in range(len(tfidf[i])):

			scores_to_word[tfidf[i][j]] = processed_text[i][j]

	for i in range(len(word_scores)):
		if(word_scores[i] != 0):
			words.append(scores_to_word[word_scores[i]])
		else:
			words.append(scores_to_word[word_scores[i]])
			break

	# print(words)	

	words = OrderedSet(words)

	for i in mapping:
		if(mapping[i] == 0):
			words.append(i)		
	
	return words, mapping


def save_keywords(words, mapping):

	scores = []

	for word in words:
		scores.append(mapping[word])

	# print(words, scores)

	d = {'Words': words, 'Scores': scores}

	data = pd.DataFrame(d)

	data.to_csv('keywords.csv', sep = '\t')

	print(data)


In [None]:
data_1000_1 = data_1000.copy()

In [None]:
data_1000_1.abstract = data_1000_1.abstract.apply(lambda x: clean(x))

In [None]:
data_1000_1['processed_text'], data_1000_1['sentence_list'] = zip(*data_1000_1.abstract.apply(lambda x : get_sentence_of_words(x)))

In [None]:
lst = list(data_1000_1.sentence_list)

In [None]:
sentence_to_index = defaultdict(list)

for k, i in enumerate(lst):
  sentence_to_index[i].append(k)

TypeError: ignored

In [None]:
from collections import defaultdict

sentence_to_index = defaultdict(list)
for k, i in enumerate(list(data_1000_1.sentence_list)):
  sentence_to_index[i].append(k)

TypeError: ignored

In [None]:
data_1000_1.sentence_list

In [None]:
sentence_to_index = {i:k for k, i in enumerate(list(data_1000_1.sentence_list))}

# vector, unique_words = vectorize(processed_text)

# tf = tf(vector, processed_text, unique_words)

# idf = idf(vector, processed_text, unique_words)

# tfidf = tf_idf(tf, idf)	

# keywords, mapping = extract_keywords(tfidf, processed_text)

# #save_keywords(keywords, mapping)

TypeError: ignored

In [None]:
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text


data_1000_1['abstract'] = data_1000_1['abstract'].apply(lambda x:pre_process(x))

#show the first 'text'
data_1000_1['abstract'][2]


'in this paper we discuss the motivation and the fundamentals of an ontology representation of business reporting data and metadata structures as defined in the extensible business reporting language xbrl standard the core motivation for an ontology representation is the enhanced potential for integrated analytic applications that build on quantitative reporting data combined with structured and unstructured data from additional sources applications of this kind will enable significant enhancements in regulatory compliance management as they enable business analytics combined with inference engines for statistical but also for logical inferences in order to define a suitable ontology representation of business reporting language structures an analysis of the logical principles of the reporting metadata taxonomies and further classification systems is presented based on this analysis a representation of the generally accepted accounting principles taxonomies in xbrl by an ontology provi

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
    """load stop words """
    
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words("/content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/Project/sklearn_stopwords.txt")

#get the text column 
docs=data_1000_1['abstract'].tolist()

#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate stop words
cv=CountVectorizer(max_df=0.85,stop_words=stopwords)
word_count_vector=cv.fit_transform(docs)

In [None]:
word_count_vector.shape

(1000, 12614)

In [None]:
cv=CountVectorizer(max_df=0.85,stop_words=stopwords,max_features=10000)
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape

(1000, 10000)

In [None]:
list(cv.vocabulary_.keys())[:10]

['paper',
 'proposes',
 'using',
 'virtual',
 'reality',
 'enhance',
 'perception',
 'actions',
 'distant',
 'users']

In [None]:
list(cv.get_feature_names())[2000:2015]

['dataspace',
 'datatype',
 'datavalue',
 'date',
 'dates',
 'daunorubicin',
 'davidov',
 'day',
 'dayneka',
 'days',
 'db',
 'dbc',
 'dbms',
 'dbn',
 'dbpedia']

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [None]:
tfidf_transformer.idf_

array([7.2156076, 7.2156076, 7.2156076, ..., 7.2156076, 7.2156076,
       7.2156076])

In [None]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [None]:
# you only needs to do this once
feature_names=cv.get_feature_names()

# get the document that we want to extract keywords from
doc=docs[0]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)

# now print the results
for k in keywords:
    print(k,keywords[k])

actions 0.468
playback 0.275
recorded 0.268
collaboration 0.257
remote 0.244
shared 0.228
application 0.226
virtual 0.209
user 0.169
space 0.167


In [None]:
# put the common code into several methods
def get_keywords(idx):

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return keywords

def print_results(idx,keywords):
    # now print the results
    print("\n===Keywords===")
    for k in keywords:
        print(k,keywords[k])

In [None]:
idx=120
keywords=get_keywords(idx)
print_results(idx,keywords)


===Keywords===
mutation 0.628
analysis 0.284
current 0.198
highlighted 0.179
reviewed 0.166
suffer 0.157
outlined 0.157
automation 0.154
unfortunately 0.15
severe 0.148


In [None]:

#generate tf-idf for all documents in your list. docs_test has 500 documents
tf_idf_vector=tfidf_transformer.transform(cv.transform(docs))

results=[]
for i in range(tf_idf_vector.shape[0]):
    
    # get vector for a single document
    curr_vector=tf_idf_vector[i]
    
    #sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(curr_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,100)
    
    
    results.append(keywords)

df=pd.DataFrame(zip(docs,results),columns=['doc','keywords'])
df

Unnamed: 0,doc,keywords
0,this paper proposes using virtual reality to e...,"{'actions': 0.468, 'playback': 0.275, 'recorde..."
1,this paper presents an improved architecture o...,"{'modulator': 0.383, 'oscillation': 0.339, 'hq..."
2,in this paper we discuss the motivation and th...,"{'ontology': 0.441, 'reporting': 0.428, 'busin..."
3,an overview of the selforganizing map algorith...,"{'selforganizing': 0.496, 'papers': 0.425, 'ov..."
4,the amygdala comprises part of an extended net...,"{'social': 0.44, 'disruption': 0.224, 'amygdal..."
...,...,...
995,because of its simplicity and intuitive approa...,"{'rendering': 0.467, 'visibility': 0.456, 'poi..."
996,this study observed the expression of transfor...,"{'pheo': 0.566, 'tnf': 0.471, 'tgf': 0.377, 'p..."
997,threedimensional simulation models are hard to...,"{'simulation': 0.314, 'interactive': 0.232, 'v..."
998,nearest neighbor nn search in highdimensional ...,"{'query': 0.234, 'space': 0.23, 'lsh': 0.223, ..."


In [None]:
data_1000_1

Unnamed: 0,abstract,keyword,title,abs_keyword_count,TextScoring,processed_text,sentence_list
0,this paper proposes using virtual reality to e...,"[telepresence, animation, avatars, application...",virtually enhancing the perception of user act...,5,"[user action, recorded action, remote synchron...","[[this, paper, proposes, using, virtual, reali...",[this paper proposes using virtual reality to ...
1,this paper presents an improved architecture o...,"[sigma delta modulators, analog-to-digital con...",Dynamic range improvement of multistage multib...,5,"[leakage quantization noise problem, in-band q...","[[this, paper, presents, an, improved, archite...",[this paper presents an improved architecture ...
2,in this paper we discuss the motivation and th...,[enterprise information integration and intero...,An ontology modelling perspective on business ...,5,"[business reporting language structure, extens...","[[in, this, paper, we, discuss, the, motivatio...",[in this paper we discuss the motivation and t...
3,an overview of the selforganizing map algorith...,"[self-organizing map, learning vector quantiza...",The self-organizing map,2,"[self-organizing map algorithm, paper, overvie...","[[an, overview, of, the, selforganizing, map, ...",[an overview of the selforganizing map algorit...
4,the amygdala comprises part of an extended net...,"[social brain, amygdala, behavior, facial expr...",The Amygdala and Development of the Social Brain,4,"[social cognitive network, social behavior app...","[[the, amygdala, comprises, part, of, an, exte...",[the amygdala comprises part of an extended ne...
...,...,...,...,...,...,...,...
995,because of its simplicity and intuitive approa...,"[display algorithms, point-based rendering, de...",Real-time point-based rendering using visibili...,4,"[extended point-based rendering method, distan...","[[because, of, its, simplicity, and, intuitive...",[because of its simplicity and intuitive appro...
996,this study observed the expression of transfor...,"[pheochromocytoma, tgf-?, tnf-?, proliferation...",Expression and Effect of Transforming Growth F...,6,"[primary human pheo cell, human pheo cell, phe...","[[this, study, observed, the, expression, of, ...",[this study observed the expression of transfo...
997,threedimensional simulation models are hard to...,"[visualisation, ising model, cuda, gpu, instru...",Interactive visualisation of spins and cluster...,5,"[computational simulation model, other visual ...","[[threedimensional, simulation, models, are, h...",[threedimensional simulation models are hard t...
998,nearest neighbor nn search in highdimensional ...,"[theory, algorithms, experimentation, locality...",Efficient and Accurate Nearest Neighbor and Cl...,6,"[high-dimensional nn search, strong quality gu...","[[nearest, neighbor, nn, search, in, highdimen...",[nearest neighbor nn search in highdimensional...


In [None]:
df['keys'] = ''

In [None]:
keys = []
for i in range(len(df)):
  keys1 = []
  keys1.append(df['keywords'].iloc[i].keys())
  keys.append(keys1)

In [None]:
tfidf = pd.DataFrame(keys)

In [None]:
tfidf.rename(columns={0: "tfidf"},inplace=True)

In [None]:
tfidf

Unnamed: 0,tfidf
0,"(actions, playback, recorded, collaboration, r..."
1,"(modulator, oscillation, hqcrffbased, quantiza..."
2,"(ontology, reporting, business, representation..."
3,"(selforganizing, papers, overview, map, issue,..."
4,"(social, disruption, amygdala, cortex, stimuli..."
...,...
995,"(rendering, visibility, pointbased, depthbased..."
996,"(pheo, tnf, tgf, proliferation, apoptosis, tis..."
997,"(simulation, interactive, visual, models, syst..."
998,"(query, space, lsh, lsbtree, guarantees, magni..."


In [None]:
evaluate(tfidf['tfidf'],data_1000_1['keyword'],data_1000_1)

########################
Metrics
@1
F1:0.33333333333333337
P:1.0
R:0.2
@3
F1:0.25
P:0.3333333333333333
R:0.2
@5
F1:0.20000000000000004
P:0.2
R:0.2
@10
F1:0.13333333333333333
P:0.1
R:0.2
@30
F1:0.05714285714285715
P:0.03333333333333333
R:0.2
#########################
