In [None]:
!pip install sumy --quiet
!pip install nltk --quiet

import re
import json
import numpy as np
import pandas as pd

import nltk
from sumy.nlp.tokenizers import Tokenizer                                       # Tokenizer
from sumy.parsers.plaintext import PlaintextParser                              # Parser

from sumy.summarizers.luhn import LuhnSummarizer as Luhn                        # Luhn
from sumy.summarizers.lsa import LsaSummarizer as LSA                           # LSA Summarizer
from sumy.summarizers.lex_rank import LexRankSummarizer as LexRank              # Lex-Rank
from sumy.summarizers.text_rank import TextRankSummarizer as TextRank           # Text-Rank
from sumy.summarizers.sum_basic import SumBasicSummarizer as SumBasic           # Sum-Basic
from sumy.summarizers.kl import KLSummarizer as KLSum                           # KL-Sum

from google.colab import drive

nltk.download('punkt')

[K     |████████████████████████████████| 97 kB 5.4 MB/s 
[K     |████████████████████████████████| 10.1 MB 58.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Building wheel for pycountry (PEP 517) ... [?25l[?25hdone


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def _join_summary(summarizer_cls, document, sentence_count, separator):
  '''
  Run one summarizer class over `document` and return the chosen sentences
  joined into a single string with `separator` between them.
  '''
  # A fresh summarizer instance is built for every document, exactly as the
  # original per-algorithm stanzas did.
  selected = summarizer_cls()(document, sentence_count)
  return separator.join(str(sentence) for sentence in selected)


def get_summaries(dataset, sentence_count, separator=' '):
  '''
  Input  : dataset        - object where dataset['Abstract'] yields the abstract texts
           sentence_count - number of sentences requested from each summarizer
           separator      - string placed between the sentences of one summary and
                            between the individual summaries in the combined text.
                            Pass '' to reproduce the earlier behaviour, where
                            sentences were glued together with nothing in between.
  Process: Extracts Abstracts, Generates Five Summaries per Abstract
           (Luhn, LexRank, TextRank, SumBasic, KL-Sum) and Appends them.
           A missing (NaN/None) or blank abstract yields empty-string summaries,
           so every returned list keeps one entry per dataset row.
  Returns: Combined Summaries and All Individual Summaries as Lists, in the order
           (combined, luhn, lexrank, textrank, sumbasic, kl_sum)
  '''
  # The order of this tuple fixes both the return order and the order in which
  # the individual summaries appear inside the combined summary.
  summarizer_classes = (Luhn, LexRank, TextRank, SumBasic, KLSum)

  # Placeholders: one list for the combined text, one list per algorithm
  combined_summaries = []
  per_algorithm = [[] for _ in summarizer_classes]

  for count, data in enumerate(dataset['Abstract'], start=1):

    # Empty spreadsheet cells come back from pandas as NaN/None rather than str;
    # handing those to the parser would fail, so normalise to text first.
    if isinstance(data, str):
      text = data
    elif pd.isna(data):
      text = ''
    else:
      text = str(data)

    if text.strip():
      # Parse and Tokenize
      parser = PlaintextParser.from_string(text, Tokenizer("english"))
      summaries = [_join_summary(summarizer_cls, parser.document, sentence_count, separator)
                   for summarizer_cls in summarizer_classes]
    else:
      # Nothing to summarize: keep the row, with empty summaries.
      summaries = ['' for _ in summarizer_classes]

    for bucket, summary in zip(per_algorithm, summaries):
      bucket.append(summary)

    # Concatenation of Summaries (empty ones are skipped so that no stray
    # separators end up in the combined text)
    combined_summaries.append(separator.join(summary for summary in summaries if summary))

    # Progress
    if count % 1000 == 0:
      print('Summarization Complete for Abstract ID = {}'.format(count))

  luhn_summaries, lexrank_summaries, textrank_summaries, sumbasic_summaries, kl_sum_summaries = per_algorithm
  return combined_summaries, luhn_summaries, lexrank_summaries, textrank_summaries, sumbasic_summaries, kl_sum_summaries




# Data Preparation into Pandas Dataframe for Final Summarization Model Input and Other Tasks
def get_data(dataset, combined_summaries, luhn_summaries, lexrank_summaries, textrank_summaries, sumbasic_summaries, kl_sum_summaries, get_excel = True, excel_path = "Summary_Dataset_Complete.xlsx"):
  '''
  Generate Dataframe with Title and All Summaries and return final DF and an Excel File

  Input  : dataset      - object where dataset['Title'] yields the titles
           *_summaries  - one list per summary type, each with one entry per title
           get_excel    - when truthy, the dataframe is also written to `excel_path`
           excel_path   - destination of the Excel export (defaults to the file name
                          that was previously hard-coded)
  Returns: DataFrame with the six summary columns followed by 'Title'
  '''
  # Insertion order of this dict is the column order of the result.
  raw_dataframe = {'Combined Abstract'   : combined_summaries,
                   'Luhn Summaries'      : luhn_summaries,
                   'LexRank Summaries'   : lexrank_summaries,
                   'TextRank Summaries'  : textrank_summaries,
                   'SumBasic Summaries'  : sumbasic_summaries,
                   'KL Summaries'        : kl_sum_summaries,
                   'Title'               : list(dataset['Title'])}
  # Column order is taken from the dict keys instead of a second, hand-copied list.
  df = pd.DataFrame(raw_dataframe, columns = list(raw_dataframe))

  if get_excel:
    df.to_excel(excel_path)
    print('Excel File Created and Saved in Local Storage.')

  return df

In [None]:
# --- Run configuration ---
# Sentences requested from every summarizer; the row cap below is currently disabled.
#num_examples   = 10000
sentence_count = 2

# --- Load the dataset from Google Drive ---
drive.mount('/content/drive')
file = '/content/drive/MyDrive/Title Generation NLP/Dataset/Summary_Dataset_Complete.xlsx'
# Read the three columns under fixed names, then discard the domain label,
# leaving only 'Abstract' and 'Title'.
df = pd.read_excel(
    file,
    names=['Abstract', 'Domain_Label', 'Title'],
).drop(columns=['Domain_Label'])
#df = df[:num_examples]

# --- Summarize every abstract ---
(combines,
 luhns,
 lexranks,
 textranks,
 sumbasics,
 kl_sums) = get_summaries(df, sentence_count)

# --- Assemble the final frame (the Excel export lands in Colab's local storage) ---
final_data_frame = get_data(
    df,
    combines,
    luhns,
    lexranks,
    textranks,
    sumbasics,
    kl_sums,
    get_excel=True,
)
final_data_frame

Mounted at /content/drive
Summarization Complete for Abstract ID = 1000
Summarization Complete for Abstract ID = 2000
Summarization Complete for Abstract ID = 3000
Summarization Complete for Abstract ID = 4000
Summarization Complete for Abstract ID = 5000
Summarization Complete for Abstract ID = 6000
Summarization Complete for Abstract ID = 7000
Summarization Complete for Abstract ID = 8000
Summarization Complete for Abstract ID = 9000
Summarization Complete for Abstract ID = 10000
Summarization Complete for Abstract ID = 11000
Summarization Complete for Abstract ID = 12000
Summarization Complete for Abstract ID = 13000
Summarization Complete for Abstract ID = 14000
Summarization Complete for Abstract ID = 15000
Summarization Complete for Abstract ID = 16000
Summarization Complete for Abstract ID = 17000
Summarization Complete for Abstract ID = 18000
Summarization Complete for Abstract ID = 19000
Summarization Complete for Abstract ID = 20000
Summarization Complete for Abstract ID = 21

Unnamed: 0,Combined Abstract,Luhn Summaries,LexRank Summaries,TextRank Summaries,SumBasic Summaries,KL Summaries,Title
0,We propose an architecture for VQA which utili...,We propose an architecture for VQA which utili...,We propose an architecture for VQA which utili...,We propose an architecture for VQA which utili...,We propose an architecture for VQA which utili...,We propose an architecture for VQA which utili...,Dual Recurrent Attention Units for Visual Que...
1,In this work we present a model based on recur...,In this work we present a model based on recur...,In this work we present a model based on recur...,In this work we present a model based on recur...,In this work we present a model based on recur...,In this work we present a model based on recur...,Sequential Short Text Classification with Rec...
2,We introduce the multiresolution recurrent neu...,We introduce the multiresolution recurrent neu...,We introduce the multiresolution recurrent neu...,We introduce the multiresolution recurrent neu...,We introduce the multiresolution recurrent neu...,We introduce the multiresolution recurrent neu...,Multiresolution Recurrent Neural Networks An ...
3,In Natural Language Processing NLP it is hard ...,In Natural Language Processing NLP it is hard ...,In Natural Language Processing NLP it is hard ...,In Natural Language Processing NLP it is hard ...,In Natural Language Processing NLP it is hard ...,In Natural Language Processing NLP it is hard ...,Learning what to share between loosely relate...
4,The system consists of an ensemble of natural ...,The system consists of an ensemble of natural ...,The system consists of an ensemble of natural ...,The system consists of an ensemble of natural ...,The system consists of an ensemble of natural ...,The system consists of an ensemble of natural ...,A Deep Reinforcement Learning Chatbot
...,...,...,...,...,...,...,...
40995,We study the complexity of learning and approx...,We study the complexity of learning and approx...,We study the complexity of learning and approx...,We study the complexity of learning and approx...,We study the complexity of learning and approx...,We study the complexity of learning and approx...,Nearly Tight Bounds on ell Approximation of S...
40996,We consider the problem of multiple users targ...,We consider the problem of multiple users targ...,We consider the problem of multiple users targ...,We consider the problem of multiple users targ...,We consider the problem of multiple users targ...,We consider the problem of multiple users targ...,Concurrent bandits and cognitive radio networks
40997,In this paper we compare and analyze clusterin...,In this paper we compare and analyze clusterin...,In this paper we compare and analyze clusterin...,In this paper we compare and analyze clusterin...,In this paper we compare and analyze clusterin...,In this paper we compare and analyze clusterin...,A Comparison of Clustering and Missing Data M...
40998,When using CAD there is often a choice for the...,When using CAD there is often a choice for the...,When using CAD there is often a choice for the...,When using CAD there is often a choice for the...,When using CAD there is often a choice for the...,When using CAD there is often a choice for the...,Applying machine learning to the problem of c...
