In [None]:
#import modules
import os.path
import nltk
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import pandas as pd

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
class Summarization():
  def preprocess_data(doc_set):
    """Turn raw documents into lists of stemmed, stop-word-free tokens.

    Input  : doc_set - iterable of document strings
    Purpose: lower-case each document, split it into runs of word
             characters, drop English stop words, then Porter-stem the rest
    Output : list holding one list of stemmed tokens per input document
    """
    # Build the helpers once; they are reused for every document.
    word_splitter = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _clean(document):
        # Stop words are filtered before stemming, so the comparison is made
        # against the unstemmed, lower-cased token.
        words = word_splitter.tokenize(document.lower())
        return [stemmer.stem(word) for word in words if word not in stop_words]

    return [_clean(document) for document in doc_set]


  def prepare_corpus(doc_clean):
    """Build the term dictionary and bag-of-words corpus for the documents.

    Input  : doc_clean - list of tokenized documents (list of token lists)
    Purpose: create a gensim Dictionary from the tokens and convert every
             document to its bag-of-words form via ``doc2bow``
    Output : (dictionary, document-term matrix) tuple
    """
    term_dictionary = corpora.Dictionary(doc_clean)
    bag_of_words = []
    for tokens in doc_clean:
        bag_of_words.append(term_dictionary.doc2bow(tokens))
    return term_dictionary, bag_of_words


  def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """Train an LSA (LSI) model on the cleaned documents and print its topics.

    Input  : doc_clean        - list of tokenized documents
             number_of_topics - number of topics to train and to print
             words            - number of words shown for each printed topic
    Purpose: build the dictionary/corpus with ``prepare_corpus``, fit a gensim
             LsiModel on it and print the resulting topics
    Output : the trained LsiModel
    """
    term_dictionary, bow_corpus = Summarization.prepare_corpus(doc_clean)
    lsa = LsiModel(
        bow_corpus,
        num_topics=number_of_topics,
        id2word=term_dictionary,
    )
    topic_summary = lsa.print_topics(num_topics=number_of_topics, num_words=words)
    print(topic_summary)
    return lsa


  def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """Train one LSA model per candidate topic count and score its coherence.

    Input   : dictionary      - gensim dictionary (id -> word mapping)
              doc_term_matrix - gensim bag-of-words corpus
              doc_clean       - list of tokenized input texts
              stop            - upper bound (exclusive) on the number of topics
              start           - first number of topics to try
              step            - increment between tried topic counts
    Purpose : compute c_v coherence for every topic count in
              range(start, stop, step)
    Output  : (model_list, coherence_values) - the trained LSA models and the
              coherence value of each, in the same order as the topic counts;
              both lists are empty when the range is empty
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # Use the loop variable here: the previous code referenced an
        # undefined ``number_of_topics``, so it either raised NameError or
        # trained every model with the same (global) topic count.
        model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(
            model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values


  def plot_graph(doc_clean,start, stop, step):
    """Plot c_v coherence against the number of LSA topics.

    Input  : doc_clean - list of tokenized documents
             start     - first number of topics to evaluate
             stop      - upper bound (exclusive) on the number of topics
             step      - increment between evaluated topic counts
    Purpose: build the corpus, compute a coherence value for every topic
             count in range(start, stop, step) and show them as a line plot
    Output : None (displays a matplotlib figure)
    """
    dictionary, doc_term_matrix = Summarization.prepare_corpus(doc_clean)
    _, coherence_values = Summarization.compute_coherence_values(
        dictionary, doc_term_matrix, doc_clean, stop, start, step
    )
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    # The trailing comma matters: ("coherence_values") is just a string, which
    # legend() would iterate character by character and label the line "c".
    plt.legend(("coherence_values",), loc='best')
    plt.show()

 
  def takenext(elem):
    """Sort-key helper: return the item at index 1 of ``elem``."""
    second = elem[1]
    return second


  def selectTopSent(summSize, numTopics, sortedVec):
    """Pick up to ``summSize`` distinct sentence ids, round-robin over topics.

    Input  : summSize  - number of sentence ids wanted
             numTopics - how many leading entries of sortedVec to consult
             sortedVec - one ranked list per topic; item[0] of every entry is
                         used as the sentence id (presumably entries are
                         (sentence_id, score) pairs sorted best-first -
                         confirm against the caller)
    Purpose: walk ranks 0..summSize-1 and, at each rank, visit every topic in
             order, collecting each id the first time it is seen
    Output : list of the selected ids in selection order; it holds fewer than
             summSize ids when the ranked lists do not contain enough
             distinct ones (previously the function returned None then)
    """
    sent_no = []
    seen = set()
    for rank in range(summSize):
        for topic in range(numTopics):
            ranked = sortedVec[topic]
            # A topic may list fewer entries than the requested summary size.
            if rank >= len(ranked):
                continue
            si = ranked[rank][0]
            # The membership test must run for every topic; it used to sit
            # outside this loop, so only the last topic was ever considered.
            if si in seen:
                continue
            seen.add(si)
            sent_no.append(si)
            if len(sent_no) == summSize:
                return sent_no
    return sent_no