In [49]:
# Class that bundles the text-cleaning, sentence-ranking and summary-printing steps.

class Cleaner:
    '''
    Helpers to clean input text and build a short extractive summary from it.

    The steps are: split into sentences, strip non-letters, build TF-IDF
    vectors, compute pairwise cosine similarity, rank sentences with PageRank
    and print the best-ranked ones. Each step is a method that receives its
    input explicitly and returns its result; `run_summarizer` chains them.
    '''


    def __init__(self, text_file):
        '''
        Store the input text on the instance.

        args:

        text_file = the input string data (or an iterable of strings).
        '''

        self.text_file = text_file


    def punctuation_removal(self, sentences):
        '''
        Replace every character that is not an ASCII letter with a space.

        args:

        sentences = iterable of sentence strings.

        returns:

        pandas Series with one cleaned string per input sentence.
        '''

        import pandas as pd
        # regex=True must be explicit: from pandas 2.0 on the default is a
        # literal match, which would leave the sentences unchanged.
        # dtype="object" keeps the .str accessor usable for an empty input.
        clean_sentences = pd.Series(sentences, dtype="object").str.replace(
            "[^a-zA-Z]", " ", regex=True)
        return clean_sentences


    def tokenize_sentences(self, text_file):
        '''
        Split the given text into a flat list of sentences.

        args:

        text_file = a string, or an iterable of strings (e.g. paragraphs).

        returns:

        list of sentence strings, in document order.
        '''

        from nltk.tokenize import sent_tokenize
        # A bare string would otherwise be iterated character by character.
        if isinstance(text_file, str):
            text_file = [text_file]
        return [sentence
                for paragraph in text_file
                for sentence in sent_tokenize(paragraph)]


    def extract_feature(self, clean_sentences):
        '''
        Build the TF-IDF matrix (unigrams, English stop words removed).

        args:

        clean_sentences = iterable of cleaned sentence strings.

        returns:

        the matrix produced by TfidfVectorizer.fit_transform, one row per sentence.
        '''

        from sklearn.feature_extraction.text import TfidfVectorizer
        tf_idf_vec = TfidfVectorizer(use_idf=True,
                        smooth_idf=False,
                        ngram_range=(1,1),stop_words='english')
        tf_idf_matrix = tf_idf_vec.fit_transform(clean_sentences)
        return tf_idf_matrix


    def cosine_similarity(self, tf_idf_matrix):
        '''
        Compute the sentence-to-sentence cosine similarity matrix.

        args:

        tf_idf_matrix = TF-IDF matrix with one row per sentence.

        returns:

        square numpy array of pairwise similarities with a zero diagonal.
        '''

        from sklearn.metrics.pairwise import cosine_similarity
        import numpy as np
        # The size comes from the argument itself, not from notebook globals.
        sim_mat = np.array(cosine_similarity(tf_idf_matrix, tf_idf_matrix), dtype=float)
        # A sentence's similarity to itself must not count as a graph edge.
        np.fill_diagonal(sim_mat, 0.0)
        return sim_mat


    def page_ranking(self, sim_mat):
        '''
        Rank sentences by running PageRank on the similarity graph.

        args:

        sim_mat = square similarity matrix (numpy array).

        returns:

        dict mapping sentence index to its PageRank score.
        '''

        import networkx as nx

        nx_graph = nx.from_numpy_array(sim_mat)
        scores = nx.pagerank(nx_graph)
        return scores


    def show_summary(self, scores, sentences=None, top_n=2):
        '''
        Print the highest-scoring sentences and return them.

        args:

        scores = mapping from sentence index to score.
        sentences = the sentences the scores refer to, in index order. When
                    omitted they are rebuilt from self.text_file by tokenizing
                    and removing punctuation.
        top_n = maximum number of sentences to show (default 2).

        returns:

        list of the printed sentences, best first.
        '''

        if sentences is None:
            sentences = self.punctuation_removal(
                self.tokenize_sentences(self.text_file))
        ranked_sentences = sorted(
            ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        # Slicing (rather than indexing 0 and 1) tolerates short inputs.
        top_sentences = [s for _, s in ranked_sentences[:max(top_n, 0)]]
        for sentence in top_sentences:
            print(sentence)
        return top_sentences


    def run_summarizer(self, text_file):
        '''
        Run the whole pipeline on text_file and print the summary.

        args:

        text_file = a string, or an iterable of strings.

        returns:

        list of the summary sentences that were printed.
        '''

        # Each stage consumes the previous stage's output.
        sentences = self.tokenize_sentences(text_file)
        clean_sentences = self.punctuation_removal(sentences)
        tf_idf_matrix = self.extract_feature(clean_sentences)
        similarity_matrix = self.cosine_similarity(tf_idf_matrix)
        scores = self.page_ranking(similarity_matrix)
        top_sentences = self.show_summary(scores, clean_sentences)
        print('All functions run finished!')
        return top_sentences

In [50]:
# Sample input: a one-element list holding a lower-cased editorial about pesky telemarketing calls.
text = ['\n\nthe fresh initiative by the union telecommunications ministry to tackle pesky calls is welcome. cracking down on such unwelcome calls, which are a source of great irritation to telecom subscribers, comes after earlier attempts like setting up the do not disturb registry have been bypassed by telemarketers. with a large army of educated, unemployed youth cheaply feeding into the staffing needs of telemarketers, the number of pesky calls have multiplied manifold. in the absence of monetary penalties and other regulatory deterrence, there was no incentive on the part of telemarketers to change their ways.\n\nthe digital intelligence unit to investigate mobile frauds will also have its hands full. for one, the linkages between telemarketers and those fraudulently peddling financial products has to be more thoroughly investigated. the recent crackdown on mobile lending apps revealed that these used fully equipped call centres to act as collection agents to force debtors to repay.\nread also: govt makes fresh bid to rein in pesky callers, phone frauds\non top of this there is today easy access to personal data, which makes it simple for telemarketers to commit financial fraud on gullible consumers, especially the elderly. a data protection law to prevent the leakage of consumer telecom and financial information in the possession of telecom and banking companies is also necessary.the new digital intelligence unit must effectively invoke penalties and choke telecom resources to those abusing teleconnectivity. otherwise, there will be no end to this menace.']

In [51]:
# Collapse the list of documents into one string, separating entries with a single space.
raw_data = str.join(' ', text)

In [52]:
# Build the Cleaner instance around the joined text.
summary = Cleaner(text_file=raw_data)

In [53]:
# Split the documents into sentences, then strip punctuation from each one.
sentences = summary.tokenize_sentences(text)
# Bind the cleaned sentences to a name so later cells can reuse them after a
# kernel restart; the bare name on the last line displays the Series.
clean_sentences = summary.punctuation_removal(sentences)
clean_sentences

0      the fresh initiative by the union telecommun...
1    cracking down on such unwelcome calls  which a...
2    with a large army of educated  unemployed yout...
3    in the absence of monetary penalties and other...
4    the digital intelligence unit to investigate m...
5    for one  the linkages between telemarketers an...
6    the recent crackdown on mobile lending apps re...
7    read also  govt makes fresh bid to rein in pes...
8    a data protection law to prevent the leakage o...
9      otherwise  there will be no end to this menace 
dtype: object

In [None]:
def run_summarizer(text_file=None, cleaner=None):
    '''
    Run the summarisation steps in order and return the sentence scores.

    Each step receives the output of the previous one: sentences ->
    cleaned sentences -> TF-IDF matrix -> similarity matrix -> PageRank scores.

    args:

    text_file = the text to summarise, passed to tokenize_sentences.
                Defaults to the notebook-level `text` variable.
    cleaner = object providing tokenize_sentences, punctuation_removal,
              extract_feature, cosine_similarity and page_ranking.
              Defaults to the notebook-level `summary` instance.

    returns:

    whatever cleaner.page_ranking returns for the similarity matrix.
    '''
    # Fall back to the notebook globals only when the caller passes nothing,
    # so a plain `run_summarizer()` call keeps working.
    if cleaner is None:
        cleaner = summary
    if text_file is None:
        text_file = text

    sentences = cleaner.tokenize_sentences(text_file)
    clean_sentences = cleaner.punctuation_removal(sentences)
    tf_idf_matrix = cleaner.extract_feature(clean_sentences)
    similarity_matrix = cleaner.cosine_similarity(tf_idf_matrix)
    scores = cleaner.page_ranking(similarity_matrix)
    print('All functions run finished!')
    return scores