In [41]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd

In [42]:
df = pd.read_excel("text_data.xlsx")

In [43]:
df.head()

Unnamed: 0,summarytext,alltext
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [44]:
text = df["alltext"].to_list()

In [45]:
print(type(text))

<class 'list'>


preprocessing text

In [46]:
article_content = ''

def create_dictionary_table(text_string) -> dict:
   

    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    #reducing words to their root form
    stem = PorterStemmer()
    
    #creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1

    return frequency_table



calculate the sentence score values

In [47]:

def calculate_sentence_scores(sentences, frequency_table) -> dict:   

    #algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount_without_stop_words

    return sentence_weight


calculate the avarage scores

In [48]:

def calculate_average_score(sentence_weight) -> int:
   
    #calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    #getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score


generate summary

In [49]:

def get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary


In [50]:

def run_article_summary(article):
    
    #creating a dictionary for the word frequency table
    frequency_table = create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = calculate_sentence_scores(sentences, frequency_table)

    #getting the threshold
    threshold = calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = get_article_summary(sentences, sentence_scores, 1.5 * threshold)

    return article_summary


In [51]:

if __name__ == '__main__':
    df['output'] = df['alltext'].apply(lambda x: run_article_summary(str(x)))

In [52]:
df.head()

Unnamed: 0,summarytext,alltext,output
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...,?The circular is ridiculous.
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo...","But then, she got divorced and alimony is the..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...,with ?unmarried? or ?kunwari? ? ?The word ?vi...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Dujana is heard asking the officer. Congratul...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...,


In [53]:
df.to_csv("output.csv", index=False)