# ISSS609-Text Analytics and Applications Project

There will be a list of workbooks for the purpose of this project.

1) Data_Preparation_and_Topic_Modelling

2) Abstractive_Summarisation

3) Extractive_Summarisation_Average_Score

4) Extractive_Summarisation_TFIDF

5) UI_Preparation

### This workbook will cover (4)  Extraction-based summarization based on TF-IDF algorithm.
Reference: https://github.com/akashp1712/nlp-akash/blob/master/text-summarization/summarize3.py"""

In [2]:
import math
import nltk
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader
import re
import pandas as pd

In [3]:
## Load text file and create a PlaintextCorpusReader object
file_directory = 'Raw Data/News_content/'
filename_pattern = '.+\.txt'
my_corpus = PlaintextCorpusReader(file_directory, filename_pattern)



In [4]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [5]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [6]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [7]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [8]:
def _create_final_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [9]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """
    # TODO: Can you make this multiprocess compatible in python?

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [10]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [11]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    # TODO: check if the sentences in the summarization is in the original order of occurrence.

    return summary

In [12]:
import os
os.chdir('Raw Data/News_content')
print (os.getcwd())

/Users/yaoyu/Desktop/python/project/News_content


In [13]:
def preprocess_sentence(text, keep_most=False):
    """
    Helper function to remove html, unneccessary spaces and punctuation.
    Args:
        text: String.
        keep_most: Boolean. depending if True or False, we either
                   keep only letters and numbers or also other characters.

    Returns:
        processed text.

    """
    text = text.lower()
    text = fixup(text)
    text = re.sub(r"<br />", " ", text)
    if keep_most:
        text = re.sub(r"[^a-z0-9%!?.,:()/]", " ", text)
    else:
        text = re.sub(r"[^a-z0-9]", " ", text)
    text = re.sub(r"    ", " ", text)
    text = re.sub(r"   ", " ", text)
    text = re.sub(r"  ", " ", text)
    text = text.strip()
    return text


def fixup(x):
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

In [14]:
fids = my_corpus.fileids()
result=[]
for i in fids:
    with open(i,'r') as file:
        data = file.read().replace('\n', '')
        data = re.sub(r'(?<=\.)[^.]*$', "",data)
        sentences = sent_tokenize(data)
        total_documents = len(sentences)
        freq_matrix = _create_frequency_matrix(sentences)
        tf_matrix = _create_tf_matrix(freq_matrix)
        count_doc_per_words = _create_documents_per_words(freq_matrix)
        idf_matrix=_create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
        tf_idf_matrix = _create_final_matrix(tf_matrix, idf_matrix)
        sentence_scores = _score_sentences(tf_idf_matrix)
        threshold = _find_average_score(sentence_scores)
        summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
        Summary=i+summary
        result.append(Summary)
        
       

In [15]:
df=pd.DataFrame()
df["summary"]=result

In [16]:
NO=[]
SU=[]
for h in df["summary"]:
    no=h.split(" ",1)[0]
    su=h.split(" ",1)[1:]
    NO.append(no)
    SU.append(su)
df["NO."]=NO
df["summary"]=SU

In [17]:
columnsTitles = ['NO.', 'summary']

df = df.reindex(columns=columnsTitles)

In [18]:
df

Unnamed: 0,NO.,summary
0,145607.txt,[Constant whispers of the US leaving or defund...
1,145609.txt,"[Imagine the scenario. Ludicrous, right? It ce..."
2,145610.txt,[The Guardian US Politics Minute catches you u...
3,145611.txt,[” “To me the kangaroos look like Mr and Mrs M...
4,145612.txt,[It’s no wonder a number of clubs are interest...
5,145613.txt,[It’s hard to think of a situation in which it...
6,145615.txt,[” He said he had already killed several peop...
7,145616.txt,[British planes were among those operating in ...
8,145618.txt,[He had campaigned for LGBT rights within the ...
9,145619.txt,[Court officials say they cannot find addition...


In [19]:
df.to_csv("4a. TFIDF_Summary.csv")