### Preprocessing

In [4]:
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords
nltk.download('stopwords') 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/student/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Shared WordNetLemmatizer instance, used by process_sentence and lemmatize_text
lemmatizer = WordNetLemmatizer()

In [17]:
def count_sentences(text):
    """Return how many sentences NLTK's sentence tokenizer finds in a text.

    Parameters:
    text (str): The text to be analyzed.

    Returns:
    int: The number of sentences produced by ``sent_tokenize``.
    """
    return len(sent_tokenize(text))


In [18]:
import re

In [19]:
def clean_text_sentence_level(text):
    """Clean text at the sentence level.

    This function takes a text as input and performs the following steps:
    - Remove URLs, numeric dates, clock times, and email addresses from the text
    - Tokenize the remaining text into sentences
    - Strip unwanted characters from each sentence (currency symbols, '%',
      '-', and basic punctuation are kept)
    - Apply preprocessing, stop word removal, POS filtering, and lemmatization
    - Keep only the sentences accepted by ``is_valid_sentence``

    Parameters:
    text (str): The text to be cleaned.

    Returns:
    tuple: A tuple of three lists: cleaned_sentences, cleanOriginal, and sentence_ids.
    - cleaned_sentences: The fully processed sentences that passed validation.
    - cleanOriginal: One character-cleaned (but otherwise unprocessed) sentence
      per tokenized sentence, valid or not.
    - sentence_ids: For each entry of cleaned_sentences, the index of its
      source sentence in cleanOriginal.
    """
    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'www\.\S+', '', text)

    # Remove numeric dates such as 12/05/2010 or 2010-05-12
    text = re.sub(r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b', '', text)

    # Remove times
    text = re.sub(r'\b\d{1,2}:\d{1,2}(:\d{1,2})?\b', '', text)

    # Remove email addresses. The top-level-domain class is letters only:
    # a '|' inside a character class is a literal pipe, not an alternation.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    cleanOriginal = []
    sentence_ids = []
    cleaned_sentences = []

    for sentence_id, sentence in enumerate(sentences):
        # Replace every character outside the allowed set with a space.
        # Newlines are outside that set, so line breaks become spaces too.
        cleaned_sentence = re.sub(r"[^a-zA-Z0-9?!.,’$£¥₹%\-]", ' ', sentence)
        # Put spaces around '?' and '!' so they become separate tokens
        cleaned_sentence = re.sub(r"([?!])", r" \1 ", cleaned_sentence)
        # Detach '.' and ',' from a preceding lowercase letter
        cleaned_sentence = re.sub(r'([a-z])(?=[.,])', r'\1 ', cleaned_sentence)
        cleaned_sentence = re.sub(r"\s+", " ", cleaned_sentence)

        # Appended for every sentence so that sentence_id indexes this list
        cleanOriginal.append(cleaned_sentence)

        # Apply the provided preprocessing functions
        cleaned_sentence = preprocessing(cleaned_sentence)
        cleaned_sentence = remove_stop_words(cleaned_sentence)
        cleaned_sentence = process_sentence(cleaned_sentence)

        # Lemmatize
        cleaned_sentence = lemmatize_text(cleaned_sentence)

        if is_valid_sentence(cleaned_sentence):
            cleaned_sentences.append(cleaned_sentence)
            sentence_ids.append(sentence_id)

    return cleaned_sentences, cleanOriginal, sentence_ids


In [20]:
def process_sentence(sentence):
    """Keep only the lemmatized nouns and verbs of a sentence.

    The sentence is tokenized and POS-tagged. A token survives when its tag is
    one of the noun/verb tags listed below and its lowercase lemma is
    alphanumeric and not an English stop word. No POS tag is passed to the
    lemmatizer, so it runs with its default settings.

    Parameters:
    sentence (str): The sentence to be processed.

    Returns:
    str: The surviving lemmas joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))
    noun_verb_tags = ('NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    kept = []
    for token, tag in pos_tag(word_tokenize(sentence)):
        # Tag check first: tokens with other tags are dropped regardless of lemma
        if tag not in noun_verb_tags:
            continue
        lemma = lemmatizer.lemmatize(token.lower())
        if lemma.isalnum() and lemma not in english_stops:
            kept.append(lemma)

    return ' '.join(kept)

In [21]:
def remove_stop_words(text):
    """Drop English stop words from a text.

    The text is tokenized with ``word_tokenize``; a token is removed when its
    lowercase form is in NLTK's English stop word list. Surviving tokens keep
    their original casing.

    Parameters:
    text (str): The text to be processed.

    Returns:
    str: The remaining tokens joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return ' '.join(kept)

In [22]:
def preprocessing(sentence):
    """Lowercase a sentence and spell out the symbols ``$``, ``£`` and ``%``.

    ``$`` becomes ``"dollar "``, ``£`` becomes ``"pound "`` and ``%`` becomes
    ``" percent"``. Other currency symbols are left untouched.

    Parameters:
    sentence (str): The sentence to be preprocessed.

    Returns:
    str: The preprocessed sentence as a string.
    """
    # (symbol, replacement) pairs, applied in this order
    substitutions = (
        ("$", "dollar "),
        ("£", "pound "),
        ("%", " percent"),
    )
    result = sentence.lower()
    for symbol, spelled_out in substitutions:
        result = result.replace(symbol, spelled_out)
    return result

In [23]:
def lemmatize_text(text):
    """Lemmatize every token of a text with the shared ``lemmatizer``.

    Args:
        text (str): The text to be lemmatized.

    Returns:
        str: The lemmatized tokens joined by single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return ' '.join(lemmas)

In [24]:
def is_valid_sentence(sentence):
    """Check if a sentence is valid for text analysis.

    A sentence is valid when it tokenizes to more than one token and digits
    make up less than 40% of its characters.

    Args:
        sentence (str): The sentence to be checked.

    Returns:
        bool: True if the sentence is valid, False otherwise.
    """
    # Zero- or one-token sentences are rejected first, so the division
    # below never sees an empty string
    if len(word_tokenize(sentence)) <= 1:
        return False

    digit_count = sum(1 for char in sentence if char.isdigit())
    return digit_count / len(sentence) < 0.4


### Importing Sample Text for testing

In [25]:
sample = 'training/annual_reports/64.txt'
file_path = 'training/annual_reports/64.txt'  # Replace with the actual file path

try:
    # Explicit encoding so the result does not depend on the platform default
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
except FileNotFoundError:
    print(f"File not found at path: {file_path}")
except (OSError, UnicodeDecodeError) as e:
    print(f"An error occurred: {e}")
else:
    # 'content' is now available for further processing.
    # Quick sanity check: count the number of lines in the file
    num_lines = len(content.split('\n'))
    print(f"Number of lines in the file: {num_lines}")



Number of lines in the file: 2356


In [26]:
# Sample output: run the sentence-level cleaner on the sample report.
# Unpacks (cleaned sentences, character-cleaned sentences, ids of the kept sentences).
stext_data,soriginal_sentence,smapping = clean_text_sentence_level(content)

In [27]:
# Show the cleaned sentences produced for the sample report
print(stext_data)

['vphase plc group statement year ended vphase plc group statement year ended vphase plc technology park chester ch1 e voltage management vphase home built england vphase plc specialises voltage management developed technology platform range product reduce energy used home business', 'plc subsidiary energetix group plc listed', 'content page highlight glance chairman statement executive review finance review board director group director report report auditor group income statement group statement change equity group balance sheet group cash flow statement note group statement company balance sheet note company balance sheet director secretary advisor group fsc logo go vphase electricity generation account total carbon emission energy paper curre r energy tre r nd weo home built zero carbon dti world energy demand ex e pands percent weo vphase plc group statement year ended highlight highlight placing share raising pound letter intent signed energy sse fund demonstration action voltage

### Summarization Function

In [28]:
#  pip install rank_bm25

In [29]:
import numpy as np
import networkx as nx
from gensim import corpora
from rank_bm25 import BM25Plus


from sklearn.feature_extraction.text import TfidfVectorizer

from math import log


In [42]:


def similarity_original(text_data):
    """Compute the word-overlap similarity matrix between sentences.

    The similarity of two different sentences is the number of distinct words
    they share, divided by ``log(len(s1)) + log(len(s2))`` where ``len`` is the
    sentence length in words. When that denominator is not positive (both
    sentences have exactly one word) it is replaced by 1.0. The diagonal and
    any pair involving an empty sentence are 0. The matrix is symmetric.

    Args:
        text_data (list): Sentences, each either a list of words or a plain
            string. Strings are split on whitespace so that words, not
            characters, are compared.

    Returns:
        numpy.ndarray: A square matrix of shape (len(text_data), len(text_data)),
        where element (i, j) is the similarity between sentence i and sentence j.
    """
    # Normalise every sentence to a list of words. Without this, a string
    # sentence would be treated as a sequence of characters.
    tokenized = [
        sentence.split() if isinstance(sentence, str) else list(sentence)
        for sentence in text_data
    ]
    # Unique words per sentence, built once instead of once per pair
    vocabularies = [set(tokens) for tokens in tokenized]

    sim = np.zeros([len(tokenized), len(tokenized)])  # Initialization
    for i, (tokens_1, words_1) in enumerate(zip(tokenized, vocabularies)):
        for j, (tokens_2, words_2) in enumerate(zip(tokenized, vocabularies)):
            # Self-similarity stays 0; empty sentences share nothing
            if i == j or not tokens_1 or not tokens_2:
                continue
            common = float(len(words_1 & words_2))
            denominator = log(len(tokens_1)) + log(len(tokens_2))
            if denominator <= 0:
                # Only reachable when both sentences have one word (log 1 + log 1)
                denominator = 1.0
            sim[i][j] = common / denominator
    return sim





def similarity_bm(sentence):
    """Score every sentence against every other sentence with BM25Plus.

    Each sentence is tokenized, mapped to a gensim bag-of-words, and indexed
    by ``BM25Plus``. Every sentence is then used as a query against that
    index, and its score vector becomes one row of the result.

    Args:
        sentence (list of str): A list of sentences to be compared.

    Returns:
        numpy.ndarray: One row per query sentence; element (i, j) is the score
        that sentence j receives for query sentence i.
    """
    # Tokenize once and reuse the tokens for both the index and the queries
    tokenized = [word_tokenize(sent) for sent in sentence]
    dictionary = corpora.Dictionary(tokenized)  # word <-> integer id mapping
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

    # NOTE(review): BM25Plus is given the doc2bow output, so each "term" it
    # indexes is an (id, count) pair rather than a word. Presumably intended;
    # confirm against the tokenized-corpus input rank_bm25 expects.
    bm25_obj = BM25Plus(corpus)

    score_rows = [
        bm25_obj.get_scores(dictionary.doc2bow(tokens))
        for tokens in tokenized
    ]
    return np.array(score_rows)


def _scale_by_max(matrix):
    """Divide a matrix by its largest entry, leaving it unchanged if that entry is not positive."""
    peak = matrix.max()
    # An all-zero matrix would otherwise turn into NaNs (0 / 0)
    return matrix / peak if peak > 0 else matrix


def generate_similarity_matrix(text):
    """Generate a similarity matrix between sentences in a text using two methods.

    The first method is the word-overlap measure of ``similarity_original``.
    The second method is the BM25Plus scoring of ``similarity_bm``.
    Each matrix is divided by its own maximum and the two are then summed, so
    entries lie between 0 and 2. The sum is not halved.

    Args:
        text (list of str): The cleaned sentences, one string per sentence.

    Returns:
        numpy.ndarray: A square matrix of shape (len(text), len(text)), where
        each element (i, j) is the combined similarity between sentence i and
        sentence j. An empty input yields an empty (0, 0) matrix.
    """
    # Nothing to compare: avoid calling max() on an empty matrix
    if len(text) == 0:
        return np.zeros((0, 0))

    first_matrix = similarity_original(text)
    second_matrix = similarity_bm(text)

    # Normalization, guarded against an all-zero matrix
    return _scale_by_max(first_matrix) + _scale_by_max(second_matrix)



def summarize(text, ratio=0.05):
    """
    Summarizes a given text using similarity matrix and PageRank algorithm.

    The text is cleaned sentence by sentence, a similarity graph is built over
    the valid sentences, and PageRank scores rank them. The highest-scoring
    sentences are returned in their original document order, using their
    character-cleaned (not stop-word-filtered) form.

    Parameters:
    text (str): The text to be summarized.
    ratio (float): Fraction of the document's sentences to keep. At least one
        sentence is returned whenever any sentence is valid.

    Returns:
    str: The summary of the text, or an empty string when no sentence is valid.
    """
    text_data, original_sentence, mapping = clean_text_sentence_level(text)
    if not text_data:
        return ""

    # Similarity Matrix
    similarity_matrix = generate_similarity_matrix(text_data)

    # Page Rank. Never allow fewer iterations than the library default of 100:
    # a short document would otherwise stop before converging.
    nx_graph = nx.from_numpy_array(similarity_matrix)
    max_iter = max(100, len(text_data))
    scores = nx.pagerank(nx_graph, max_iter=max_iter)

    # Best sentences. Rank by position rather than by sentence text, so that
    # two identical cleaned sentences stay distinct and each maps back to its
    # own source sentence.
    number = max(1, int(len(original_sentence) * ratio))
    ranked = sorted(
        range(len(text_data)),
        key=lambda position: scores[position],
        reverse=True,
    )

    # Restore document order before joining
    map_index = sorted(mapping[position] for position in ranked[:number])
    text_list = [original_sentence[i] for i in map_index]

    summary = " ".join(text_list)
    return summary

In [41]:
# Sample output: combined similarity matrix for the cleaned sample sentences
ssimilarity_matrix = generate_similarity_matrix(stext_data)
print(ssimilarity_matrix)

Bag-of-Words Representation Sample :
Document 1: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 2), (10, 2), (11, 2), (12, 1), (13, 1), (14, 4), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 2), (21, 1), (22, 2), (23, 5), (24, 2)]
built: 1
business: 1
ch1: 1
chester: 1
developed: 1
e: 1
ended: 2
energy: 1
england: 1
group: 2
home: 2
management: 2
park: 1
platform: 1
plc: 4
product: 1
range: 1
reduce: 1
specialises: 1
statement: 2
technology: 2
used: 1
voltage: 2
vphase: 5
year: 2
-------------------------------------------------------------------------------------------------
Document 2: [(9, 1), (14, 2), (25, 1), (26, 1), (27, 1)]
group: 1
plc: 2
energetix: 1
listed: 1
subsidiary: 1
-------------------------------------------------------------------------------------------------

 query sample:
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 2), (10, 2), (11, 2), (12, 1), (13, 1), (14, 4), (15, 1), (16, 1), (17, 1), (18, 1

### Importing Annual Reports and Summaries

In [27]:
import os
import pandas as pd

# Directories holding the reports, their gold summaries, and the generated output
text_path = 'validation/annual_reports/'
label_path = 'validation/gold_summaries/'
output_path = 'validation/generated_summaries/'

fileid = []
texts = []
labels = []
gen_sum = []
count = 0

# Create the output directory once, if it doesn't exist
os.makedirs(output_path, exist_ok=True)

for filename in os.listdir(text_path):
    filepath = os.path.join(text_path, filename)

    # Only regular .txt files are reports
    if not filename.endswith(".txt") or not os.path.isfile(filepath):
        continue

    # Read the report; the file is closed before the (slow) summarization
    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()

    summary = summarize(text)
    gen_sum.append(summary)

    # Write the generated summary next to the others as <name>_gensum.txt.
    # 'summary' is a single string, so it is written in one call.
    output_filename = filename.replace('.txt', '_gensum.txt')
    output_filepath = os.path.join(output_path, output_filename)
    with open(output_filepath, 'w', encoding='utf-8') as output_file:
        output_file.write(summary)

    # Progress indicator
    count += 1
    print(f"{count}, ", end="")

    fileid.append(filename)
    texts.append(text)

    # The gold summary for <name>.txt is expected at <name>_1.txt
    label_filename = filename.replace('.txt', '_1.txt')
    label_filepath = os.path.join(label_path, label_filename)

    if os.path.isfile(label_filepath):
        with open(label_filepath, 'r', encoding='utf-8') as label_file:
            labels.append(label_file.read().strip())
    else:
        # If label file does not exist, append None to keep the lists aligned
        labels.append(None)


# Create a DataFrame with one row per report
data = {'File_Name': fileid, 'AR_Report': texts, 'Summary': labels, 'My_Summary': gen_sum}
df = pd.DataFrame(data)

# Display the DataFrame
print(df)


1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222

### case 2 : applying some preprocessing to gold summaries before calculating rouge score

In [8]:
def clean_text(text):
    """
    Strip unwanted characters from a text and normalize its spacing.

    Parameters:
    text (str): The text to be cleaned.

    Returns:
    str: The cleaned text, on a single line, without leading or trailing whitespace.
    """
    # (pattern, replacement) pairs applied in order
    steps = (
        # Blank out everything except letters, digits and the listed punctuation.
        # Newlines are outside the allowed set, so line breaks become spaces here too.
        (r"[^a-zA-Z0-9?!.,’$£¥₹%\-]", ' '),
        # Surround question and exclamation marks with spaces
        (r"([?!])", r" \1 "),
        # Detach a period or comma from a preceding lowercase letter
        (r'([a-z])(?=[.,])', r'\1 '),
        # Collapse runs of whitespace into one space
        (r"\s+", " "),
    )
    for pattern, replacement in steps:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [None]:
import os
def process_text_file(file_path):
    """
    Clean a text file in place with ``clean_text``.

    The file is read as UTF-8, cleaned, and overwritten with the cleaned text.

    Parameters:
    - file_path (str): Path to the text file.

    Returns:
    - str: The cleaned text that was written back to the file.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    cleaned = clean_text(raw_text)

    # Overwrite the original file with its cleaned version
    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(cleaned)

    return cleaned

def process_files_in_folder(folder_path):
    """
    Run ``process_text_file`` on every ``.txt`` entry of a folder.

    Parameters:
    - folder_path (str): Path to the folder containing text files.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith('.txt'):
            continue
        process_text_file(os.path.join(folder_path, entry))

# Example usage: cleans every .txt gold summary in this folder, overwriting the files in place
folder_path = 'validation/gold_summaries'
process_files_in_folder(folder_path)

### Rouge Score Calculations by python library

In [32]:
import math

In [31]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
# from rouge import Rouge
# from nltk import ngrams
# from nltk.metrics import precision, recall, f_measure

# def get_ngrams(text, n):
#     tokens = text.split()
#     return list(ngrams(tokens, n))

# def rouge_n(reference, system, n):
#     reference_ngrams = get_ngrams(reference, n)
#     system_ngrams = get_ngrams(system, n)

#     reference_set = set(reference_ngrams)
#     system_set = set(system_ngrams)

#     precision_score = precision(reference_set, system_set)
#     recall_score = recall(reference_set, system_set)
#     f1_score = f_measure(reference_set, system_set)

#     return precision_score, recall_score, f1_score

# def rouge_scores(reference, system):
#     scores = {}
#     for n in range(1, 2):  # calculating ROUGE-1, ROUGE-2, and ROUGE-3
#         scores[f'ROUGE-{n}'] = rouge_n(reference, system, n)
#     return scores

# def calculate_rouge_l(reference, system):
#     rouge = Rouge()
#     scores = rouge.get_scores(system, reference)
#     rouge_l_score = scores[0]['rouge-l']['f']
#     return rouge_l_score





# scores = rouge_scores(reference_text, system_text)

# for key, value in scores.items():
#     precision_score, recall_score, f1_score = value
#     print(f'{key}: Precision = {precision_score:.2f}, Recall = {recall_score:.2f}, F1 = {f1_score:.2f}')

    
# rouge_l_score = calculate_rouge_l(reference_text, system_text)
# print(f"ROUGE-L Score: {rouge_l_score}")

### Naming the text files according to the given conventions

In [2]:
# import os

# def change_filenames_in_folder(folder_path, new_filename_prefix,suffix):
#     """
#     Change the filenames of all text files in a folder.

#     Parameters:
#     - folder_path (str): The path to the folder containing text files.
#     - new_filename_prefix (str): The new filename prefix.

#     Returns:
#     - list: List of paths to the renamed files.
#     """
#     renamed_file_paths = []

#     for filename in os.listdir(folder_path):
#         if filename.endswith('.txt'):
#             # Construct the full path to the file
#             file_path = os.path.join(folder_path, filename)

#             # Get the directory, original filename, and extension of the file
#             directory, original_filename = os.path.split(file_path)
#             _, extension = os.path.splitext(original_filename)

#             # Extract the numerical part before underscore
#             numeric_part = original_filename.split('_')[0]

#             # Create the new filename with the specified prefix and original extension
#             # new_filename = f"{new_filename_prefix}{numeric_part}_{suffix}{(original_filename.split('_')[1]).replace('.txt', '')}"
#             new_filename = f"{new_filename_prefix}{numeric_part}_{suffix}1{(original_filename.split('_')[1]).replace('gensum.txt', '')}"
#             new_path = os.path.join(directory, f"{new_filename}{extension}") 

#             # Rename the file
#             os.rename(file_path, new_path)
#     #         renamed_file_paths.append(new_path)

#     # return renamed_file_paths

# # # # Example usage:
# # # folder_path1 = 'score/gold_summaries/'
# # # prefix = 'fns'
# # # suffix1 = 'reference'
# # # change_filenames_in_folder(folder_path1,prefix,suffix1)


# folder_path1 = 'generated_summaries4/'
# prefix = 'fns'
# suffix1 = 'system'
# change_filenames_in_folder(folder_path1,prefix,suffix1)

### Preprocessing of text files for Rouge.jar according to the given conventions

In [8]:
def add_newline_after_sentence(text):
    """Return ``text`` with each sentence found by ``nltk.sent_tokenize`` on its own line."""
    return "\n".join(nltk.sent_tokenize(text))


# folder_path = "generated_summaries4/"
def preprocess_for_jar(folder_path):
    """Rewrite every ``.txt`` file in a folder with one sentence per line.

    Each file is read, split into sentences by ``add_newline_after_sentence``,
    and overwritten in place with the result.

    Parameters:
    - folder_path (str): Path to the folder containing text files.
    """
    # Iterate over each file in the folder
    for filename in os.listdir(folder_path):
        # Only text files are processed
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)

        # Read the content of the file. UTF-8 matches the encoding this
        # notebook uses when it writes generated summaries.
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        # Put each sentence on its own line
        modified_text = add_newline_after_sentence(text)

        # Write the modified text back to the file
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(modified_text)

In [10]:
# Split every .txt file in this folder into one sentence per line (files are rewritten in place)
folder_path = "generated_summaries4/"
preprocess_for_jar(folder_path)

### Calculating Average Scores of CSV file generated by Rouge.jar

In [11]:
import pandas as pd

def calculate_average(csvpath):
    """Average the recall, precision and F-score columns per ROUGE type.

    Parameters:
    - csvpath: Path (or file-like object) of a CSV with the columns
      "ROUGE-Type", "Avg_Recall", "Avg_Precision" and "Avg_F-Score".

    Returns:
    - pandas.DataFrame: One row per ROUGE type, holding the mean of each metric column.
    """
    metric_columns = ["Avg_Recall", "Avg_Precision", "Avg_F-Score"]

    # Rows that are entirely empty are discarded before averaging
    table = pd.read_csv(csvpath).dropna(how='all')
    relevant = table[["ROUGE-Type"] + metric_columns]
    return relevant.groupby("ROUGE-Type")[metric_columns].mean()

#### case 1: Results without applying some preprocessing to gold summaries

In [12]:
# Case 1: average the ROUGE metrics per ROUGE type for the run without gold-summary preprocessing
csv_path = "results.csv"
avg_result = calculate_average(csv_path)
print(avg_result)

                           Avg_Recall  Avg_Precision  Avg_F-Score
ROUGE-Type                                                       
ROUGE-1+StopWordRemoval      0.709806       0.098250     0.166995
ROUGE-2+StopWordRemoval      0.328381       0.034543     0.060522
ROUGE-L+StopWordRemoval      0.565320       0.155570     0.238432
ROUGE-SU4+StopWordRemoval    0.454828       0.042447     0.075417


#### case 2: Results with applying some preprocessing to gold summaries

In [45]:
# Case 2: average the ROUGE metrics per ROUGE type for the run with preprocessed gold summaries
csv_path = "results2.csv"
avg_result = calculate_average(csv_path)
print(avg_result)

                           Avg_Recall  Avg_Precision  Avg_F-Score
ROUGE-Type                                                       
ROUGE-1+StopWordRemoval      0.737630       0.100463     0.171193
ROUGE-2+StopWordRemoval      0.333264       0.040409     0.069644
ROUGE-L+StopWordRemoval      0.545502       0.145125     0.223913
ROUGE-SU4+StopWordRemoval    0.428033       0.053061     0.091327
