# **Import necessary** **libraries**

In [1]:
import os  # Used for interacting with the operating system
from nltk.tokenize import sent_tokenize, word_tokenize  # Tokenization functions from NLTK
from nltk.corpus import stopwords  # Stopwords from NLTK for text processing
import pandas as pd  # Pandas for data manipulation and analysis
import numpy as np  # NumPy for numerical operations
import math  # Math library for mathematical functions
import re

# **Mount Google Drive**

In [2]:
from google.colab import drive


drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
def lexrank_summarizer(file_path):
    file = open(file_path, 'r')
    text = file.read()

    text = text.strip().replace('\n', ' ')

    # Tokenize the text into sentences
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'www\.\S+', '', text)
    text = re.sub(r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b', '', text)

    # Remove times
    text = re.sub(r'\b\d{1,2}:\d{1,2}(:\d{1,2})?\b', '', text)

    # Remove email addresses
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

    sentences = sent_tokenize(text)

    stop_words = set(stopwords.words('english'))

    def token_lower(sentence):
        # Convert the sentence to lowercase
        sentence = sentence.lower()

        # Replace '$' with 'dollar'
        sentence = re.sub("[$]", "dollar ", sentence)

        # Replace '£' with 'pound'
        sentence = re.sub("[£]", "pound ", sentence)

        # Replace '%' with 'percent'
        sentence = re.sub("[%]", " percent", sentence)

        # Remove characters other than alphanumeric, '?', '!', '.', ',', "'", and '-'
        sentence = re.sub(r"[^a-zA-Z0-9?!.,’-]", ' ', sentence)

        # Add spaces around '?', '!', and '.'
        sentence = re.sub(r"([?!])", r" \1 ", sentence)

        # Add space after a lowercase letter followed by '.', ','
        sentence = re.sub(r'([a-z])(?=[.,])', r'\1 ', sentence)

        # Replace multiple spaces with a single space
        sentence = re.sub(r"\s+", " ", sentence)

        # Add a full stop at the end of the sentence
        sentence += "."

        # Tokenize the sentence using NLTK's word_tokenize
        words = word_tokenize(sentence)

        # Filter out stopwords and non-alphabetic words, and convert to lowercase
        filtered_words = [word.lower() for word in words if (word not in stop_words and word.isalpha())]

        return filtered_words

    tok_fil_sent = list(map(token_lower, sentences))
    num_nodes = len(tok_fil_sent)

    def idf(tok_fil_sent):
        word_idf = {}
        sent_set = []
        words = set()
        num_sent = len(tok_fil_sent)
        for i in range(num_sent):
            sent_set += [set(tok_fil_sent[i])]
            words |= sent_set[i]
        words = list(words)
        for word in words:
            word_idf[word] = math.log(float(num_sent) / sum([1 for i in range(num_sent) if word in sent_set[i]]))
        return word_idf

    word_idf = idf(tok_fil_sent)

    def idf_mod_cos(sent1, sent2, word_idf):
        sent1_dict = {}
        sent2_dict = {}
        for word in sent1:
            if word in sent1_dict:
                sent1_dict[word] += 1
            else:
                sent1_dict[word] = 1
        for word in sent2:
            if word in sent2_dict:
                sent2_dict[word] += 1
            else:
                sent2_dict[word] = 1

        common_words = set(sent1_dict.keys()) & set(sent2_dict.keys())

        if not common_words:
            return 0.0

        numerator = sum(sent1_dict[word] * sent2_dict[word] * word_idf[word] * word_idf[word] for word in common_words)

        sum_squared_sent1 = sum((sent1_dict[word] * word_idf[word]) ** 2 for word in sent1_dict)
        sum_squared_sent2 = sum((sent2_dict[word] * word_idf[word]) ** 2 for word in sent2_dict)

        denominator = math.sqrt(sum_squared_sent1) * math.sqrt(sum_squared_sent2)

        if denominator == 0.0:
            return 0.0

        similarity = numerator / denominator

        return similarity

    graph = np.zeros((num_nodes, num_nodes))
    for i in range(num_nodes):
        for j in range(i + 1, num_nodes):
            graph[i, j] = idf_mod_cos(tok_fil_sent[i], tok_fil_sent[j], word_idf)
            graph[j, i] = graph[i, j]

    node_weights = np.ones(num_nodes)

    def text_rank_sent(graph, node_weights, d=0.85, iter=20):
        weight_sum = np.sum(graph, axis=0)
        while iter > 0:
            for i in range(len(node_weights)):
                temp = 0.0
                for j in range(len(node_weights)):
                    temp += graph[i, j] * node_weights[j] / weight_sum[j]
                node_weights[i] = 1 - d + (d * temp)
            iter -= 1

    text_rank_sent(graph, node_weights)

    top_k = 10  # assuming you want to print the top 10 sentences
    top_index = [i for i, j in sorted(enumerate(node_weights), key=lambda x: x[1], reverse=True)[:top_k]]
    top_sentences = [sentences[i] for i in top_index]

    # Check if there are top sentences to join
    if top_sentences:
        # Join top sentences with a single space and add a period at the end
        ext_summary = " ".join(top_sentences) + "."
    else:
        # Handle the case where there are no top sentences
        ext_summary = ""

    return ext_summary

In [3]:
# Specify the folder paths
annual_report_folder = 'fns2020_dataset/validation/annual_reports'
output_folder = 'output_summaries'

In [4]:
# Ensure the output folder exists, create if not
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Process each file in the annual report folder
for file_name in os.listdir(annual_report_folder):  # Adjust the limit as needed
    file_path = os.path.join(annual_report_folder, file_name)
    summary = lexrank_summarizer(file_path)

    # Write the summary to an output file
    output_file_path = os.path.join(output_folder, f'summary_{file_name}')
    with open(output_file_path, 'w') as output_file:
        output_file.write(summary)

    print(f'Summary for {file_name} written to {output_file_path}')

  temp += graph[i, j] * node_weights[j] / weight_sum[j]


Summary for 31939.txt written to output_summaries/summary_31939.txt
Summary for 31839.txt written to output_summaries/summary_31839.txt
Summary for 32886.txt written to output_summaries/summary_32886.txt
Summary for 31333.txt written to output_summaries/summary_31333.txt
Summary for 32602.txt written to output_summaries/summary_32602.txt
Summary for 33038.txt written to output_summaries/summary_33038.txt
Summary for 31654.txt written to output_summaries/summary_31654.txt
Summary for 31857.txt written to output_summaries/summary_31857.txt
Summary for 31620.txt written to output_summaries/summary_31620.txt
Summary for 30894.txt written to output_summaries/summary_30894.txt
Summary for 31033.txt written to output_summaries/summary_31033.txt
Summary for 31007.txt written to output_summaries/summary_31007.txt
Summary for 31224.txt written to output_summaries/summary_31224.txt
Summary for 32143.txt written to output_summaries/summary_32143.txt
Summary for 31037.txt written to output_summarie

Summary for 32004.txt written to output_summaries/summary_32004.txt
Summary for 33037.txt written to output_summaries/summary_33037.txt
Summary for 30950.txt written to output_summaries/summary_30950.txt
Summary for 32560.txt written to output_summaries/summary_32560.txt
Summary for 31319.txt written to output_summaries/summary_31319.txt
Summary for 31023.txt written to output_summaries/summary_31023.txt
Summary for 30820.txt written to output_summaries/summary_30820.txt
Summary for 31940.txt written to output_summaries/summary_31940.txt
Summary for 31948.txt written to output_summaries/summary_31948.txt
Summary for 31508.txt written to output_summaries/summary_31508.txt
Summary for 30783.txt written to output_summaries/summary_30783.txt
Summary for 32032.txt written to output_summaries/summary_32032.txt
Summary for 31484.txt written to output_summaries/summary_31484.txt
Summary for 31026.txt written to output_summaries/summary_31026.txt
Summary for 32378.txt written to output_summarie

Summary for 31005.txt written to output_summaries/summary_31005.txt
Summary for 32543.txt written to output_summaries/summary_32543.txt
Summary for 31034.txt written to output_summaries/summary_31034.txt
Summary for 31879.txt written to output_summaries/summary_31879.txt
Summary for 31573.txt written to output_summaries/summary_31573.txt
Summary for 30800.txt written to output_summaries/summary_30800.txt
Summary for 32071.txt written to output_summaries/summary_32071.txt
Summary for 31399.txt written to output_summaries/summary_31399.txt
Summary for 31265.txt written to output_summaries/summary_31265.txt
Summary for 32185.txt written to output_summaries/summary_32185.txt
Summary for 30838.txt written to output_summaries/summary_30838.txt
Summary for 31970.txt written to output_summaries/summary_31970.txt
Summary for 32717.txt written to output_summaries/summary_32717.txt
Summary for 32072.txt written to output_summaries/summary_32072.txt
Summary for 32849.txt written to output_summarie

# create CSV file for generated extractive summary

In [7]:

# Folder path
folder_path = "output_summaries"

# Initialize an empty list to store data
data = []
# Iterate over files in the folder
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    # Check if the item in the folder is a file (not a subfolder)
    if os.path.isfile(file_path):
        # Read the content of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Append data to the list
        data.append((filename, content))

# Create a DataFrame
rd_extra_summ = pd.DataFrame(data, columns=["Filename", "Ext_Summary"])

# Set the index to be "31839"
rd_extra_summ.set_index("Filename", inplace=True)

# Save the DataFrame to a CSV file
rd_extra_summ.to_csv("Ext_Summary_50_files_lexRank.csv")


In [8]:
# Display the DataFrame
print(rd_extra_summ)

                                                         Ext_Summary
Filename                                                            
summary_32044.txt  25546.04    9 August 2017  PM    Proof Six NCC...
summary_33070.txt  Tellitstraight Reuters Annual Report and Form ...
summary_32139.txt  ANNUAL REPORT AND ACCOUNTS 2017 50 YEARS OF  I...
summary_32480.txt  Yorkshire Water  Services Limited Annual Repor...
summary_30817.txt  Annual Report 201 7 Strategic Report   What we...
...                                                              ...
summary_32540.txt  HAYWARD TYLER GROUP PLC     REPORT & ACCOUNTS ...
summary_31618.txt  ITE Group plc  Annual Report and Accounts 2017...
summary_31037.txt  ANNUAL REPORT Symphony Environmental Technolog...
summary_32134.txt  ANNUAL REPORT AND   FINANCIAL STATEMENTS  2016...
summary_30897.txt  Ubisense Group plc Annual Report 2017 Ubisense...

[363 rows x 1 columns]


In [9]:
rd_extra_summ['Numeric_Filename'] = rd_extra_summ.index.str.extract('(\d+)')

In [10]:
rd_extra_summ.head()

Unnamed: 0_level_0,Ext_Summary,Numeric_Filename
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1
summary_32044.txt,25546.04 9 August 2017 PM Proof Six NCC...,
summary_33070.txt,Tellitstraight Reuters Annual Report and Form ...,
summary_32139.txt,ANNUAL REPORT AND ACCOUNTS 2017 50 YEARS OF I...,
summary_32480.txt,Yorkshire Water Services Limited Annual Repor...,
summary_30817.txt,Annual Report 201 7 Strategic Report What we...,


In [11]:
import os
import csv

folder_path = 'output_summaries'
output_csv_path = 'output_summary.csv'

# Initialize CSV file
with open(output_csv_path, 'w', newline='') as csvfile:
    fieldnames = ['filename', 'extra_summary']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write header row
    writer.writeheader()

    # Iterate through text files
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)

            # Extract the file number (assuming the filename is in the format '17.txt', '18.txt', etc.)
            file_number = os.path.splitext(filename)[0]

            # Read content from the file
            with open(file_path, 'r') as file:
                content = file.read()

            # Write to CSV
            writer.writerow({'filename': file_number, 'extra_summary': content})

print(f'CSV file created at {output_csv_path}')

CSV file created at output_summary.csv


In [12]:
import pandas as pd

In [13]:
res_ext_summ = pd.read_csv("output_summary.csv")

In [14]:
res_ext_summ.head()

Unnamed: 0,filename,extra_summary
0,summary_32044,25546.04 9 August 2017 PM Proof Six NCC...
1,summary_33070,Tellitstraight Reuters Annual Report and Form ...
2,summary_32139,ANNUAL REPORT AND ACCOUNTS 2017 50 YEARS OF I...
3,summary_32480,Yorkshire Water Services Limited Annual Repor...
4,summary_30817,Annual Report 201 7 Strategic Report What we...


In [15]:
res_ext_summ['extra_summary'][0]

'25546.04    9 August 2017  PM    Proof Six NCC Group plc  Annual Report and Accounts  for the year ended 31 May 2017 NCC Group plc        Stock Code: NCC securing   tomorrow,   today Annual Report and Accounts for the year ended 31 May 2017 NCC AR2017 - proof 8-Front.indd   3     25546.04    9 August 2017  PM    Proof Six Why   we exist NCC Group is a global expert  in cyber security and risk  mitigation, working with  businesses to protect  their brand, data (including  intellectual property), value and  reputations against the ever- evolving threat landscape. The Group’s independence, knowledge,  experience and global footprint ensures that  NCC Group can help businesses identify,  assess, mitigate and respond to the risks they  face within this fluid and hostile environment. NCC Group is passionate about changing the  shape of the internet to make it safer and  revolutionising the way in which organisations  think about cyber security. NCC Group currently operates from over   30 of

# abstractive summary

In [16]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# !pip install sentencepiece

In [17]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Load pre-trained PEGASUS model and tokenizer
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

  return self.fget.__get__(instance, owner)()
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
res_ext_summ['token_count'] = res_ext_summ['extra_summary'].apply(lambda x: len(tokenizer.encode(x, max_length=1024, truncation=True)))

In [19]:
res_ext_summ.head()

Unnamed: 0,filename,extra_summary,token_count
0,summary_32044,25546.04 9 August 2017 PM Proof Six NCC...,780
1,summary_33070,Tellitstraight Reuters Annual Report and Form ...,666
2,summary_32139,ANNUAL REPORT AND ACCOUNTS 2017 50 YEARS OF I...,301
3,summary_32480,Yorkshire Water Services Limited Annual Repor...,509
4,summary_30817,Annual Report 201 7 Strategic Report What we...,699


In [20]:
res_ext_summ['token_count'].max()

1024

In [21]:
res_ext_summ['token_count'].min()

45

In [22]:
# Tokenize and generate summary
def generate_abs_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], max_length=512, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [23]:
# Apply the function to each row in 'extra_summary' column
res_ext_summ['gen_abs_summary'] = res_ext_summ['extra_summary'].apply(generate_abs_summary)

In [24]:
res_ext_summ.head()

Unnamed: 0,filename,extra_summary,token_count,gen_abs_summary
0,summary_32044,25546.04 9 August 2017 PM Proof Six NCC...,780,25546.04 9 August 2017 PM Proof Six NCC Group ...
1,summary_33070,Tellitstraight Reuters Annual Report and Form ...,666,Tellitstraight Reuters Annual Report and Form ...
2,summary_32139,ANNUAL REPORT AND ACCOUNTS 2017 50 YEARS OF I...,301,"2012 AVEVA acquires Bocad, adding advanced str..."
3,summary_32480,Yorkshire Water Services Limited Annual Repor...,509,Yorkshire Water Services Limited Annual Report...
4,summary_30817,Annual Report 201 7 Strategic Report What we...,699,Annual Report 201 7 Strategic Report What we d...


In [25]:
# Assuming 'res_ext_summ' is your DataFrame
res_ext_summ.to_csv('abs_summary_peg.csv', index=False)

In [26]:
x = pd.read_csv('abs_summary_peg.csv')

In [27]:
x.head()

Unnamed: 0,filename,extra_summary,token_count,gen_abs_summary
0,summary_32044,25546.04 9 August 2017 PM Proof Six NCC...,780,25546.04 9 August 2017 PM Proof Six NCC Group ...
1,summary_33070,Tellitstraight Reuters Annual Report and Form ...,666,Tellitstraight Reuters Annual Report and Form ...
2,summary_32139,ANNUAL REPORT AND ACCOUNTS 2017 50 YEARS OF I...,301,"2012 AVEVA acquires Bocad, adding advanced str..."
3,summary_32480,Yorkshire Water Services Limited Annual Repor...,509,Yorkshire Water Services Limited Annual Report...
4,summary_30817,Annual Report 201 7 Strategic Report What we...,699,Annual Report 201 7 Strategic Report What we d...


In [28]:
x['filename']

0      summary_32044
1      summary_33070
2      summary_32139
3      summary_32480
4      summary_30817
           ...      
358    summary_32540
359    summary_31618
360    summary_31037
361    summary_32134
362    summary_30897
Name: filename, Length: 363, dtype: object

In [29]:
# Assuming 'x' is your DataFrame
x['numeric_part'] = x['filename'].str.extract(r'(\d+)')

In [30]:
x['numeric_part']

0      32044
1      33070
2      32139
3      32480
4      30817
       ...  
358    32540
359    31618
360    31037
361    32134
362    30897
Name: numeric_part, Length: 363, dtype: object

# take actual summary_1

In [31]:
# Assuming 'x' is your DataFrame
folder_path = 'fns2020_dataset/validation/gold_summaries'

def read_summary_file(file_name):
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r') as file:
        return file.read()

# Apply the function to each row in 'numeric_part' column and create a new column 'file_content'
x['file_content'] = x['numeric_part'].apply(lambda num: read_summary_file(f'{num}_1.txt'))

In [32]:
x.head()

Unnamed: 0,filename,extra_summary,token_count,gen_abs_summary,numeric_part,file_content
0,summary_32044,25546.04 9 August 2017 PM Proof Six NCC...,780,25546.04 9 August 2017 PM Proof Six NCC Group ...,32044,25546.04 9 August 2017 3:58 PM Proof Si...
1,summary_33070,Tellitstraight Reuters Annual Report and Form ...,666,Tellitstraight Reuters Annual Report and Form ...,33070,CHIEF EXECUTIVE’S REVIEW\nReuters Group PLC A...
2,summary_32139,ANNUAL REPORT AND ACCOUNTS 2017 50 YEARS OF I...,301,"2012 AVEVA acquires Bocad, adding advanced str...",32139,14\nAVEVA GROUP PLC ANNUAL REPORT AND ACCOUNT...
3,summary_32480,Yorkshire Water Services Limited Annual Repor...,509,Yorkshire Water Services Limited Annual Report...,32480,Chief Executive’s Overview\nGood progress\nTh...
4,summary_30817,Annual Report 201 7 Strategic Report What we...,699,Annual Report 201 7 Strategic Report What we d...,30817,Strategic Report \nQ&A with Interim Group Ch...


In [33]:
x

Unnamed: 0,filename,extra_summary,token_count,gen_abs_summary,numeric_part,file_content
0,summary_32044,25546.04 9 August 2017 PM Proof Six NCC...,780,25546.04 9 August 2017 PM Proof Six NCC Group ...,32044,25546.04 9 August 2017 3:58 PM Proof Si...
1,summary_33070,Tellitstraight Reuters Annual Report and Form ...,666,Tellitstraight Reuters Annual Report and Form ...,33070,CHIEF EXECUTIVE’S REVIEW\nReuters Group PLC A...
2,summary_32139,ANNUAL REPORT AND ACCOUNTS 2017 50 YEARS OF I...,301,"2012 AVEVA acquires Bocad, adding advanced str...",32139,14\nAVEVA GROUP PLC ANNUAL REPORT AND ACCOUNT...
3,summary_32480,Yorkshire Water Services Limited Annual Repor...,509,Yorkshire Water Services Limited Annual Report...,32480,Chief Executive’s Overview\nGood progress\nTh...
4,summary_30817,Annual Report 201 7 Strategic Report What we...,699,Annual Report 201 7 Strategic Report What we d...,30817,Strategic Report \nQ&A with Interim Group Ch...
...,...,...,...,...,...,...
358,summary_32540,HAYWARD TYLER GROUP PLC REPORT & ACCOUNTS ...,381,Markets we serve Power Generation: Oil & Gas: ...,32540,Hayward Tyler Group PLC \nFinancial statement...
359,summary_31618,ITE Group plc Annual Report and Accounts 2017...,574,3 Like-for-like results are stated on a consta...,31618,Financial statements Governance Strategic rep...
360,summary_31037,ANNUAL REPORT Symphony Environmental Technolog...,403,ANNUAL REPORT Symphony Environmental Technolog...,31037,Chief Executive’s Review\nMichael Laurier\nSy...
361,summary_32134,ANNUAL REPORT AND FINANCIAL STATEMENTS 2016...,507,Group underlying sales* Profit before tax £12....,32134,STRATEGIC REPORT\nChief Executive's review\nI...


In [39]:
x['gen_abs_summary'][100]

'2002 Annual Report& Accounts DICOM Group plc Leading in Products and Services for the Electronic Document Capture Market and the Automation of Related Business Transactions DOC UMENT S IN INF ORMA TION OUT 13 39 14 18 20 23 6 DICOM Group plc 2 Annual Report 2002 3 4 About DICOM Group 5 Financial Highlights 6 Chairman’s Statement 9 Chief Executive s Review 11 Financial Review 13 Making Your Information Digital 14 Our Customers 17 Our Global EDC Team 18 Our EDC Products 20 Our Commitment to the Future 21 SGA Division 22 International Presence 23 Team & Organisation 24 Company News 25 The DICOM Group Share 27 Directors 29 Directors’ Report 31 Corporate Governance Statement 34 Remuneration Report 36 Statement of Directors’ Responsibilities 37 Independent Auditors’ Report 39 Consolidated Financial Statements 45 Notes to the Financial Statements 62 Five Year Record 63 Notice of Annual General Meeting 68 Company Secretary and Advisers 68 Principal Subsidiaries C ONTENT S DICOM Group plc 4 DI

In [40]:
x['file_content'][100]

' Annual Report 2002\n9\nI am delighted to provide a review of the trading\ncompanies in my new position as Chief\nExecutive of DICOM Group. Since being founded\n11 years ago and as a result of the vision and\nleadership provided by my predecessor Otto\nSchmid, the Group has shown an exceptional\ndevelopment. We were established as a Swiss\ncompany trading IT peripherals and have gone\nthrough a substantial evolution since. Today, we\nare recognised as the worldwide leader in our\ntarget markets, active in over 60 countries\naround the globe.\nIn the past most enterprises have heavily\ninvested in a broad range of IT solutions that\nautomate their business transactions and\nsupport decision making. However connectivity\nbetween input devices and applications is\nfrequently inadequate or absent resulting in\ndata and document access being imprecise,\nslow and hence expensive.\nThe EDC solutions offered by DICOM Group\nsolve this dilemma at an attractive cost to\nbenefit ratio. We have b

In [45]:
x.head()

Unnamed: 0,filename,extra_summary,token_count,gen_abs_summary,numeric_part,file_content
0,summary_32044,25546.04 9 August 2017 PM Proof Six NCC...,780,25546.04 9 August 2017 PM Proof Six NCC Group ...,32044,25546.04 9 August 2017 3:58 PM Proof Si...
1,summary_33070,Tellitstraight Reuters Annual Report and Form ...,666,Tellitstraight Reuters Annual Report and Form ...,33070,CHIEF EXECUTIVE’S REVIEW\nReuters Group PLC A...
2,summary_32139,ANNUAL REPORT AND ACCOUNTS 2017 50 YEARS OF I...,301,"2012 AVEVA acquires Bocad, adding advanced str...",32139,14\nAVEVA GROUP PLC ANNUAL REPORT AND ACCOUNT...
3,summary_32480,Yorkshire Water Services Limited Annual Repor...,509,Yorkshire Water Services Limited Annual Report...,32480,Chief Executive’s Overview\nGood progress\nTh...
4,summary_30817,Annual Report 201 7 Strategic Report What we...,699,Annual Report 201 7 Strategic Report What we d...,30817,Strategic Report \nQ&A with Interim Group Ch...


In [46]:
# Assuming 'res_ext_summ' is your DataFrame
x.to_csv('final_abs_summary_peg.csv', index=False)

# Create Txt Files From CSV

In [47]:
abs_summ = pd.read_csv('final_abs_summary_peg.csv')

In [49]:
abs_summ.tail()

Unnamed: 0,filename,extra_summary,token_count,gen_abs_summary,numeric_part,file_content
358,summary_32540,HAYWARD TYLER GROUP PLC REPORT & ACCOUNTS ...,381,Markets we serve Power Generation: Oil & Gas: ...,32540,Hayward Tyler Group PLC \nFinancial statement...
359,summary_31618,ITE Group plc Annual Report and Accounts 2017...,574,3 Like-for-like results are stated on a consta...,31618,Financial statements Governance Strategic rep...
360,summary_31037,ANNUAL REPORT Symphony Environmental Technolog...,403,ANNUAL REPORT Symphony Environmental Technolog...,31037,Chief Executive’s Review\nMichael Laurier\nSy...
361,summary_32134,ANNUAL REPORT AND FINANCIAL STATEMENTS 2016...,507,Group underlying sales* Profit before tax £12....,32134,STRATEGIC REPORT\nChief Executive's review\nI...
362,summary_30897,Ubisense Group plc Annual Report 2017 Ubisense...,419,Our solutions are based on powerful enterprise...,30897,Bringing the \ndigital twin to life\nWe have ...


In [50]:
df = pd.DataFrame(abs_summ)
output_directory = 'output_abs_summaries'
os.makedirs(output_directory, exist_ok=True)

# Save each row's "gen_abs_summary" into a separate text file
for index, row in df.iterrows():
    numeric_part = row['numeric_part']
    summary_text = row['gen_abs_summary']

    # Creating the file name
    file_name = os.path.join(output_directory, f'{numeric_part}_summ.txt')

    # Saving the text to the file
    with open(file_name, 'w') as file:
        file.write(summary_text)

# Calculate Average Rouge Scores

In [53]:
# Assuming your data is in a CSV file named 'your_dataset.csv'
# If it's in a different format, adjust the read function accordingly
df = pd.read_csv('results.csv')

# Separate data for each ROUGE category
rouge_l_data = df[df['ROUGE-Type'] == 'ROUGE-L+StopWordRemoval+Stemming']
rouge_1_data = df[df['ROUGE-Type'] == 'ROUGE-1+StopWordRemoval+Stemming']
rouge_2_data = df[df['ROUGE-Type'] == 'ROUGE-2+StopWordRemoval+Stemming']
rouge_su4_data = df[df['ROUGE-Type'] == 'ROUGE-SU4+StopWordRemoval+Stemming']

# Calculate averages for each category
avg_rouge_l = rouge_l_data[['Avg_Recall', 'Avg_Precision', 'Avg_F-Score']].mean()
avg_rouge_1 = rouge_1_data[['Avg_Recall', 'Avg_Precision', 'Avg_F-Score']].mean()
avg_rouge_2 = rouge_2_data[['Avg_Recall', 'Avg_Precision', 'Avg_F-Score']].mean()
avg_rouge_su4 = rouge_su4_data[['Avg_Recall', 'Avg_Precision', 'Avg_F-Score']].mean()

# Print the results with labels
print("Average for ROUGE-L:")
print(avg_rouge_l)

print("\nAverage for ROUGE-1:")
print(avg_rouge_1)

print("\nAverage for ROUGE-2:")
print(avg_rouge_2)

print("\nAverage for ROUGE-SU4:")
print(avg_rouge_su4)

Average for ROUGE-L:
Avg_Recall       0.132869
Avg_Precision    0.481902
Avg_F-Score      0.193183
dtype: float64

Average for ROUGE-1:
Avg_Recall       0.152009
Avg_Precision    0.509783
Avg_F-Score      0.205946
dtype: float64

Average for ROUGE-2:
Avg_Recall       0.082421
Avg_Precision    0.190772
Avg_F-Score      0.094931
dtype: float64

Average for ROUGE-SU4:
Avg_Recall       0.119188
Avg_Precision    0.224869
Avg_F-Score      0.129694
dtype: float64
