## 1. Importing libraries, dependencies, installing packages

In [1]:
# Install the 'rouge' package using pip
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [2]:
# Importing drive module from google colab for file access
from google.colab import drive
# Operating System module for interacting with the operating system
import os
# Pandas library for data manipulation and analysis
import pandas as pd
# Numpy library for numerical operations
import numpy as np
# Regular expression module for pattern matching and text processing
import re
# Natural Language Toolkit for text processing tasks
import nltk
# For tokenization
nltk.download('punkt')
# Tokenizer for word tokenization
from nltk.tokenize import word_tokenize
# Time module for time-related functions
import time
# Abstract Syntax Trees module for parsing Python code
import ast
# Train-test split function for splitting data
from sklearn.model_selection import train_test_split
# Transformers library for RoBERTa model
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
# DataLoader and Dataset classes for handling data
from torch.utils.data import DataLoader, Dataset
# tqdm library for progress bars
from tqdm import tqdm
# PyTorch library for deep learning
import torch
# Accuracy score metric for classification tasks
from sklearn.metrics import accuracy_score
# Automatic Mixed Precision for improved performance
from torch.cuda.amp import autocast, GradScaler
# Metrics for evaluating classification performance
from sklearn.metrics import confusion_matrix, classification_report
# ngrams function for generating n-grams from text
from nltk import ngrams
# Counter class for counting occurrences of elements
from collections import Counter
# Rouge metric for text summarization evaluation
from rouge import Rouge
# Functional module for PyTorch operations
import torch.nn.functional as F
# Garbage collection module for memory management
import gc


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 2. Setting Up File Paths and Directory

In [3]:
# Mount Google Drive to access files
# The project folder used by the following cells lives under this mount point.
# NOTE(review): force_remount=True presumably redoes the mount even if Drive is already mounted — confirm.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Set current path
# The working directory is switched to the project folder, so every relative path used in this
# notebook (dataset folders, generated CSV files, checkpoints) is resolved against it.
curr_path = '/content/drive/MyDrive/NLP_Roberta_Final'
os.chdir(curr_path)

# Set the path to the training data for annual reports and gold summaries
# NOTE: the gold-summary cell defines its own `training_data_path` with the same value instead of
# reading `training_data_path_summaries`.
training_data_path_reports = "./fns2020_dataset/training/annual_reports"
training_data_path_summaries = "./fns2020_dataset/training/gold_summaries"


## 3. Data Preprocessing

### i) Annual Reports

In [5]:
# Define functions for text preprocessing and segmenting
def preprocess_text(text):
    '''
      Normalise raw text for segmentation.

      Whitespace runs are collapsed to single spaces (leading/trailing whitespace is dropped),
      the text is lowercased, and every character that is not an ASCII letter, digit or
      whitespace is removed.
    '''
    single_spaced = ' '.join(text.split())
    return re.sub(r'[^a-zA-Z0-9\s]', '', single_spaced.lower())

def divide_into_segments(text, segment_length=250):
    '''
      Tokenise `text` with word_tokenize and cut the token list into consecutive chunks.

      Each chunk holds at most `segment_length` tokens (250 by default); the final chunk may be
      shorter. The chunk size is clamped to at least 1 so the range step is never zero.
      Returns a list of token lists (empty when the text has no tokens).
    '''
    tokens = word_tokenize(text)
    chunk_size = max(1, min(segment_length, len(tokens)))
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(tokens[start:start + chunk_size])
    return chunks

# Parallel lists, one entry per report segment: source file, 1-based segment number, token list
file_paths, texts, seg_nums = [], [], []

# Visit every file below the annual-report folder, clean it and cut it into segments
for folder, _subfolders, filenames in os.walk(training_data_path_reports):
    for filename in filenames:
        report_path = os.path.join(folder, filename)
        with open(report_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        report_segments = divide_into_segments(preprocess_text(raw_text))

        # One row per segment; segment numbers restart at 1 for every report
        file_paths += [report_path] * len(report_segments)
        seg_nums += list(range(1, len(report_segments) + 1))
        texts += report_segments

# Assemble the segment table and persist it for the dataset-creation cells
df_reports = pd.DataFrame({'Report_File_Path': file_paths, 'Segment_No': seg_nums,'Segment_Text': texts})
df_reports.to_csv('Annual_training_data_segments.csv', index=False)


### ii) Gold Summaries

In [6]:
import pandas as pd

# Define the path to the training data for gold_summaries
# (same folder as `training_data_path_summaries` from the file-path setup cell)
training_data_path = "./fns2020_dataset/training/gold_summaries"

# Read gold summary data
def divide_into_segments_summaries(text, segment_length=float('inf')):
    '''
      Tokenise a gold summary and return it as a list of token chunks.

      With the default `segment_length` of infinity the chunk size equals the number of tokens,
      so a non-empty summary comes back as a single chunk. A finite `segment_length` cuts the
      tokens into consecutive chunks of at most that size. An empty text yields an empty list.
    '''
    tokens = word_tokenize(text)
    chunk_size = max(1, min(segment_length, len(tokens)))
    chunk_starts = range(0, len(tokens), chunk_size)
    return [tokens[start:start + chunk_size] for start in chunk_starts]

# Parallel lists, one entry per summary segment: source file and token list
file_paths = []
texts = []
cnt = 0

# Visit every gold-summary file, clean it and store its segments
for folder, _subfolders, filenames in os.walk(training_data_path):
    for filename in filenames:
        summary_path = os.path.join(folder, filename)
        with open(summary_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        summary_segments = divide_into_segments_summaries(preprocess_text(raw_text))
        file_paths += [summary_path] * len(summary_segments)
        texts += summary_segments

# Build the summary table, show a preview and persist it
df = pd.DataFrame({'Summary_File_Path': file_paths, 'Segment': texts})
print(df.head())
df.to_csv('Summary_training_data_segments.csv', index=False)


                                   Summary_File_Path  \
0  ./fns2020_dataset/training/gold_summaries/17_3...   
1  ./fns2020_dataset/training/gold_summaries/19_2...   
2  ./fns2020_dataset/training/gold_summaries/19_1...   
3  ./fns2020_dataset/training/gold_summaries/17_2...   
4  ./fns2020_dataset/training/gold_summaries/19_3...   

                                             Segment  
0  [04, chairmans, statement, john, standen, none...  
1  [the, sentiment, within, the, group, is, one, ...  
2  [joint, chief, executives, st, atement, introd...  
3  [z, group, plc, annual, report, and, accounts,...  
4  [godsend, your, service, has, been, a, godsend...  


In [7]:
# Adding annual report path and summary number for each summary segments
df_processed = pd.read_csv('./Summary_training_data_segments.csv')

# NOTE(review): gold-summary files appear to be named '<report id>_<summary number>.<ext>' and the
# matching report '<report id>.<ext>' — confirm against the dataset.
# The pattern matches only the '_<number>' that sits directly before the file extension (or the end
# of the path), so underscores/digits elsewhere in the path are left alone and summary numbers of
# any size (not just 0-7) are handled.
summary_suffix = r'_(\d+)(?=(?:\.[^./\\]*)?$)'

# Same path as the summary, but pointing into the annual_reports folder
report_like_path = df_processed['Summary_File_Path'].str.replace('gold_summaries', 'annual_reports', regex=False)

# Report path = summary path without the '_<number>' suffix; SummaryNo = that number (as text).
# Paths that do not carry such a suffix keep their path unchanged and get a missing SummaryNo.
df_processed['Report_File_Path'] = report_like_path.str.replace(summary_suffix, '', regex=True)
df_processed['SummaryNo'] = report_like_path.str.extract(summary_suffix, expand=False)

df_processed.to_csv('Summary_training_data_segments_with_summary_number.csv', index=False)

## 4. Creating dataset

#### Importing preprocessed data

In [8]:
# Load the report segments and the gold summaries produced by the preprocessing cells
df1 = pd.read_csv("Annual_training_data_segments.csv")
df2 = pd.read_csv("Summary_training_data_segments_with_summary_number.csv")

# Columns kept in the pairing table, in output order
hybrid_columns = ['Report_File_Path', 'Segment_No', 'Summary_File_Path', 'SummaryNo', 'OverlapCount', 'Label']

# Pair every report segment with every summary of the same report (inner join on the report
# path), start 'Label' and 'OverlapCount' at 0, and keep only the columns listed above
df_hybrid_new = (
    df1.merge(df2, on='Report_File_Path', how='inner')
       .assign(Label=0, OverlapCount=0)
)[hybrid_columns]


#### Selecting report files for dataset labelling

In [9]:
# Function to count number of files in a directory
def count_files(directory_path):
    '''
    Return the number of entries that os.listdir reports for `directory_path`.

    If the directory does not exist, a message is printed and 0 is returned.
    '''
    try:
        entries = os.listdir(directory_path)
    except FileNotFoundError:
        print(f"The directory '{directory_path}' does not exist.")
        return 0
    return len(entries)

# Folder with the training annual reports
directory_path = './fns2020_dataset/training/annual_reports/'

# Report how many entries the folder holds
files_count = count_files(directory_path)
print(f"Number of files in the directory '{directory_path}': {files_count}")

# Every file in the folder is selected for labelling
no_of_selected_files = files_count
folder_path = "./fns2020_dataset/training/annual_reports/"
file_names = os.listdir(folder_path)

# Build the report paths in the same './…/annual_reports/<name>' form used in the segment table
selected_report_files = [
    "./fns2020_dataset/training/annual_reports/" + str(name)
    for name in file_names[:no_of_selected_files]
]


Number of files in the directory './fns2020_dataset/training/annual_reports/': 5


#### Comparing segments of annual reports with each summary, overlap count, labelling segments

In [10]:
# Filter DataFrame based on selected report files
# .copy() gives an independent frame, so the OverlapCount/Label updates below do not write into
# df_hybrid_new.
df_hybrid = df_hybrid_new[df_hybrid_new['Report_File_Path'].isin(selected_report_files)].copy()

# Function to compare two strings at unigram token level
def compare_segments(segment_text, segment):
    '''
    Count the distinct lowercase tokens that two strings have in common.

    Both strings are lowercased and tokenised with word_tokenize; the result is the size of
    the intersection of the two token sets (repeated tokens count once).
    '''
    report_vocab = set(word_tokenize(segment_text.lower()))
    summary_vocab = set(word_tokenize(segment.lower()))
    return len(report_vocab & summary_vocab)

# Score every (report segment, summary) pair by unigram overlap and label it.
#
# Lookup tables are built once instead of re-filtering df1/df2 for every row, which made the
# loop quadratic in the number of rows. drop_duplicates keeps the first occurrence of a key,
# matching the previous `.values[0]` lookups.
segment_text_by_key = (
    df1.drop_duplicates(['Report_File_Path', 'Segment_No'])
       .set_index(['Report_File_Path', 'Segment_No'])['Segment_Text']
       .to_dict()
)
summary_text_by_path = (
    df2.drop_duplicates('Summary_File_Path')
       .set_index('Summary_File_Path')['Segment']
       .to_dict()
)

# A pair is labelled 1 when at least this many distinct tokens are shared
overlap_threshold = 75

# NOTE(review): both texts come from CSV columns that were written from Python token lists, so
# they are the string form of a list ("['a', 'b', ...]"); the list punctuation is tokenised too
# and may add a few shared tokens to every count — confirm whether that is intended.
t1 = time.time()
overlap_counts = [
    compare_segments(segment_text_by_key[(report_file_path, segment_no)],
                     summary_text_by_path[summary_file_path])
    for report_file_path, segment_no, summary_file_path in zip(df_hybrid['Report_File_Path'],
                                                               df_hybrid['Segment_No'],
                                                               df_hybrid['Summary_File_Path'])
]
df_hybrid['OverlapCount'] = overlap_counts
df_hybrid['Label'] = (df_hybrid['OverlapCount'] >= overlap_threshold).astype(int)
t2 = time.time()

# Save DataFrame to a CSV file
df_hybrid.to_csv('Generated_Hybrid_Dataset_'+str(no_of_selected_files)+'_files.csv', index=False)

#### Grouping, selecting, labelling segments matching any summary, generating input dataset for Roberta model

In [11]:


# Read generated hybrid dataset CSV file
roberta_df = pd.read_csv('Generated_Hybrid_Dataset_'+str(no_of_selected_files)+'_files.csv')

# Add empty columns 'SegmentText' and 'Summary' to DataFrame
roberta_df['SegmentText'] = ''
roberta_df['Summary'] = ''
roberta_df_processed = roberta_df.copy()

# Lookup tables: (report path, segment number) -> serialized token list, summary path -> serialized token list
df1_dict = df1.set_index(['Report_File_Path', 'Segment_No'])['Segment_Text'].to_dict()
df2_dict = df2.set_index('Summary_File_Path')['Segment'].to_dict()

# Fallback for keys missing from the lookup tables. It must itself be a valid list literal:
# the previous fallback '' made ast.literal_eval raise SyntaxError instead of giving empty text.
empty_token_list = '[]'


def _tokens_to_text(serialized_tokens):
    '''Turn the string form of a token list (as stored in the CSV files) into space-joined text.'''
    return ' '.join(ast.literal_eval(serialized_tokens))


# Process SegmentText and Summary for the entire DataFrame
roberta_df_processed['SegmentText'] = roberta_df_processed.apply(
    lambda row: _tokens_to_text(df1_dict.get((row['Report_File_Path'], row['Segment_No']), empty_token_list)),
    axis=1)
roberta_df_processed['Summary'] = roberta_df_processed.apply(
    lambda row: _tokens_to_text(df2_dict.get(row['Summary_File_Path'], empty_token_list)),
    axis=1)

# Save processed DataFrame to a CSV file
roberta_df_processed.to_csv('Generated_Hybrid_Dataset_With_Segments_And_Summary_'+str(no_of_selected_files)+'_files.csv', index=False)

# Read processed hybrid dataset CSV file
df_atleast_one = pd.read_csv('Generated_Hybrid_Dataset_With_Segments_And_Summary_'+str(no_of_selected_files)+'_files.csv')

# One row per report segment: the label is 1 when the segment matched ANY of the report's
# summaries ('any' over the per-summary labels); the segment text is taken from the first row
result_df = df_atleast_one.groupby(['Report_File_Path', 'Segment_No'], as_index=False).agg({'Label': 'any', 'SegmentText': 'first'})

# 'any' yields booleans; store the label as 0/1
result_df['Label'] = result_df['Label'].astype(int)

# Save final DataFrame to a CSV file
result_df.to_csv('Generated_Training_Hybrid_Dataset_With_Segments_And_Label_'+str(no_of_selected_files)+'_files.csv', index=False)





## 5. Fine tuning Roberta model

In [12]:
# Labelled segment table produced by the dataset-creation cells
training_csv = 'Generated_Training_Hybrid_Dataset_With_Segments_And_Label_'+str(no_of_selected_files)+'_files.csv'
df = pd.read_csv(training_csv)

# Pretrained RoBERTa tokenizer and sequence-classification model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

# Encode every segment up front: truncated to max_length tokens, padded to a common length,
# returned as PyTorch tensors
max_length = 256
segment_texts = list(df['SegmentText'])
tokenized_inputs = tokenizer(segment_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Define a custom dataset class
class CustomDataset(Dataset):
    '''
    Dataset over pre-tokenised inputs.

    `input_ids`, `attention_mask` and `labels` are indexable and aligned by position; item `idx`
    is a dict with the keys 'input_ids', 'attention_mask' and 'labels', where the label is
    returned as a torch.long tensor.
    '''

    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids, self.attention_mask, self.labels = input_ids, attention_mask, labels

    def __len__(self):
        # One item per row of input_ids
        return len(self.input_ids)

    def __getitem__(self, idx):
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_mask[idx],
            labels=label_tensor,
        )

# Extract labels
labels = list(df['Label'])

# Split the dataset into training and testing sets.
# input_ids, attention masks and labels are split in ONE call so all three receive the same
# shuffled row selection. Previously only input_ids and labels were split and the masks were
# taken with positional slices, which paired each example with another row's attention mask.
(train_inputs, test_inputs,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(tokenized_inputs['input_ids'],
                                               tokenized_inputs['attention_mask'],
                                               labels,
                                               test_size=0.2,
                                               random_state=42)

# Create train and test datasets
train_dataset = CustomDataset(input_ids=train_inputs,
                              attention_mask=train_masks,
                              labels=train_labels)

test_dataset = CustomDataset(input_ids=test_inputs,
                             attention_mask=test_masks,
                             labels=test_labels)

# Create train and test dataloaders (training batches are reshuffled, test order is kept so
# predictions line up with test_labels)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Set up the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Learning rate is multiplied by 0.1 after every 2 scheduler steps (one step per epoch below)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

# Train on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
gradient_accumulation_steps = 4
scaler = GradScaler()

checkpoint_dir = 'checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Optionally, load the latest checkpoint if available.
# Only files named exactly 'checkpoint_epoch_<n>.pt' are considered, so any other file in the
# folder is ignored instead of breaking the epoch-number parsing.
checkpoint_name_pattern = re.compile(r'checkpoint_epoch_(\d+)\.pt')
checkpoint_epochs = {}
for candidate_name in os.listdir(checkpoint_dir):
    name_match = checkpoint_name_pattern.fullmatch(candidate_name)
    if name_match:
        checkpoint_epochs[candidate_name] = int(name_match.group(1))
latest_checkpoint = max(checkpoint_epochs, key=checkpoint_epochs.get, default=None)

if latest_checkpoint:
    # map_location lets a checkpoint written on one device be resumed on another
    # (e.g. saved on GPU, resumed on a CPU-only runtime)
    checkpoint = torch.load(os.path.join(checkpoint_dir, latest_checkpoint), map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scaler.load_state_dict(checkpoint['scaler_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    # The checkpoint stores the 0-based index of the epoch that finished
    start_epoch = checkpoint['epoch'] + 1
else:
    start_epoch = 0

# Fine-tune with mixed precision and gradient accumulation; one checkpoint per epoch
for epoch in range(start_epoch, num_epochs):
    model.train()
    # Every epoch starts from clean gradients
    optimizer.zero_grad()
    num_batches = len(train_dataloader)

    for batch_idx, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")):
        inputs = {key: value.to(device) for key, value in batch.items() if key != 'labels'}
        batch_labels = batch['labels'].to(device)

        with autocast():
            outputs = model(**inputs)
            loss = criterion(outputs.logits, batch_labels)
        # Scale the loss so the accumulated gradient is an average over the window
        loss = loss / gradient_accumulation_steps
        scaler.scale(loss).backward()

        window_full = (batch_idx + 1) % gradient_accumulation_steps == 0
        last_batch = (batch_idx + 1) == num_batches
        # Also step on the last batch: when the number of batches is not a multiple of
        # gradient_accumulation_steps, the trailing batches used to be left unapplied and their
        # gradients leaked into the first update of the next epoch (or were lost after the final
        # epoch). The trailing window is shorter, so its update is proportionally smaller.
        if window_full or last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

    # Update the learning rate
    scheduler.step()

    # Save checkpoint at the end of each epoch ('epoch' is the 0-based index of the finished epoch)
    checkpoint_name = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch + 1}.pt')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }, checkpoint_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Model evaluation

In [13]:
# Evaluation on the held-out split
model.eval()
predictions = []

# No gradients are needed for evaluation; disabling them avoids building autograd graphs
# (same as the validation-set inference later in this notebook)
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        inputs = {key: value.to(device) for key, value in batch.items() if key != 'labels'}

        with autocast():
            outputs = model(**inputs)
        # Predicted class = index of the largest logit
        predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

# test_dataloader is not shuffled, so predictions are in the same order as test_labels
# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy}")

# Create confusion matrix
conf_matrix = confusion_matrix(test_labels, predictions)
print("Confusion Matrix:")
print(conf_matrix)
class_report = classification_report(test_labels, predictions)
print("\nClassification Report:")
print(class_report)

Evaluating: 100%|██████████| 10/10 [00:01<00:00,  7.55it/s]

Accuracy: 0.7702702702702703
Confusion Matrix:
[[57  0]
 [17  0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        57
           1       0.00      0.00      0.00        17

    accuracy                           0.77        74
   macro avg       0.39      0.50      0.44        74
weighted avg       0.59      0.77      0.67        74




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Saving fine tuned model

In [None]:

# Write the fine-tuned classifier to a folder in the working directory; the validation section
# reloads it from this same folder with RobertaForSequenceClassification.from_pretrained.
# Only the model is saved here — the tokenizer is re-created from "roberta-base" where needed.
model.save_pretrained("Roberta_Classification_Model_Fine_Tuned")


## 6. Validation dataset: Label prediction, Summary generation, ROUGE score calculation

#### Common functions for Solution 1 and Solution 2 methods

### i) Solution 1 : Confidence score method

In [14]:
import sys
# Raise Python's recursion limit to one million frames.
# NOTE(review): no recursive code is visible in this notebook, so the reason for this setting is
# unclear — confirm it is still needed. A limit this high can let runaway recursion crash the
# interpreter instead of raising RecursionError.
sys.setrecursionlimit(10**6)

In [20]:
# Folder with the validation annual reports that are segmented, classified and summarised below.
# The trailing slash matters: the selected file paths are built by plain string concatenation.
test_data_path = "./fns2020_dataset/validation/annual_reports/"

def list_top_files(folder_path, top_n=files_count):
    '''
    Return the names of the `top_n` largest regular files directly inside `folder_path`.

    Sub-directories are ignored. Names are ordered by file size, largest first; `top_n`
    defaults to the module-level `files_count` as it was when this function was defined.
    '''
    def _size_in_bytes(name):
        return os.path.getsize(os.path.join(folder_path, name))

    regular_files = []
    for entry in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, entry)):
            regular_files.append(entry)

    regular_files.sort(key=_size_in_bytes, reverse=True)
    return regular_files[:top_n]

def preprocess_text(text):
    '''
    Normalise report text for classification: collapse whitespace runs to single spaces,
    lowercase, and drop every character that is not an ASCII letter, digit or whitespace.
    '''
    single_spaced = ' '.join(text.split())
    lowered = single_spaced.lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)


def preprocess_text_for_generatedSummary(text):
    '''
    Clean text that will be shown in a generated summary.

    Whitespace runs are collapsed to single spaces and every character other than ASCII
    letters, digits, full stops and whitespace is removed. Case is preserved and full stops
    are kept so sentence boundaries survive.
    '''
    single_spaced = ' '.join(text.split())
    return re.sub(r'[^a-zA-Z0-9.\s]', '', single_spaced)


def divide_into_segments(text, segment_length=250):
    '''
    Tokenise `text` with word_tokenize and return consecutive chunks of at most
    `segment_length` tokens (250 by default). The last chunk may be shorter; a text
    without tokens gives an empty list.
    '''
    tokens = word_tokenize(text)
    # Clamp to at least 1 so the range step below is never zero
    chunk_size = max(1, min(segment_length, len(tokens)))
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(tokens[start:start + chunk_size])
    return chunks

def divide_into_segments_summary(text, segment_length=250):
    '''
    Cut text into chunks of words for summary output, keeping full stops attached to words.

    The text is tokenised with word_tokenize. A stand-alone '.' token is not stored as its own
    entry: it is appended to the previous word of the current chunk (and dropped when the chunk
    is still empty). All other tokens are collected until the chunk holds `segment_length`
    entries (capped at the total token count, at least 1), then a new chunk is started.
    Returns a list of word lists; a trailing partial chunk is included.
    '''
    tokens = word_tokenize(text)
    chunk_size = max(1, min(segment_length, len(tokens)))

    chunks, pending = [], []
    for token in tokens:
        if token != '.':
            pending.append(token)
        elif pending and pending[-1] != '.':
            # Glue the full stop onto the word before it
            pending[-1] += '.'

        if len(pending) >= chunk_size:
            chunks.append(pending)
            pending = []

    if pending:
        chunks.append(pending)
    return chunks


# Largest validation reports (by file size) and their full paths
top_files_list = list_top_files(test_data_path, top_n=files_count)
selected_files = [test_data_path + name for name in top_files_list]

# Parallel lists: source file, 1-based segment number, model-input tokens, summary-output words
file_paths, texts, seg_nums, segment_summaries = [], [], [], []
cnt = 0

# Walk through the folders and process only the selected reports
for folder, _subfolders, filenames in os.walk(test_data_path):
    for filename in filenames:
        report_path = os.path.join(folder, filename)
        if report_path in selected_files:
            with open(report_path, 'r', encoding='utf-8') as handle:
                raw_text = handle.read()

            # Readable variant (case and full stops kept) used to write the summary text
            summary_chunks = divide_into_segments_summary(preprocess_text_for_generatedSummary(raw_text))
            # Normalised variant fed to the classifier
            report_chunks = divide_into_segments(preprocess_text(raw_text))

            file_paths += [report_path] * len(report_chunks)
            seg_nums += list(range(1, len(report_chunks) + 1))
            texts += report_chunks
            # NOTE(review): the two chunk lists are produced by different tokenisations and are
            # later placed side by side as columns, which assumes they have the same length for
            # every report — confirm; a mismatch would misalign rows or fail when the frame is built.
            segment_summaries += summary_chunks


# Build the validation segment table with both token columns flattened to space-joined text,
# write it to disk, and continue from the file that was written
validation_csv = './Test_Annual_validation_data_segments'+str(files_count)+'.csv'
pd.DataFrame({
    'Report_File_Path': file_paths,
    'Segment_No': seg_nums,
    'Segment_Text': [' '.join(tokens) for tokens in texts],
    'segments_for_summary': [' '.join(words) for words in segment_summaries],
}).to_csv(validation_csv, index=False)

df = pd.read_csv(validation_csv)
input_df = df
# Tokenize the data using RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

class CustomDataset(Dataset):
    '''
    Dataset that tokenises validation segments on demand.

    `texts` is a pandas Series of strings (accessed with .iloc). Each item is a dict with
    'input_ids' and 'attention_mask' tensors, both padded/truncated to 256 tokens.
    '''
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Encode the text and ensure it has the same length by padding or truncating.
        # The attention mask is returned as well so the model can ignore the padding positions.
        encoded = tokenizer(self.texts.iloc[idx], add_special_tokens=True, padding='max_length', max_length=256, truncation=True)
        return {
            "input_ids": torch.tensor(encoded["input_ids"]),
            "attention_mask": torch.tensor(encoded["attention_mask"]),
        }

# Create the evaluation dataset
eval_dataset = CustomDataset(input_df['Segment_Text'])

# DataLoaders
eval_dataloader = DataLoader(eval_dataset, batch_size=8)

# Load the pre-trained and fine-tuned model
model_path = "./Roberta_Classification_Model_Fine_Tuned"  # Replace with your actual path
model = RobertaForSequenceClassification.from_pretrained(model_path)
model.eval()

# Move the model to the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Lists to store predictions and confidence scores
predictions = []
confidence_scores = []

with torch.no_grad():
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Pass the attention mask, as was done during fine-tuning. Without it the model treats
        # the padding tokens of short segments as real input.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Calculate softmax to get probabilities
        probabilities = torch.softmax(logits, dim=1)

        # Predicted label = class with the largest logit; confidence = probability of class 1
        predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
        confidence = probabilities[:, 1].cpu().numpy()

        predictions.extend(predicted_labels)
        confidence_scores.extend(confidence)

# Attach the model output to a copy of the validation segments (input_df itself is not modified)
df_evaluated = input_df.assign(Predicted_Label=predictions, Confidence_Score=confidence_scores)

# Persist the scored segments and continue from the file that was written
csv_file_name = './evaluated_results__test_reports_from_complete_model.csv'
df_evaluated.to_csv(csv_file_name, index=False)
evaluated_results_df = pd.read_csv(csv_file_name)


# Score of a segment: probability of class 1 when the segment was predicted as 1, otherwise 0
evaluated_results_df['Predicted_Label*Confidence_Score'] = evaluated_results_df['Predicted_Label'] * evaluated_results_df['Confidence_Score']

# Number of consecutive segments that make up a summary
window_size = 4

# One record per report; the frame is built once at the end instead of concatenating in the loop
summary_records = []

# Iterate over unique Report_File_Path values
for report_file_path in evaluated_results_df['Report_File_Path'].unique():
    # Select rows for the current Report_File_Path
    subset_df = evaluated_results_df[evaluated_results_df['Report_File_Path'] == report_file_path].copy()

    # Work with positions inside this report (0, 1, 2, ...)
    segment_scores = subset_df['Predicted_Label*Confidence_Score'].reset_index(drop=True)

    if len(segment_scores) <= window_size:
        # Short report: there is no full window to choose from, use every segment
        start_position = 0
    else:
        # rolling(...).sum() at position i covers positions i-window_size+1 .. i, so idxmax gives
        # the LAST row of the best window. It used to be treated as the first row, which shifted
        # the selected segments three rows past the best-scoring window.
        rolling_sum = segment_scores.rolling(window=window_size).sum()
        end_position = int(rolling_sum.idxmax())
        start_position = end_position - window_size + 1

    # Extract the consecutive rows of the best window
    top_4_rows = subset_df.iloc[start_position:start_position + window_size]
    # Row labels (in evaluated_results_df) of the selected segments
    list_segments = top_4_rows.index.tolist()

    # Join the readable segment texts, then drop the first and last '. '-separated pieces,
    # which are usually sentences cut off by the segment boundaries
    generated_summary = ' '.join(top_4_rows['segments_for_summary'].tolist())
    generated_summary = '. '.join(generated_summary.split('. ')[1:-1])
    generated_summary += '.'

    summary_records.append({'Report_File_Path': report_file_path,
                            'generated_summary': generated_summary,
                            'list_segments': list_segments})

final_summary_df = pd.DataFrame(summary_records, columns=['Report_File_Path', 'generated_summary', 'list_segments'])


final_summary_df.to_csv('./Solution1_Validation_Results.csv')


# Folder that receives one text file per generated summary
output_path = './system1'
os.makedirs(output_path, exist_ok=True)

# File name used for ROUGE: "fns" + report name without extension + "_" + report file name
for report_path, summary_text in zip(final_summary_df['Report_File_Path'],
                                     final_summary_df['generated_summary']):
    report_name = os.path.basename(report_path)
    report_stem = report_name.split('.')[0]
    target_path = os.path.join(output_path, "fns" + report_stem + "_" + report_name)

    with open(target_path, 'w', encoding='utf-8') as handle:
        handle.write(summary_text)

print("Text files have been created in the 'system1' folder with the specified naming convention.")



Evaluating: 100%|██████████| 171/171 [00:21<00:00,  7.89it/s]


Text files have been created in the 'system1' folder with the specified naming convention.


### ii) Solution 2 : Bi-gram variability score method

In [21]:

# Reload the scored validation segments written by Solution 1
evaluated_results_df =  pd.read_csv('./evaluated_results__test_reports_from_complete_model.csv')

# Candidate segments: predicted as class 1 with a class-1 probability above 0.75
is_confident = evaluated_results_df['Confidence_Score'] > 0.75
is_predicted_positive = evaluated_results_df['Predicted_Label'] == 1
candidate_segments = evaluated_results_df[is_confident & is_predicted_positive].copy()

def count_bigrams(segment_text):
    '''
    Return the number of distinct word bigrams in `segment_text`.

    The text is tokenised with nltk.word_tokenize; each pair of adjacent tokens is a bigram
    and repeated bigrams are counted once.
    '''
    tokens = nltk.word_tokenize(segment_text)
    distinct_bigrams = set(ngrams(tokens, 2))
    return len(distinct_bigrams)




# Build one summary per report from the segments with the most varied bigrams.
#
# The ranking is restricted to `candidate_segments` (segments the classifier marked as summary
# material with high confidence). That filter used to be computed but never applied, so every
# segment was ranked and the classifier had no effect on this method. Reports without any
# candidate fall back to all of their segments so that each report still gets a summary.
top_segment_count = 4

# One record per report; the frame is built once at the end instead of concatenating in the loop
summary_records = []

# Iterate over unique Report_File_Path values
for report_file_path in evaluated_results_df['Report_File_Path'].unique():
    report_candidates = candidate_segments[candidate_segments['Report_File_Path'] == report_file_path]
    if report_candidates.empty:
        report_candidates = evaluated_results_df[evaluated_results_df['Report_File_Path'] == report_file_path]
    subset_df = report_candidates.copy()

    # Share of the pool's distinct bigrams contributed by each segment
    bi_gram_variability_scores = subset_df['Segment_Text'].apply(count_bigrams)
    subset_df['BiGramVariabilityScore'] = bi_gram_variability_scores / bi_gram_variability_scores.sum()

    # Select the top segments by score, then restore document order
    top_candidates = subset_df.nlargest(top_segment_count, 'BiGramVariabilityScore')
    top_candidates_sorted = top_candidates.sort_values(by='Segment_No')

    # Join the readable segment texts, then drop the first and last '.'-separated pieces,
    # which are usually sentences cut off by the segment boundaries
    joined_segments = ' '.join(top_candidates_sorted['segments_for_summary'])
    generated_summary = '.'.join(joined_segments.split('.')[1:-1])
    generated_summary += '.'

    summary_records.append({'Report_File_Path': report_file_path, 'generated_summary': generated_summary})

final_summary_df = pd.DataFrame(summary_records, columns=['Report_File_Path', 'generated_summary'])






# Folder that receives one text file per generated summary
output_path = './system2'
os.makedirs(output_path, exist_ok=True)

# File name used for ROUGE: "fns" + report name without extension + "_" + report file name
for report_path, summary_text in zip(final_summary_df['Report_File_Path'],
                                     final_summary_df['generated_summary']):
    report_name = os.path.basename(report_path)
    report_stem = report_name.split('.')[0]
    target_path = os.path.join(output_path, "fns" + report_stem + "_" + report_name)

    with open(target_path, 'w', encoding='utf-8') as handle:
        handle.write(summary_text)

print("Text files have been created in the 'system2' folder with the specified naming convention.")



Text files have been created in the 'system2' folder with the specified naming convention.


#### The summaries are written using the file-naming convention required by the contest organisers.
#### To score Solution 1, copy all generated summaries from the system1 folder into rouge2_v1.2.2_runnable/v1.2.2/projects/test-summarization/system,
#### then run this command from the v1.2.2 directory: java -jar rouge2-1.2.2.jar
#### This generates results.csv in the v1.2.2 directory. Repeat the same steps for the system2 summaries.

### ROUGE score analysis

In [None]:
# Load the ROUGE results file.
# The file used to be read into `f` while the code below filtered `df`, i.e. the validation
# segment table from an earlier cell, which has no 'ROUGE-Type' column.
rouge_results_df = pd.read_csv('results.csv')

# Score columns averaged for every ROUGE variant
rouge_score_columns = ['Avg_Recall', 'Avg_Precision', 'Avg_F-Score']


def _average_rouge_scores(results, rouge_type):
    '''Mean recall, precision and F-score over the rows of `results` with the given ROUGE-Type.'''
    return results.loc[results['ROUGE-Type'] == rouge_type, rouge_score_columns].mean()


# Calculate averages for each category
avg_rouge_l = _average_rouge_scores(rouge_results_df, 'ROUGE-L+StopWordRemoval')
avg_rouge_1 = _average_rouge_scores(rouge_results_df, 'ROUGE-1+StopWordRemoval')
avg_rouge_2 = _average_rouge_scores(rouge_results_df, 'ROUGE-2+StopWordRemoval')
avg_rouge_su4 = _average_rouge_scores(rouge_results_df, 'ROUGE-SU4+StopWordRemoval')

# Print the results with labels
print("Average for ROUGE-L:")
print(avg_rouge_l)

print("\nAverage for ROUGE-1:")
print(avg_rouge_1)

print("\nAverage for ROUGE-2:")
print(avg_rouge_2)

print("\nAverage for ROUGE-SU4:")
print(avg_rouge_su4)