### Section bodies combined by score order top 1000 words

#### Steps:

1. Get the sections whose scores are greater than or equal to the cutoff 0.744291
2. Extract the body of each section
3. Normalize the scores and adjust the number of words to be picked from each section
4. Maximize the word allocation in case some sections don't have the required number of words
5. Extract the top k words
6. If there are no relevant sections -> write the first 1000 words of the complete file as the summary
7. Compare the system summary with each gold summary and get the ROUGE score. Take the average over all gold summaries for the respective file.
8. Take the average of the scores over all the files.

In [23]:
import pandas as pd
import pickle
import os
import numpy as np

from extract_section_body import extract_section_body
from rouge_evaluation import get_rouge_scores
from maximal_word_allocation import get_number_of_words

In [24]:
# Dataset selector flags. They decide which set of paths is configured below,
# and VALIDATION_DATASET additionally gates the ROUGE evaluation cells.
VALIDATION_DATASET = True
TEST_DATASET = False

In [25]:
# Exactly one dataset flag must be set. With neither set, the path variables
# below are never defined (NameError much later); with both set, the test
# paths would silently overwrite the validation ones.
if bool(VALIDATION_DATASET) == bool(TEST_DATASET):
    raise ValueError('Set exactly one of VALIDATION_DATASET / TEST_DATASET to True.')

if VALIDATION_DATASET:
    # Root directory of the dataset split.
    dir_ = '../../Dataset/FNS2023_Datasets/English/validation'
    # Pickle handed to extract_section_body to locate sections inside a report.
    toc_loc_pkl_file_path = '../../Dataset/FNS2023_Datasets/English/validation/out/valid_toc_loc.pkl'
    # Pickled DataFrame with the per-section classifier predictions.
    df_predicted_path = '../../DiMSum_FNP_2022/2_Section_Classification/out/validation_df_predicted.pkl'

if TEST_DATASET:
    dir_ = '../../../Dataset/FNS2022/English/testing/'
    toc_loc_pkl_file_path = '../../../Dataset/Annotated_Dataset/test_toc_loc.pkl'
    df_predicted_path = '../../FNP2022/2_Section_Classification/out/test_df_predicted.pkl'

# Sub-directories of dir_ holding the input reports and the reference summaries.
annual_reports_dir = "annual_reports"
gold_summary_dir = "gold_summaries"
# Output directory for generated summaries and the suffix used in their file names.
system_summary_dir = 'GPT_Summaries'
team_name = 'SSC_AI_RG'
dir_

'../../Dataset/FNS2023_Datasets/English/validation'

In [26]:
# Load the per-section prediction table. The context manager guarantees the
# file handle is closed (the bare open() call used before leaked it).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open(df_predicted_path, 'rb') as predictions_file:
    df_predicted = pickle.load(predictions_file)
df_predicted

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned,pred,False,True
0,30777,Financial and operational highlights,161,22,0,financi oper highlight,1,0.417846,0.582154
1,30777,Strategic report,183,6,1,strateg report,0,0.931694,0.068306
2,30777,Global network,189,11,0,global network,0,0.756982,0.243018
3,30777,Chairman’s statement,200,4,1,chairman statement,1,0.018714,0.981286
4,30777,Chief Executive’s review,204,4,1,chief execut review,1,0.007735,0.992265
...,...,...,...,...,...,...,...,...,...
10547,4162,"to 110,",21914,1104,0,,0,0.980373,0.019627
10548,4162,and 117,23018,1104,0,,0,0.982939,0.017061
10549,4162,to 116,24122,1104,0,,0,0.985174,0.014826
10550,4162,to 122,25226,5440,0,,0,0.984885,0.015115


In [27]:
def get_relevant_sections_with_score(file_id):
    """Return the sections of one report whose score reaches the cutoff.

    Args:
        file_id: Report identifier; converted with ``int`` before it is
            compared against the ``file_id`` column of ``df_predicted``.

    Returns:
        dict mapping ``toc_section`` to its value in the ``'True'`` column,
        restricted to rows whose value is >= 0.744291. Entries keep the row
        order of ``df_predicted``; if a section name repeats, the last score
        wins while the first position is kept.
    """
    cutoff_score = 0.744291
    file_rows = df_predicted[df_predicted.file_id == int(file_id)]
    columns = file_rows[['toc_section', 'True']].to_dict('list')
    return {
        section_name: score
        for section_name, score in zip(columns['toc_section'], columns['True'])
        if score >= cutoff_score
    }

In [28]:
def get_relevant_sections_with_body_len(file_id):
    """Map every relevant section of a report to the length of its body.

    The length is the number of pieces obtained by splitting the body on a
    single space, so consecutive spaces contribute empty pieces to the count.

    Args:
        file_id: Report identifier, forwarded to the section lookup and to
            ``extract_section_body``.

    Returns:
        dict of section name -> number of space-separated pieces in the body,
        in the order produced by ``get_relevant_sections_with_score``.
    """
    return {
        section_name: len(
            extract_section_body(
                file_id, section_name, dir_, annual_reports_dir, toc_loc_pkl_file_path
            ).split(' ')
        )
        for section_name in get_relevant_sections_with_score(file_id)
    }

In [29]:
def get_section_number_of_words(file_id):
    """Decide how many words each relevant section contributes to the summary.

    The section scores and body lengths are handed to ``get_number_of_words``
    together with a budget of 1000 and a zero-filled "previously required"
    vector; each returned value is truncated with ``int``.

    Args:
        file_id: Report identifier.

    Returns:
        dict of section name -> integer word allocation, in section order.
    """
    scores_by_section = get_relevant_sections_with_score(file_id)
    lengths_by_section = get_relevant_sections_with_body_len(file_id)

    score_array = np.array(list(scores_by_section.values()))
    length_array = np.array(list(lengths_by_section.values()))
    already_required = np.zeros(len(length_array))

    allocation = get_number_of_words(score_array, length_array, 1000, already_required)
    return {
        section_name: int(allocation[position])
        for position, section_name in enumerate(scores_by_section)
    }

In [None]:
num_file = 0
# Create the directory for the system-generated summaries. exist_ok=True keeps
# the cell re-runnable; without the directory every write below would fail.
os.makedirs(system_summary_dir, exist_ok=True)

# Iterate through files in the annual_reports_dir
for file in os.listdir(os.path.join(dir_, annual_reports_dir)):
    # Skip macOS metadata files *before* any processing (checking afterwards,
    # as was done previously, has no effect).
    if ".DS_Store" in file:
        continue

    try:
        print("Processing File Number: ", num_file)
        num_file = num_file + 1

        # Extract the file_id from the filename
        file_id = file.split('.')[0]

        # Get relevant sections and their scores for the file
        relevant_sections_with_score = get_relevant_sections_with_score(file_id)

        # Section order is maintained
        relevant_sections = list(relevant_sections_with_score.keys())

        # Get the number of words allocated to each section
        section_num_words_dict = get_section_number_of_words(file_id)

        summary = ""
        total_number_of_words_in_body = 0
        total_number_of_words_in_summary = 0

        print(file_id, relevant_sections, section_num_words_dict)

        if relevant_sections:
            print('Relevant Section Found in ', file_id)

            # Excerpts are collected and joined with a space at the end, so
            # the last word of one section is not glued to the first word of
            # the next one.
            summary_parts = []

            # Iterate through relevant sections
            for section in relevant_sections:
                number_of_words_to_be_extracted = section_num_words_dict[section]

                # Extract the body of the section
                section_body = extract_section_body(file_id, section, dir_, annual_reports_dir, toc_loc_pkl_file_path)
                section_body_split = section_body.split(' ')
                number_of_words_in_body = len(section_body_split)
                total_number_of_words_in_body = total_number_of_words_in_body + number_of_words_in_body

                # Take the allocated number of leading words, or the whole
                # body when it is shorter than the allocation.
                if number_of_words_in_body > number_of_words_to_be_extracted:
                    summary_parts.append(" ".join(section_body_split[:number_of_words_to_be_extracted]))
                    total_number_of_words_in_summary = total_number_of_words_in_summary + number_of_words_to_be_extracted
                else:
                    print(file_id, section, number_of_words_in_body, number_of_words_to_be_extracted)
                    summary_parts.append(" ".join(section_body_split))
                    total_number_of_words_in_summary = total_number_of_words_in_summary + number_of_words_in_body

            summary = " ".join(summary_parts)

            print(file_id, 'number_of_words_in_output_summary', total_number_of_words_in_summary)
            print(file_id, 'number_of_words_in_body', total_number_of_words_in_body)
            print('\n')
        else:
            print('Relevant Section Not Found in ', file_id)

            # Fall back to the report itself when no relevant sections are found
            with open(os.path.join(dir_, annual_reports_dir, file), "r", encoding="utf-8") as report_file:
                summary = report_file.read()
            summary_split = summary.split(' ')
            number_of_words = len(summary_split)

            # Keep only the first 1000 words if the report is longer
            if number_of_words > 1000:
                summary = " ".join(summary_split[:1000])

        # Write the generated summary to a text file
        with open(os.path.join(system_summary_dir, file_id + '_' + team_name + '.txt'), 'w', encoding='utf-8') as f:
            f.write(str(summary))
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(file, e)


In [None]:
# Score the summaries in system_summary_dir against the gold summaries of the
# selected split; this only runs when the validation dataset is selected.
if VALIDATION_DATASET:
    gold_summary_dir_ =  os.path.join(dir_, gold_summary_dir)
    rouge_scores = get_rouge_scores(system_summary_dir, gold_summary_dir_)
    # NOTE(review): a bare expression nested inside `if` is not echoed by the
    # notebook (only a top-level last expression is), so this line shows nothing.
    rouge_scores

In [None]:
# Display the ROUGE scores. `rouge_scores` is only assigned when
# VALIDATION_DATASET is True, so this raises NameError otherwise.
rouge_scores

#### Langchain Implementation

##### Short note
- get_relevant_sections_with_body_len
- get_section_number_of_words
- get_relevant_sections_with_score
- We use all of the above functions to retrieve the parameters that are sent as input to GPT



In [30]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import AzureChatOpenAI

import os
import pandas as pd

In [31]:
# Model selector flags: USE_GPT_4 takes precedence over USE_GPT_3_5_TURBO.
USE_GPT_3_5_TURBO = False
USE_GPT_4 = True

# Access - Config
# SECURITY: the API key must never be written into the notebook. It is read
# from the environment instead; export OPENAI_API_KEY before starting Jupyter.
# Any key that was previously committed in this file should be rotated.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this notebook."
    )
os.environ["OPENAI_API_BASE"] = "https://openai-ss.openai.azure.com/"
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"

# Assign model (temperature=0 is passed to both deployments)
if USE_GPT_4:
    model = AzureChatOpenAI(temperature=0,deployment_name="ss-gpt-32k")
elif USE_GPT_3_5_TURBO:
    model = AzureChatOpenAI(temperature=0,deployment_name="ss-gpt")
else:
    raise Exception('Model not supported.')

model

AzureChatOpenAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, model_name='gpt-3.5-turbo', temperature=0.0, model_kwargs={}, openai_api_key='be51f10009fa41258fcd750a2fba07f2', openai_api_base='https://openai-ss.openai.azure.com/', openai_organization='', openai_proxy='', request_timeout=None, max_retries=6, streaming=False, n=1, max_tokens=None, tiktoken_model_name=None, deployment_name='ss-gpt-32k', openai_api_type='azure', openai_api_version='2023-03-15-preview')

In [32]:
# Updated response schema
# Structured reply expected from the model: a single field named "Summary".
# The parser built from this schema supplies the format instructions for the
# prompt and parses the model's reply.
response_schemas = [
    ResponseSchema(
        name="Summary",
        description="summary of the section name from the section content around specified words",
    ),
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [33]:
# Zero-shot prompt template. Placeholders filled at prompt time:
# {section_name}, {section_content}, {number_of_words}, {format_instructions}.
zero_shot_template = """
The task is to summarize the section "{section_name}" with the following content: "{section_content}"

Instructions:
- Create a summary of approximately {number_of_words} words.
- Ensure that the output language matches the input language.
- The summary should be a coherent and complete paragraph.

Format Instructions:
{format_instructions}
"""

In [34]:
def build_prompt():
    """Assemble the zero-shot chat prompt used for section summarisation.

    The prompt consists of a single human message built from
    ``zero_shot_template``. ``section_name``, ``section_content`` and
    ``number_of_words`` are left as input variables, while
    ``format_instructions`` is pre-filled from ``output_parser``.

    Returns:
        The configured ``ChatPromptTemplate``.
    """
    parser_instructions = output_parser.get_format_instructions()

    print('Zero shot setting')

    human_message = HumanMessagePromptTemplate.from_template(zero_shot_template)
    return ChatPromptTemplate(
        messages=[human_message],
        input_variables=["section_name", "section_content", "number_of_words"],
        partial_variables={"format_instructions": parser_instructions},
    )

In [35]:
# Build the prompt once; it is passed to extract_summary for every section.
prompt = build_prompt()

Zero shot setting


In [36]:
def extract_summary(section_name, section_content, number_of_words, prompt):
    """Ask the model to summarise one section and return the parsed reply.

    Args:
        section_name: Title of the section, inserted into the prompt.
        section_content: Text of the section to summarise.
        number_of_words: Target length requested in the prompt.
        prompt: Prompt object exposing ``format_prompt(...)``.

    Returns:
        Whatever ``output_parser.parse`` produces for the model's reply
        content; callers read the ``'Summary'`` key from it.
    """
    prompt_value = prompt.format_prompt(
        section_name=section_name,
        section_content=section_content,
        number_of_words=number_of_words,
    )

    # Send the rendered messages to the module-level model.
    reply = model(prompt_value.to_messages())

    # Turn the raw reply text into the structured response.
    return output_parser.parse(reply.content)


In [None]:
import pandas as pd

# One record per generated section summary, plus one 'Final Summary' record
# per report; converted into a DataFrame after the loop.
summary_data = []
# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(system_summary_dir, exist_ok=True)
num_files_to_process = 300  # Change this to the desired number of files to process

num_file = 0

# Iterate through files in the annual_reports_dir
for file in os.listdir(os.path.join(dir_, annual_reports_dir)):
    # Skip macOS metadata files *before* any processing or counting (checking
    # afterwards, as was done previously, has no effect).
    if ".DS_Store" in file:
        continue

    if num_file >= num_files_to_process:
        break

    try:
        print("Processing File Number: ", num_file)
        num_file = num_file + 1

        # Extract the file_id and file_name from the filename
        file_id = file.split('.')[0]
        file_name = file

        # Get relevant sections and their scores for the file
        relevant_sections_with_score = get_relevant_sections_with_score(file_id)

        # Section order is maintained
        relevant_sections = list(relevant_sections_with_score.keys())

        # Get the number of words allocated to each section
        section_num_words_dict = get_section_number_of_words(file_id)

        # Reset per report. Previously `summary` was never initialised here,
        # which raised NameError on a fresh kernel or carried the previous
        # report's text into this one.
        summary = ""

        if relevant_sections:
            # Section summaries are collected and joined with a space so that
            # adjacent summaries are not glued together.
            section_summaries = []

            # Iterate through relevant sections
            for section in relevant_sections:
                number_of_words_to_be_extracted = section_num_words_dict[section]
                section_body = extract_section_body(file_id, section, dir_, annual_reports_dir, toc_loc_pkl_file_path)

                summary_resp = extract_summary(section, section_body, number_of_words_to_be_extracted, prompt)
                section_summaries.append(summary_resp['Summary'])

                # Record the per-section result
                summary_data.append({
                    'file_id': file_id,
                    'file_name': file_name,
                    'narrative_section_name': section,
                    'narrative_section_body': section_body,
                    'num_words_to_be_extracted': number_of_words_to_be_extracted,
                    'summary_generated': summary_resp['Summary']
                })

            summary = " ".join(section_summaries)
        else:
            print('Relevant Section Not Found in ', file_id)

            # Fall back to the report itself when no relevant sections are found
            with open(os.path.join(dir_, annual_reports_dir, file), "r", encoding="utf-8") as report_file:
                summary = report_file.read()
            summary_split = summary.split(' ')
            number_of_words = len(summary_split)

            # Keep only the first 1000 words if the report is longer
            if number_of_words > 1000:
                summary = " ".join(summary_split[:1000])

            # Record the fallback result
            summary_data.append({
                'file_id': file_id,
                'file_name': file_name,
                'narrative_section_name': '',
                'narrative_section_body': '',
                'num_words_to_be_extracted': '',
                'summary_generated': summary
            })

        # Write the final summary of the report to its output file
        with open(os.path.join(system_summary_dir, file_id + '_' + team_name + '.txt'), 'w', encoding='utf-8') as f:
            f.write(str(summary))

        # Only recorded after a successful write, and only when the summary
        # was assembled from section summaries.
        if relevant_sections:
            summary_data.append({
                'file_id': file_id,
                'file_name': file_name,
                'narrative_section_name': 'Final Summary',
                'narrative_section_body': '',
                'num_words_to_be_extracted': '',
                'summary_generated': summary
            })
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(file, e)


summary_df = pd.DataFrame(summary_data)

summary_df

Processing File Number:  0


In [46]:
# Score the summaries in system_summary_dir against the gold summaries of the
# selected split and print the result; this only runs when the validation
# dataset is selected.
if VALIDATION_DATASET:
    gold_summary_dir_ =  os.path.join(dir_, gold_summary_dir)
    rouge_scores = get_rouge_scores(system_summary_dir, gold_summary_dir_)
    print(rouge_scores)

Processing File Number:  0
Number of files processed:  2
{'rouge-1': {'p': 0.21146759305789697, 'r': 0.699810686914558, 'f': 0.3241954142965857}, 'rouge-2': {'p': 0.07484537285120137, 'r': 0.2416174894642381, 'f': 0.11410563001194785}}
