### Section bodies combined in relevance-score order, truncated to the top 1000 words

#### Steps:

1. Get the relevant sections and their position in the report
2. Order the sections by relevance score
3. Extract the body of each section and concatenate the bodies in that order
4. Extract top 1000 words
5. If there are no relevant sections, use the top 1000 words of the complete report as the summary
6. Compare the system summary with each gold summary and compute the ROUGE score. Take the average over all gold summaries for the respective file.
7. Take the average of the scores over all files.

In [1]:
import pandas as pd
import pickle
import os

from extract_section_body import extract_section_body
from rouge_evaluation import get_rouge_scores

In [2]:
# Dataset split selection. These flags are read by the path-configuration cell
# and by the ROUGE evaluation cell; set exactly one of them to True.
VALIDATION_DATASET = False
TEST_DATASET = True

In [3]:
# Exactly one split must be selected. With neither flag set, the path variables
# below would never be assigned (NameError later on); with both set, the test
# paths would silently replace the validation ones while the evaluation cell
# still ran as if on validation data.
if bool(VALIDATION_DATASET) == bool(TEST_DATASET):
    raise ValueError(
        "Set exactly one of VALIDATION_DATASET / TEST_DATASET to True."
    )

if VALIDATION_DATASET:
    dir_ = '../../../Dataset/FNS2022/English/validation/'
    toc_loc_pkl_file_path = '../../../Dataset/Annotated_Dataset/valid_toc_loc.pkl'
    df_predicted_path = '../../FNP2022/2_Section_Classification/out/validation_df_predicted.pkl'
else:
    dir_ = '../../../Dataset/FNS2022/English/testing/'
    toc_loc_pkl_file_path = '../../../Dataset/Annotated_Dataset/test_toc_loc.pkl'
    df_predicted_path = '../../FNP2022/2_Section_Classification/out/test_df_predicted.pkl'

# Sub-directory names inside `dir_`, the output directory for the generated
# summaries, and the team name used as the suffix of each summary file name.
annual_reports_dir = "annual_reports"
gold_summary_dir = "gold_summaries"
system_summary_dir = 'section_combined_by_score_top_1000'
team_name = 'SSC_AI_RG'
dir_

'../../../Dataset/FNS2022/English/testing/'

In [4]:
# Load the per-section predictions. The `with` block guarantees the file handle
# is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only point
# `df_predicted_path` at a file you produced yourself (presumably the output of
# the section-classification step -- confirm).
with open(df_predicted_path, 'rb') as predictions_file:
    df_predicted = pickle.load(predictions_file)
df_predicted

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned,pred,False,True
0,25082,Highlights 2011,45,37,0,highlight,1,0.005472,0.994528
1,25082,Impax Facts,82,57,0,impax fact,0,0.754336,0.245664
2,25082,Chairman’s Statement,139,273,0,chairman statement,1,0.018617,0.981383
3,25082,Our history,412,41,0,histori,0,0.766789,0.233211
4,25082,Chief Executive’s Report,453,371,0,chief execut report,1,0.041504,0.958496
...,...,...,...,...,...,...,...,...,...
12348,30829,Notes to the Company financial statements,10454,348,0,note compani financi statement,0,0.996484,0.003516
12349,30829,Glossary,10802,70,0,glossari,0,0.937217,0.062783
12350,30829,Five year record,10872,42,0,five year record,0,0.950753,0.049247
12351,30829,Shareholder services,10914,42,0,sharehold servic,0,0.907658,0.092342


In [5]:
def get_relevant_sections_with_score(file_id):
    """Return the sections predicted as relevant for one report, best score first.

    Reads the module-level ``df_predicted`` DataFrame and keeps the rows whose
    ``file_id`` equals ``int(file_id)`` and whose ``pred`` equals 1.

    Args:
        file_id: Report identifier; it is converted with ``int()``, so a
            non-numeric value raises ``ValueError``.

    Returns:
        A tuple ``(sections, score_by_section)`` where ``score_by_section`` maps
        each ``toc_section`` title to its value in the ``'True'`` column, in
        descending score order, and ``sections`` is the list of its keys in the
        same order. When a title occurs more than once, the score of its last
        occurrence is the one kept.
    """
    is_selected = (df_predicted.file_id == int(file_id)) & (df_predicted.pred == 1)
    selected_rows = df_predicted[is_selected]

    # dict(zip(...)) collapses repeated titles: the last score wins.
    unordered_scores = dict(
        zip(selected_rows['toc_section'].tolist(), selected_rows['True'].tolist())
    )

    # sorted() is stable, so equal scores keep their original relative order.
    ranked_pairs = sorted(unordered_scores.items(), key=lambda pair: pair[1], reverse=True)
    score_by_section = dict(ranked_pairs)
    return list(score_by_section), score_by_section

In [6]:
# Maximum number of tokens kept in each system summary.
MAX_SUMMARY_WORDS = 1000


def _truncate_to_words(text, max_words=MAX_SUMMARY_WORDS):
    """Return `text` cut down to its first `max_words` space-separated tokens.

    Tokens are obtained with ``text.split(' ')``, i.e. only the single space
    character separates tokens; a leading space or a run of spaces therefore
    yields empty tokens that count towards the limit. Text with at most
    `max_words` tokens is returned unchanged.
    """
    tokens = text.split(' ')
    if len(tokens) > max_words:
        return " ".join(tokens[:max_words])
    return text


num_file = 0
# exist_ok=True keeps this cell re-runnable: without it a second run fails with
# FileExistsError because the output directory is already there.
os.makedirs(system_summary_dir, exist_ok=True)
for file in os.listdir(os.path.join(dir_, annual_reports_dir)):
    # Skip macOS metadata files before any processing; they have no numeric
    # file id and would otherwise only surface as a printed conversion error.
    if ".DS_Store" in file:
        continue
    try:
        num_file = num_file + 1
        file_id = file.split('.')[0]
        # Sections come back ordered by descending relevance score, not by
        # their position in the report.
        relevant_sections, section_scores = get_relevant_sections_with_score(file_id)
        if relevant_sections:
            # Concatenate the section bodies in score order; every body is
            # preceded by a single space.
            summary = "".join(
                " " + extract_section_body(file_id, section, dir_, annual_reports_dir, toc_loc_pkl_file_path)
                for section in relevant_sections
            )
        else:
            # No section was predicted relevant: fall back to the full report.
            with open(os.path.join(dir_, annual_reports_dir, file), "r", encoding="utf-8") as report:
                summary = report.read()

        summary = _truncate_to_words(summary)

        with open(os.path.join(system_summary_dir, file_id+'_'+team_name+'.txt'), 'w', encoding='utf-8') as f:
            f.write(str(summary))
    except Exception as e:
        # Best effort: report the failing file and carry on with the others.
        print(file, e)

In [7]:
# Always bind `rouge_scores` so that later cells do not raise NameError when the
# evaluation is skipped; it stays None unless the validation split is selected.
rouge_scores = None
if VALIDATION_DATASET:
    # The gold summaries are read from `gold_summary_dir` inside the dataset directory.
    gold_summary_dir_ = os.path.join(dir_, gold_summary_dir)
    rouge_scores = get_rouge_scores(system_summary_dir, gold_summary_dir_)

Processing File Number:  0
Processing File Number:  50
Processing File Number:  100
Processing File Number:  150
Processing File Number:  200
Processing File Number:  250
Processing File Number:  300
Processing File Number:  350
Number of files processed:  363


In [8]:
# Display the ROUGE scores returned by get_rouge_scores; they are only computed
# by the evaluation cell when VALIDATION_DATASET is True.
rouge_scores

{'rouge-1': {'p': 0.49870924674830447,
  'r': 0.541012062251896,
  'f': 0.4777744701811268},
 'rouge-2': {'p': 0.2968354194694265,
  'r': 0.31287322189947175,
  'f': 0.2812818094019499}}