### Script to generate fine-tuning data using the MCS similarity method and rhetorical-role segmentation

Change the `dataset` variable below according to your requirements.

In [None]:
# Dataset selector: passed to get_summary_data_rhet_train and embedded in the output file name.
dataset = "IN" # Options: IN, UK 

In [None]:
import pandas as pd
import numpy as np
import glob
import os
import sys
from tqdm import tqdm
sys.path.insert(0, '../')
from utilities import *
# os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [None]:
# Load the names, source documents and summaries for the selected dataset
names, data_source, data_summary = get_summary_data_rhet_train(dataset)
# Sanity check: print the size of each returned collection, one per line
print(len(names), len(data_source), len(data_summary), sep="\n")


In [None]:
def get_doc_sens_and_labels(doc):
    '''
    function to parse rhetorical role labeled documents

    Every line of the document is expected to hold a sentence and its label
    separated by a TAB character. Lines without a TAB are skipped entirely, so
    the returned sentence list, label list and dictionary always stay aligned.
    Non-blank malformed lines are printed to make them easy to spot.

    input: doc - Input document
    output: list of sentences, list of labels, dictionary for every sentence and label
    '''
    sents = []
    labels = []
    dict_sents_labels = {}
    for line in doc.split("\n"):
        fields = line.split("\t")
        if len(fields) < 2:
            # No label on this line: do not record the sentence either,
            # otherwise it would have no entry in labels / dict_sents_labels.
            if line.strip():
                print(line)
            continue
        sentence, label = fields[0], fields[1]
        sents.append(sentence)
        labels.append(label)
        # A repeated sentence keeps the label of its last occurrence.
        dict_sents_labels[sentence] = label
    return sents, labels, dict_sents_labels

In [None]:
def nest_sentencesV2(document_sents,chunk_length):
    '''
    function to chunk the document using document sentences

    Sentences are packed greedily, in order, into chunks whose total word
    count (words are counted by splitting on a single space) stays below
    chunk_length. A single sentence that alone reaches chunk_length still
    gets a chunk of its own. No empty chunk is ever returned.

    input:  document_sents - Input document sentence
            chunk_length - chunk length
    output: list of chunks
    '''
    nested = []
    current = []
    length = 0

    for sentence in document_sents:
        n_words = len(sentence.split(" "))
        # Close the running chunk before it would reach the limit; the
        # "current" check avoids emitting an empty chunk.
        if current and length + n_words >= chunk_length:
            nested.append(current)
            current = []
            length = 0
        current.append(sentence)
        # Count the sentence that opens a new chunk as well, so the running
        # total always reflects the words actually in the chunk.
        length += n_words
    if current:
        nested.append(current)
    return nested

In [None]:
# Loading the similarity Model 
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Keep using the second GPU when the machine has one, but fall back to the
# first GPU or the CPU instead of failing on hosts with fewer devices.
if torch.cuda.device_count() > 1:
    sbert_device = "cuda:1"
elif torch.cuda.is_available():
    sbert_device = "cuda:0"
else:
    sbert_device = "cpu"

# Passing the device to the constructor places the model there directly.
sbert_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens', device=sbert_device)

In [None]:
def similarity_l_l(l1, l2):
    '''
    Function to find the most similar sentence in the document for each sentence in the summary 

    Both lists are embedded with the module-level sbert_model in one call and
    only the summary-vs-document block of cosine similarities is computed.

    input:  l1 - Summary sentences
            l2 - Document sentences
    returns a list of document sentence indexes (positions in l2) for each sentence in the summary 
    raises ValueError if l1 is non-empty but l2 is empty
    '''
    if not l1:
        # Nothing to match; skip the model call.
        return []
    if not l2:
        raise ValueError("similarity_l_l: no document sentences to match against")

    n_summary = len(l1)
    embeddings = sbert_model.encode(l1 + l2)
    # Rows are summary sentences, columns are document sentences. This avoids
    # building the full (len(l1)+len(l2))^2 matrix.
    similarities = cosine_similarity(embeddings[:n_summary], embeddings[n_summary:])
    return list(np.argmax(similarities, axis=1))

In [None]:
def nest_sentencesV3(doc_sents,chunk_length, dict_sents_labels):
    '''
    function to first segment the document using rhetorical roles and then chunk if required

    Sentences are grouped by their label, every group is chunked with
    nest_sentencesV2, and each resulting chunk is prefixed with a "<label>"
    marker. Labels are processed in the order they first appear in
    dict_sents_labels, so the output order is the same on every run.
    Empty sentences and sentences without an entry in dict_sents_labels are
    skipped.

    input:  doc_sents           - Input document sentence
            chunk_length        - chunk length
            dict_sents_labels   - dictionary for every sentence and label
    output: list of chunks
    '''
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # would make the chunk order depend on string hashing.
    ordered_labels = list(dict.fromkeys(dict_sents_labels.values()))

    # Single pass over the document instead of one scan per label.
    sents_by_label = {label: [] for label in ordered_labels}
    for sent in doc_sents:
        if sent == '':
            continue
        if sent not in dict_sents_labels:
            # Unlabelled sentence: it cannot be assigned to any segment.
            continue
        sents_by_label[dict_sents_labels[sent]].append(sent)

    all_chunks = []
    for label in ordered_labels:
        for chunk in nest_sentencesV2(sents_by_label[label], chunk_length):
            all_chunks.append(["<" + label + ">"] + chunk)

    return all_chunks


In [None]:
# Minimum number of space-separated words a chunk summary must have to be kept
chunk_summ_word_threshold = 100

def get_chunks_data_from_docV2(doc, summ, chunk_length=1024):
    '''
    Function to generate chunks along with their summaries 

    Every summary sentence is assigned to its most similar document sentence
    (similarity_l_l). The document is then segmented by label and chunked
    (nest_sentencesV3); the summary of a chunk is the concatenation of all
    summary sentences assigned to the sentences of that chunk. Chunks whose
    summary has fewer than chunk_summ_word_threshold words are dropped.

    input:  doc - legal Document
            summ - Gold standard summary
            chunk_length - chunk length passed to nest_sentencesV3 (default 1024)
    returns a list of chunks and their summaries 
    '''
    doc_sents, _, dict_sents_labels = get_doc_sens_and_labels(doc)
    summ_sents = split_to_sentences(summ)

    # Nothing can be aligned if either side is empty.
    if len(doc_sents) == 0 or len(summ_sents) == 0:
        return [], []

    result = similarity_l_l(summ_sents, doc_sents)

    # Several summary sentences may pick the same document sentence; keep all
    # of them (in summary order) instead of letting the last one overwrite
    # the others.
    sentence_mapping = {}
    for summ_sent, doc_idx in zip(summ_sents, result):
        sentence_mapping.setdefault(doc_sents[doc_idx], []).append(summ_sent)

    final_chunks = []
    final_summ = []
    for chunk in nest_sentencesV3(doc_sents, chunk_length, dict_sents_labels):
        summ_parts = []
        for chunk_sent in chunk:
            summ_parts.extend(sentence_mapping.get(chunk_sent, ()))
        # NOTE(review): sentences are concatenated without a separator, as in
        # the original code - confirm split_to_sentences keeps the spacing.
        chunk_summ = "".join(summ_parts)
        if len(chunk_summ.split(" ")) >= chunk_summ_word_threshold:
            final_chunks.append(" ".join(chunk))
            final_summ.append(chunk_summ)
    return final_chunks, final_summ

# cks2, summs2 = get_chunks_data_from_docV2(data_source[0],data_summary[0])


In [None]:
#loop to pass every document, generate the fine tuning data and save it in an excel file 
import pandas as pd

# The periodic backups and the final result are written to the same file.
output_path = "FD_"+ dataset + "_RR_MCS_7030_BK.xlsx"


def _save_training_data(chunks, summaries, path):
    '''
    Write the (chunk, summary) pairs collected so far to an excel file
    input:  chunks - list of chunk texts
            summaries - list of chunk summaries, aligned with chunks
            path - output file path
    returns the DataFrame that was written
    '''
    frame = pd.DataFrame(list(zip(chunks, summaries)), columns=['data', 'summary'])
    frame.to_excel(path)
    return frame


training_chunks = []
training_summs = []
# Documents are addressed by position, summaries by document name.
for i in tqdm(range(len(data_source))):
    cks, summs = get_chunks_data_from_docV2(data_source[i],data_summary[names[i]])
    # extend() grows the lists in place instead of rebuilding them each time.
    training_chunks.extend(cks)
    training_summs.extend(summs)
    if i%100 == 0:
        # Backup every 100 documents so a crash does not lose everything.
        df = _save_training_data(training_chunks, training_summs, output_path)
df = _save_training_data(training_chunks, training_summs, output_path)