### Create dataset on some subsets of DRS data for haystack.deepset.ai
Per https://haystack.deepset.ai/tutorials/08_preprocessing, Haystack expects data to be provided as a list of documents in the following dictionary format. See also https://docs.haystack.deepset.ai/docs/document_store
We also need to store the answer and the title.

In [1]:
docs = [
    {
        'content': DOCUMENT_TEXT_HERE,
        'meta': {'name': DOCUMENT_NAME, ...}
    }, ...
]

SyntaxError: ':' expected after dictionary key (1725972168.py, line 4)

### Import DRS metadata

In [1]:
import pandas as pd
import numpy as np
import os
import glob

# Mapping between DRS document types and their metadata fields (from https://drs.faa.gov/help/helpdetails)
mapping_path = os.path.join(os.getcwd(), 'data', 'DRS', 'index', 'FAA DRS',
                            'DRS Document Types Metadata Mapping.xlsx')
metadata_mapping = pd.read_excel(mapping_path)

# Keep only real document type names (non-string cells such as NaN are dropped).
type_column = metadata_mapping["Document Type Name in API request"]
doc_types = [name for name in pd.unique(type_column) if isinstance(name, str)]

# Metadata fields shared by every document type: running intersection of the
# per-type field sets. Note the trailing space in the response column name.
shared_fields = None
for doc_type in doc_types:
    fields_for_type = set(metadata_mapping.loc[type_column == doc_type, "Metadata Name in API Response "])
    shared_fields = fields_for_type if shared_fields is None else fields_for_type & shared_fields

common_fields = [] if shared_fields is None else list(shared_fields)

Subset of document types for which it was possible to split the documents into paragraphs:

In [2]:
# Document types processed by the rest of the notebook (this overrides the
# list derived from the metadata mapping).
doc_types = [
    'AC',
    'ORDER_8300.10',
    'ORDER_8400.10',
    'ORDER_8700.1',
    'ORDER_8740.1',
    'ORDER_8900.1',
    'ORDERS',
]

In [3]:
# Load, for each document type, the most recently created metadata index
# (files named "<doc_type>_202*.parquet") and tag its rows with the type.
# Frames are collected first and concatenated once: calling pd.concat inside
# the loop re-copies everything accumulated so far on every iteration.
drsmeta_frames = []

for doc_type in doc_types:
    parquet_pattern = os.path.join(os.getcwd(), 'data', 'DRS', 'index', doc_type+'_202*.parquet')
    parquet_files = sorted(glob.glob(parquet_pattern),
                           key=os.path.getctime,
                           reverse=True)

    # Document types without any index file are skipped.
    if len(parquet_files) == 0:
        continue

    this_parquet = pd.read_parquet(parquet_files[0])
    this_parquet.insert(0,
                        "doc_type",
                        value=pd.Series(data=doc_type, index=this_parquet.index, dtype="string"))
    drsmeta_frames.append(this_parquet)

# Later document types come first, and the result stays None when no index
# file was found at all.
df_drsmeta = pd.concat(drsmeta_frames[::-1]) if drsmeta_frames else None

### Import DRS or non-DRS pre-dataset

In [4]:
# Corpus to process. The value is used as a folder name under data/ when the
# cells below locate the pre-dataset and the PDFs and when they write the output.
source_type = 'DRS'
# Alternative corpus (uncomment to switch):
# source_type = 'NotInDRS'

In [12]:
import pandas as pd
import glob
import os

# Pick the most recently created pre-dataset (extracted bookmark/title table)
# for the selected source type.
extracted_text_dir = os.path.join(os.getcwd(), 'data', source_type, 'dataset', 'extracted_text')
predatasets = sorted(glob.glob(os.path.join(extracted_text_dir, '*.parquet')),
                     key=os.path.getctime,
                     reverse=True)

# Fail with an explicit message instead of a bare IndexError on predatasets[0].
if len(predatasets) == 0:
    raise FileNotFoundError("No pre-dataset parquet file found in " + extracted_text_dir)

print(predatasets[0])
df_predataset = pd.read_parquet(predatasets[0])

df_predataset.tail()

c:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\DRS\dataset\extracted_text\extracted_text_for_AC-ORDER_8300.10-ORDER_8400.10-ORDER_8700.1-ORDER_8740.1-ORDER_8900.1-ORDERS-20240116 - 100107.parquet


Unnamed: 0,documentGuid,position in bookmark tree,original pdf bookmark title (outline.title),original pdf bookmark page number,corrected pdf bookmark page number,extracted title number,extracted title text,first 20 characters after title,nb_keywords for title search
162603,8DE31491B04A26B886257F410064242C,12>9,Other comments:,9,9,e,ther comments:,I would like to discuss the a,5
162604,8DE31491B04A26B886257F410064242C,12>10,I would like to discuss the above. Please cont...,9,9,I,would like to discuss the above.,Please contact me. Submitted,5
162605,8DE31491B04A26B886257F410064242C,12>11,Submitted by: Date:,9,9,e,ubmitted by:,____________________________,2
162606,8DE31491B04A26B886257F410064242C,12>12,FTS Telephone Number: Routing Symbol:,9,9,T,TS Telephone Number:,_____________________ Routi,3
162607,8DE31491B04A26B886257F410064242C,12>13,FAA Form 1320-19 (8-89),9,9,A,AA Form 1,320-19 (8-89),5


### Create dataset that is haystack.deepset friendly

In [13]:
# Attach the DRS metadata to every bookmark row (rows without metadata are
# dropped by the inner join), then discard columns that are entirely empty.
df_dataset = (
    df_predataset
    .merge(df_drsmeta, how='inner', on='documentGuid')
    .dropna(axis=1, how='all')
)

# Empty string columns filled in later; each is inserted at position 0, so
# 'parents_title_list' ends up first and 'content' second.
for new_column in ('content', 'parents_title_list'):
    df_dataset.insert(loc=0,
                      column=new_column,
                      value=pd.Series([''] * len(df_dataset), dtype="string"))

In [14]:
import PyPDF2
import glob
import re
import logging
import datetime

# Write INFO-and-above records to 94_make_datasets.log (UTF-8), each line prefixed with a timestamp.
logging.basicConfig(filename='94_make_datasets.log', encoding='utf-8',format='%(asctime)s %(message)s', level=logging.INFO)

def get_parents_title_list(position_in_bookmark_tree, df_dataset):
    """Return the bookmark titles on the path from the root to a bookmark.

    Args:
        position_in_bookmark_tree: ">"-separated path such as "12>9".
        df_dataset: frame with the columns "position in bookmark tree" and
            "original pdf bookmark title (outline.title)".

    Returns:
        ``str()`` of the list of titles, outermost ancestor first and the
        bookmark itself last.

    Raises:
        IndexError: if one of the path prefixes has no matching row.

    The lookup is made over the whole frame that is passed in and keeps the
    first matching row, so a frame holding several documents can yield a title
    from another document.
    """
    path_levels = position_in_bookmark_tree.split(">")
    positions = df_dataset["position in bookmark tree"]
    titles = []

    for depth, _ in enumerate(path_levels, start=1):
        ancestor_position = ">".join(path_levels[:depth])
        first_match = df_dataset[positions == ancestor_position].index[0]
        titles.append(df_dataset.loc[first_match, "original pdf bookmark title (outline.title)"])

    return str(titles)


def get_textcontent(pages, index, df_dataset):
    """Extract, page by page, the text of the section described by row ``index``.

    The section runs from the page of row ``index`` to the page of the next
    row. On the first page the text is cut at the first occurrence of the row's
    'first 20 characters after title'; on the last page it is cut just before
    the next row's title (title number, up to 5 arbitrary characters, title
    text). When a marker cannot be located the problem is logged and the page
    is kept uncut on that side.

    Args:
        pages: sequence indexed by page number whose items provide
            ``extract_text()`` (called with ``PyPDF2.PdfReader(...).pages``).
        index: label of the row in ``df_dataset``; ``index + 1`` must be the
            label of the following row.
        df_dataset: frame holding the bookmark rows.

    Returns:
        A list with one string per page from the first to the last page
        (empty when the last page precedes the first one).

    Raises:
        ValueError: if neither the next row nor any of the rows up to 40
            positions after ``index`` has a usable page number.
    """
    page_column = 'corrected pdf bookmark page number'
    first_page = int(df_dataset.loc[index, page_column])

    try:
        last_page = int(df_dataset.loc[index+1, page_column])
    except Exception as e:
        # Degraded mode: the next row is missing or has no usable page number,
        # so use the FIRST usable one among rows index+2 .. index+40 and stop
        # there (the search used to keep going and overwrite last_page).
        last_page = None
        for offset in range(2, 41):
            try:
                last_page = int(df_dataset.loc[index+offset, page_column])
            except (KeyError, TypeError, ValueError, OverflowError):
                continue
            # %-style arguments keep logging safe when a value is not a string.
            logging.info("Degraded mode for last_page, use different next index %s"
                         "\ndocumentGuid=%s"
                         "\nposition in bookmark tree=%s"
                         "\ntitle=%s",
                         e,
                         df_dataset.loc[index+offset, "documentGuid"],
                         df_dataset.loc[index+offset, "position in bookmark tree"],
                         df_dataset.loc[index+offset, "original pdf bookmark title (outline.title)"])
            break

        if last_page is None:
            raise ValueError("No usable '" + page_column +
                             "' found in the 40 rows following index " + str(index)) from e

    textcontent = []

    for n in range(first_page, last_page+1):
        # Extract the page text once; it is needed for both searches and the slice.
        page_text = pages[n].extract_text()
        start_textcontent = 0
        end_textcontent = None

        if n==first_page:
            try:
                re_search = re.search(re.escape(df_dataset.loc[index, 'first 20 characters after title']),
                                      page_text)
                start_textcontent = re_search.span()[0]
            except Exception as e:
                logging.info("Error identifying start_textcontent with %s"
                             "\ndocumentGuid=%s"
                             "\nfirst_page=%s"
                             "\nposition in bookmark tree=%s"
                             "\ntitle=%s",
                             e,
                             df_dataset.loc[index, "documentGuid"],
                             first_page,
                             df_dataset.loc[index, "position in bookmark tree"],
                             df_dataset.loc[index, "original pdf bookmark title (outline.title)"])
                start_textcontent = 0

        if n==last_page:
            try:
                # The end marker is always the title of row index+1, including
                # in degraded mode.
                re_search = re.search(re.escape(df_dataset.loc[index+1, 'extracted title number'])+
                                      '.{0,5}'+
                                      re.escape(df_dataset.loc[index+1, 'extracted title text']),
                                      page_text)
                end_textcontent = re_search.span()[0]
            except Exception as e:
                logging.info("Error identifying end_textcontent with %s"
                             "\ndocumentGuid=%s"
                             "\nlast_page=%s"
                             "\nposition in bookmark tree=%s"
                             "\ntitle=%s",
                             e,
                             df_dataset.loc[index, "documentGuid"],
                             last_page,
                             df_dataset.loc[index, "position in bookmark tree"],
                             df_dataset.loc[index, "original pdf bookmark title (outline.title)"])
                end_textcontent = None

        textcontent.append(page_text[start_textcontent:end_textcontent])

    return textcontent


# Fill 'parents_title_list' and 'content' for every bookmark row whose PDF is
# available locally and whose document status is 'Current'.

# Number of rows between two intermediate snapshots of df_dataset
# (a snapshot is also written at index 0).
CHECKPOINT_EVERY = 50000

output_dir = os.path.join(os.getcwd(), 'data', source_type)

# Per-document cache: the PDF is located once per (doc_type, documentGuid) and
# parsed/filtered only when needed, instead of once per bookmark row.
current_document = None
matching_file = []
pdf = None
df_dataset_filtered = None

for index, row in df_dataset.iterrows():
    document_key = (row["doc_type"], row["documentGuid"])
    if document_key != current_document:
        current_document = document_key
        matching_file = glob.glob(os.path.join(output_dir, 'raw data', row["doc_type"], 'pdf','*'+row['documentGuid']+'*.pdf'))
        pdf = None
        df_dataset_filtered = None

    if (len(matching_file)>0) and (row['drs:status']=='Current'):
        if pdf is None:
            pdf = PyPDF2.PdfReader(matching_file[0])
            df_dataset_filtered = df_dataset[df_dataset["documentGuid"]==row["documentGuid"]]

        try:
            # Ancestors are looked up among the rows of THIS document only:
            # bookmark positions such as "1>2" repeat from one document to the
            # next, so searching the whole frame can return another document's title.
            df_dataset.at[index, 'parents_title_list'] = get_parents_title_list(row["position in bookmark tree"],
                                                                                 df_dataset_filtered)
            # NOTE(review): get_textcontent reads row index+1 of the full frame,
            # which for the last bookmark of a document belongs to the next
            # document - confirm this is acceptable.
            # NOTE(review): get_textcontent returns a list of page strings while
            # 'content' was created with dtype "string" - confirm this
            # assignment stores what is intended.
            df_dataset.at[index, 'content'] = get_textcontent(pdf.pages,
                                                              index,
                                                              df_dataset)
        except Exception as e:
            # %-style arguments: concatenating e.args[0] raised inside this
            # handler (and aborted the whole run) whenever it was not a string.
            logging.info("Error with %s"
                         "\ndocumentGuid=%s"
                         "\nposition in bookmark tree=%s"
                         "\ntitle=%s",
                         e,
                         df_dataset.loc[index, "documentGuid"],
                         df_dataset.loc[index, "position in bookmark tree"],
                         df_dataset.loc[index, "original pdf bookmark title (outline.title)"])

    # Periodic snapshot so that a long run can be inspected or resumed.
    if index % CHECKPOINT_EVERY==0:
        df_dataset.to_parquet(os.path.join(output_dir,
                                           'dataset (partial'+str(index)+')_for_'+'-'.join(doc_types)+'-'+datetime.datetime.now().strftime("%Y%m%d - %H%M%S")+'.parquet'))


# Final, timestamped dataset.
df_dataset.to_parquet(os.path.join(output_dir,
                                   'dataset_for_'+'-'.join(doc_types)+'-'+datetime.datetime.now().strftime("%Y%m%d - %H%M%S")+'.parquet'))

  if _pandas_api.is_sparse(col):


KeyboardInterrupt: 

### Some metrics
Total number of documents, number of documents per entry, histogram of the total number of words, histogram of documents per document type, number of documents not downloaded.

### Try to find patterns in df_dataset for debugging/improving the 92_split functions