In [3]:
from py_pdf_parser.loaders import load_file
from PyPDF2 import PdfReader
from dotenv import load_dotenv
import os
import re
import json
import csv
import camelot
from llama_parse import LlamaParse
import nest_asyncio
nest_asyncio.apply()


class TextExtractor:
    def __init__(self, config):
        self.config = config
        self.output_folder = config.get('output_folder', 'output')

    def get_text_parsing_module(self):
        if self.config['text_parsing'] == 'pypdfparser':
            return self.parse_text_with_pypdfparser
        elif self.config['text_parsing'] == 'pypdf2':
            return self.parse_text_with_pypdf2

    def parse_text_with_pypdfparser(self, file_path):
        try:
            FONT_MAPPING = {
                r"\w{6}\+TrebuchetMS-Bold,18\.0": "title",
                r"\w{6}\+TrebuchetMS-Bold,18\.0": "title",
                r"\w{6}\+Calibri-Bold,24\.0": "title",
                r"\w{6}\+Calibri-Bold,28\.0": "title",
                r"\w{6}\+Calibri-Bold,13\.9": "subheading",
                r"\w{6}\+Calibri-Bold,14\.0": "subheading",
                r"\w{6}\+Calibri-Bold,16\.0": "subheading",
                r"\w{6}\+Calibri-Bold,16\.6": "subheading",
                r"\w{6}\+Calibri-Bold,12\.0": "nested_subheading",
                r"\w{6}\+Calibri-Bold,13\.0": "nested_subheading",
                r"\w{6}\+Calibri,12\.0": "content",
                r"\w{6}\+Calibri-BoldItalic,12\.0": "content",
                r"\w{6}\+Calibri-Italic,12\.0": "content",
                r"\w{6}\+Calibri-Italic,11\.0": "content",
                r"ArialMT,8\.0": "ignored"
            }

            filenames = os.listdir(file_path)
            for filename in filenames:
                x = file_path + "/"+filename
                document = load_file(
                    x, font_mapping=FONT_MAPPING, font_mapping_is_regex=True)
                data = {}
                title_policy = []
                current_subheading = None
                current_nested_subheading = None

                for element in document.elements:
                    try:
                        if element.filter_by_font("ignored"):
                            continue
                        elif element.filter_by_font("title"):
                            title_policy.append(element.text().strip())
                        elif element.filter_by_font("subheading"):
                            current_subheading = element.text().strip()
                            current_nested_subheading = None
                            data[current_subheading] = {
                                "content": "", "nested": {}}
                        elif element.filter_by_font("nested_subheading"):
                            if current_subheading:
                                current_nested_subheading = element.text().strip()
                                data[current_subheading]["nested"][current_nested_subheading] = ""
                        elif element.filter_by_font("content"):
                            if current_nested_subheading:
                                data[current_subheading]["nested"][current_nested_subheading] += " " + \
                                    element.text().strip()
                            elif current_subheading:
                                data[current_subheading]["content"] += " " + \
                                    element.text().strip()
                    except Exception as e:
                        print(f"Skipping element due to exception: {str(e)}")
                output_data = {
                    "title_policy": title_policy,
                    "content": data
                }
                os.makedirs(self.output_folder, exist_ok=True)
                output_file_path = os.path.join(
                    self.output_folder, f"{os.path.basename(file_path)}.json")

                with open(output_file_path, "w") as json_file:
                    json.dump(output_data, json_file, indent=4)

                return f"File saved successfully at {output_file_path}"

        except Exception as e:
            print(f"Error : {str(e)}")
            return None

    def parse_text_with_pypdf2(self, file_path):
        try:
            os.makedirs(self.output_folder, exist_ok=True)

            filenames = os.listdir(file_path)
            for filename in filenames:
                input_file_path = os.path.join(file_path, filename)
                document = PdfReader(input_file_path)
                num_pages = len(document.pages)

                output_file_name = f"{os.path.splitext(filename)[0]}.txt"
                output_file_path = os.path.join(
                    self.output_folder, output_file_name)

                with open(output_file_path, "w", encoding='utf-8') as output_file:
                    text = ''
                    for page in document.pages:
                        text += page.extract_text()
                    output_file.write(text)

            return f"All files processed and saved in {self.output_folder}"
        except Exception as e:
            print(f"Error processing files with PyPDF2: {str(e)}")
            return None


class TableExtractor:
    def __init__(self, config):
        self.config = config
        self.output_folder = config.get('output_folder', 'output_tables')
        load_dotenv()

    def get_table_parsing_module(self):
        if self.config['table_parsing'] == 'camelot':
            return self.parse_tables_with_camelot
        elif self.config['table_parsing'] == 'llm':
            return self.parse_tables_with_llm

    def parse_tables_with_camelot(self, file_path):
        os.makedirs(self.output_folder, exist_ok=True)

        tables = camelot.read_pdf(
            file_path, pages='all', flavor="lattice", suppress_stdout=True)
        print(f"Total tables extracted: {len(tables)}")

        csv_dir = os.path.join(self.output_folder, 'csv_files')
        os.makedirs(csv_dir, exist_ok=True)
        for i, table in enumerate(tables):
            output_path = os.path.join(csv_dir, f'table_{i}.csv')
            table.to_csv(output_path)
        tables_dict = self.export_csv_tables_to_dict(csv_dir)
        json_output_path = os.path.join(
            self.output_folder, 'tables_output.json')
        self.save_dict_to_json(tables_dict, json_output_path)
        print(f"Tables saved as JSON in {json_output_path}")

        return json_output_path

    def csv_to_dict(self, file_path):
        table_dict = {}
        with open(file_path, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                for column, value in row.items():
                    if column not in table_dict:
                        table_dict[column] = []
                    table_dict[column].append(value)
        # Convert lists to comma-separated strings
        for column in table_dict:
            table_dict[column] = ','.join(table_dict[column])
        return table_dict

    def export_csv_tables_to_dict(self, directory):
        tables_dict = {}
        table_number = 1
        for filename in os.listdir(directory):
            if filename.endswith('.csv'):
                file_path = os.path.join(directory, filename)
                tables_dict[table_number] = self.csv_to_dict(file_path)
                table_number += 1
        return tables_dict

    def save_dict_to_json(self, data, file_path):
        with open(file_path, 'w') as json_file:
            json.dump(data, json_file, indent=4)

    def parse_tables_with_llm(self, file_path):
        os.makedirs(self.output_folder, exist_ok=True)

        api_key = os.getenv("LLAMA_CLOUD_API_KEY")
        if not api_key:
            raise ValueError(
                "LLAMA_CLOUD_API_KEY environment variable is not set")
        document = LlamaParse(result_type="markdown").load_data(file_path)
        output_file = os.path.join(self.output_folder, 'llm_parsed_tables.txt')
        with open(output_file, 'w', encoding='utf-8') as file:
            for i in range(len(document)):
                excerpt = document[i].text[:1000]
                file.write(f"Document {i+1}:\n")
                file.write(excerpt)
                # Separator between documents
                file.write("\n\n" + "-"*50 + "\n\n")
            print(f"Tables have been saved to {output_file}")
        return output_file


class QAPairBuilder:
    def __init__(self, config):
        self.config = config

    def build_qa_pairs(self, parsed_text, parsed_tables):
        return [("Sample Question", "Sample Answer")]


class ModelFineTuner:
    def __init__(self, config):
        self.config = config

    def fine_tune_model(self, qa_pairs):
        print(f"Fine-tuning model with {len(qa_pairs)} Q&A pairs")


class DTDLBotPipeline:
    def __init__(self, config):
        self.config = config
        self.text_extractor = TextExtractor(config)
        self.table_extractor = TableExtractor(config)
        self.qa_pair_builder = QAPairBuilder(config)
        self.model_fine_tuner = ModelFineTuner(config)

    def run(self):
        try:
            parsed_text = self.text_extractor.get_text_parsing_module()
            parsed_tables = self.table_extractor.get_table_parsing_module()
            qa_pairs = self.qa_pair_builder.build_qa_pairs(
                parsed_text, parsed_tables)
            self.model_fine_tuner.fine_tune_model(qa_pairs)
            print(f"Pipeline run successfully with config: {self.config}")
        except Exception as e:
            print(f"Error running pipeline: {str(e)}")


# Usage
config = {
    'pdf_path': 'path/to/your/pdf',
    'text_parsing': 'pypdf2',
    'table_parsing': 'camelot'
}

pipeline = DTDLBotPipeline(config)
pipeline.run()

Error running pipeline: name 'data' is not defined
