In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import faiss
import re
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from uuid import uuid4
from langchain_community.document_loaders import CSVLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_nomic import NomicEmbeddings
from langchain.schema import Document
from typing import List, Union, Tuple, Dict
from langchain_ollama import OllamaLLM
from langchain.prompts import ChatPromptTemplate


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_community.embeddings.ollama import OllamaEmbeddings


In [2]:
directory = 'C:/Users/skrge/Documents/GitHub/llmtesting/data'

In [84]:
def load_csv_files(directory: str) -> List[str]:
    """
    Load and return the content of all CSV files in the given directory.
    """
    documents = []
    for file_name in os.listdir(directory):
        if file_name.endswith(".csv"):
            file_path = os.path.join(directory, file_name)
            loader = CSVLoader(file_path)
            documents.extend(loader.load())
    return documents

def load_csv_files(directory: str) -> List[Document]:
    """
    Load and return the content of all CSV files in the given directory.
    """
    documents = []
    for file_name in os.listdir(directory):
        if file_name.endswith(".csv"):
            file_path = os.path.join(directory, file_name)
            loader = CSVLoader(file_path)
            documents.extend(loader.load())
    return documents

def split_docs(documents: List[Document], chunk_size: int = 400, chunk_overlap: int = 40) -> List[Document]:
    """
    Split documents into chunks using RecursiveCharacterTextSplitter.
    
    Args:
        documents (List[Document]): List of Document objects to be split.
        chunk_size (int): Maximum size of each chunk.
        chunk_overlap (int): Overlap size between chunks.

    Returns:
        List[Document]: List of split Document objects.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

def remove_garbage_lines(text: str) -> str:
    """
    Removes lines that contain mostly numbers, standalone letters, or patterns like 'B = B', 'M = M'.
    """
    cleaned_lines = []
    
    for line in text.split("\n"):
        line = line.strip()
        
        # Skip lines that are mostly numbers, letters with =, or repeating patterns
        if re.match(r'^([\d\s]+|[A-Z]\s*=\s*[A-Z]\s*)+$', line):
            continue
        
        # Skip lines with excessive letter-number-symbol sequences (like slurB B B 0 B B)
        if re.search(r'(slurB|B\s*=\s*B|M\s*=\s*M|Y\s*=\s*Y|X\s*=\s*X|Z\s*=\s*Z)', line):
            continue
        
        cleaned_lines.append(line)
    
    return "\n".join(cleaned_lines)

def load_pdf_files(directory: str) -> List[Document]:
    """
    Load and return the content of all PDF files in the given directory.
    """
    documents = []
    for file_name in os.listdir(directory):
        if file_name.endswith(".pdf"):
            file_path = os.path.join(directory, file_name)
            loader = PyPDFLoader(file_path)
            pdf_docs = loader.load()
            
            for doc in pdf_docs:
                doc.page_content = remove_garbage_lines(doc.page_content)  # Clean extracted text

            documents.extend(pdf_docs)
    return documents

def upload_files(directory: str) -> List[Document]:
    """
    Upload all supported file types from a given directory, split PDF content into chunks, and return their content.
    """
    supported_loaders = {
        "csv": load_csv_files,
        "pdf": load_pdf_files
    }
    documents = []

    for ext, loader_func in supported_loaders.items():
        loaded_documents = loader_func(directory)
        if ext == "pdf":
            documents.extend(split_docs(loaded_documents))  # Split PDFs into chunks
        else:
            documents.extend(loaded_documents)
    
    return documents

In [5]:
def generate_chunk_id(doc: Document, current_page: int = None, current_page_part: int = 0) -> Tuple[str, int, int]:
    """
    Generate a unique ID for a chunk based on the Document's metadata.

    Args:
        doc (Document): A langchain Document object containing page content and metadata.
        current_page (int, optional): Current page number for PDF chunks. Default is None.
        current_page_part (int, optional): Current part number for the current page. Default is 0.

    Returns:
        Tuple[str, int, int]: Generated chunk ID, updated current_page, and updated current_page_part.
    """
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    metadata = doc.metadata

    # Extract only the file name from the full path
    file_name = os.path.basename(metadata.get("source", "unknown"))

    if "row" in metadata:  # For CSV files
        chunk_id = f"{file_name}_row{metadata['row']}_{current_time}"
    
    elif "page" in metadata:  # For PDF files
        page = metadata["page"]

        # Update page_part if current_page is the same, otherwise reset it
        if current_page == page:
            current_page_part += 1
        else:
            current_page = page
            current_page_part = 0

        chunk_id = f"{file_name}_page{page}:part{current_page_part}_{current_time}"
    
    else:
        chunk_id = f"{file_name}_{current_time}"  # Fallback for unknown formats

    return chunk_id, current_page, current_page_part

In [46]:
"""
current_page = None
current_page_part = 0
for doc in all_docs:
    chunk_id, current_page, current_page_part = generate_chunk_id(doc, current_page, current_page_part)
"""

In [6]:
def process_documents(all_docs: List[Document]) -> List[Document]:
    """
    Process a list of documents and generate chunk IDs with metadata.

    Args:
        all_docs (List[Document]): List of langchain Document objects.

    Returns:
        List[Document]: List of processed Document objects with chunk IDs.
    """
    current_page = None
    current_page_part = 0
    processed_docs = []

    for doc in all_docs:
        chunk_id, current_page, current_page_part = generate_chunk_id(doc, current_page, current_page_part)
        
        # Add chunk_id to the document's metadata
        new_metadata = doc.metadata.copy()
        new_metadata['chunk_id'] = chunk_id
        
        # Create a new Document with the updated metadata
        processed_doc = Document(
            metadata=new_metadata,
            page_content=doc.page_content
        )
        
        processed_docs.append(processed_doc)

    return processed_docs


In [85]:
# Example usage
test_directory = 'C:/Users/skrge/Documents/GitHub/llmtesting/data/test/test'
pdf_docs = upload_files(test_directory)
proc_docs = process_documents(pdf_docs)

In [7]:
def faiss_db(processed_docs):
    embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")

    # Create an empty FAISS index with the appropriate embedding dimension
    index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

    # Initialize FAISS vector store with an in-memory document store
    vector_store = FAISS(
        embedding_function=embeddings,  # The function to generate embeddings
        index=index,  # The FAISS index to store vectors
        docstore=InMemoryDocstore(),  # Stores original documents
        index_to_docstore_id={}  # Mapping between FAISS index positions and document IDs
    )

    # Generate unique IDs for each document
    uuids = [str(uuid4()) for _ in range(len(processed_docs))]

    # Add documents to the vector store with generated UUIDs
    vector_store.add_documents(documents=processed_docs, ids=uuids)

    # Return the vector store with stored documents
    return vector_store

In [46]:
vector_store = faiss_db(processed_docs)

In [8]:
output_dir = r"C:\Users\skrge\Documents\GitHub\llmtesting\output"

def save_faiss_vector_store(vector_store, output_dir):
    #Saves the FAISS vector store to the specified directory.
    
    # Ensure the directory exists
    os.makedirs(output_dir , exist_ok=True)

    # Save the FAISS vector store
    vector_store.save_local(output_dir )
    print(f"FAISS index saved at: {output_dir }")


In [27]:
save_faiss_vector_store(vector_store, output_dir)

FAISS index saved at: C:\Users\skrge\Documents\GitHub\llmtesting\output


In [9]:
def load_faiss_vector_store(output_dir):

    #Loads a FAISS vector store from the specified directory.

    embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")

    # Check if the FAISS index exists before loading
    if not os.path.exists(os.path.join(output_dir, "index.faiss")):
        raise FileNotFoundError(f"No FAISS index found at: {output_dir}")

    # Load the FAISS vector store with safe pickle deserialization
    vector_store = FAISS.load_local(
        output_dir, 
        embeddings, 
        allow_dangerous_deserialization=True  # Allows pickle loading
    )

    print(f"FAISS index loaded from: {output_dir}")
    return vector_store

In [50]:
vector_store = load_faiss_vector_store(output_dir)

FAISS index loaded from: C:\Users\skrge\Documents\GitHub\llmtesting\output


In [10]:
PROMPT_TEMPLATE = """
Answer the {question} based only on the following context:

{context}
"""

In [11]:
def query_rag_chat(query_text: str, vector_store):
    """
    Searches the FAISS vector store and generates a response using retrieved documents.
    """
    # Perform similarity search in the existing vector store
    results = vector_store.similarity_search(query_text, k=5)

    # If no relevant documents are found, return a message
    if not results:
        print("No relevant context found.")
        return None, "No relevant context found.", []

    # Extract text from retrieved documents
    context = " ".join([doc.page_content for doc in results])

    # Format the prompt
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context, question=query_text)

    # Initialize the Ollama model for generating answers
    generation_model = OllamaLLM(model="llama3.1")

    # Generate response
    response = generation_model.invoke(prompt)

    # Extract document sources
    sources = [doc.metadata.get("chunk_id") for doc in results]
    #print(response, sources, context)
    return response, context, sources

In [55]:
query_text = "What books did write George Seyfried?"
    
# Perform the RAG query
query_rag_chat(query_text, vector_store)

('Based on the provided context, George Seyfried wrote "Disaster at the Arch", a book about a boy\'s survival story after the Gateway Arch in St. Louis catches fire and separates him from his family.',
 ': 7\nBook: Disaster at the Arch\nAuthor: George Seyfried\nDescription: It could have been such a great trip. James Smith is really excited for summer vacation. He is going to St. Louis! After a fun week in the Gateway to the West they finally decide to go to the top of the Arch. There\'s just one problem. The Arch catches on fire. James gets separated from his family and has to survive on astronaut food and other gift-shop items. Will he make it out of the Arch? Will anyone?George Seyfried was ten when he wrote this book. He lives in Southport, Connecticut with his parents and sister and brother. He loves history, geography, traveling, and goes on cool road trips with his dad every year. He has been to four countries and thirty states and has a goal of visiting all fifty states before 

In [50]:
book_title_template = """
    You are an expert in extracting book titles from structured lists.
    Please extract only the book titles and return in this format:  

    ["Full_title_1",
    "Full_title_2",
        ...,
    "Full_title_N"]

    Rules:
    - The book title always starts with an upper letter.
    - Titles can be placed on few lines in a row.
    - Preserve the entire title, including parts before and after `:` if present.
    - Preserve the full title, including subtitles and special characters.
    - Do NOT add any extra text, explanations, or formatting.

    Context:
    {context}

    """

In [128]:
import json
from langchain_core.prompts import PromptTemplate

def extract_book_title(documents: List[Document]):
       
    book_info_template = """
    You are an expert in extracting structured book information from formatted lists.
    Please extract the details for each book and return them in this format:

    
    ["title_info_1",
    "title_info_2",
    ...,
    "title_info_n"]

    Rules:
    - The book title always starts with an upper letter.
    - Titles can be placed on few lines in a row.
    - Preserve the entire title, including parts before and after `:` if present.
    - Preserve the full title, including subtitles and special characters.
    - Do NOT add any extra text, explanations, or formatting.
    - Stop extraction before ISBN or any price or sales information ISBN(e.g., Euro 38,80).


    Context:
    {context}
    """
    
    # Combine content from all documents
    context = " ".join([doc.page_content for doc in documents])
    
    # Format the prompt
    book_info_template = PromptTemplate(template=book_info_template, input_variables=["context"])
    prompt_title = book_info_template.format(context=context)
    
    # Initialize LLM
    generation_model = OllamaLLM(model="llama3.1")
    
    # Generate response
    response = generation_model.invoke(prompt_title)
 
    return response

In [119]:
test_directory = 'C:/Users/skrge/Documents/GitHub/llmtesting/data/test/test'
pdf_docs = upload_files(test_directory)
proc_docs = process_documents(pdf_docs)

In [120]:
len(pdf_docs)

49

In [25]:
from collections import defaultdict

def group_documents_by_source_and_page(documents):
    """
    Groups a list of langchain Document objects by their source and page.
    Args:
        documents (list): A list of langchain Document objects. 
    Returns:
        dict: A dictionary where keys are tuples (source, page), and values are lists of documents.
    """
    documents_grouped = defaultdict(lambda: defaultdict(list))

    for doc in documents:
        source = doc.metadata['source']
        page = doc.metadata['page']
        
        # Group documents by source and page
        documents_grouped[source][page].append(doc)

    # Flatten into a dictionary of (source, page) -> list of documents
    grouped_documents = {}
    for source, pages in documents_grouped.items():
        for page, docs in pages.items():
            grouped_documents[(source, page)] = docs

    return grouped_documents

In [36]:
def extract_titles_from_grouped_documents(documents: List[Document]):
    grouped_documents = group_documents_by_source_and_page(documents)
    
    all_titles = []
    
    # Apply extract_book_title to each group with progress bar
    for (source, page), docs in tqdm(grouped_documents.items(), desc="Processing Groups", unit="group"):
        titles = extract_book_title(docs)
        all_titles.append(titles)  # Add the titles of the current group to the unified list
    
    return all_titles

In [129]:
titles_info = extract_titles_from_grouped_documents(proc_docs)
titles_info

Processing Groups:  67%|██████▋   | 2/3 [16:57<08:28, 508.70s/group]


KeyboardInterrupt: 

In [92]:
titles

['["A Magazine", \n"MAK: The Architecture of Byoungsoo Cho*", \n"Archives 7: Francisco Mangado", \n"Archives 6: Solano Benítez & Gloria Cabral", \n"Encounters with Plečnik", \n"Pitsou Kedem Architects – Works and Projects", \n"Robin Boyd: Late Works", \n"Dudok by Iwan Baan"]',
 '["Immortal: Lost Memoirs of Cornelia Dulac Concerning the Freshwater Polyp Hydra",\n    "The Wanderer*",\n    "Bud Book",\n    "Clouds and Bombs*",\n    "Jörg Schmeisser Retrospective: Neverending Journeys",\n    "Paradise On Paper Where Flowers Bloom, Birds Sing",\n    "Mirror Creation*",\n    "Practice of Spiral Practice of Spiral"]',
 '["Goblins", "The Cult of Water", "Satan is Real: Two Short Stories", "Empty Aphrodite: An Encyclopaedia of Fate", "Bruce Hamana Sosei – 100 Beautiful Words in the Way of Tea", "Aesthetics as Space", "Errant Journal 1: Where are We?", "Unpacking My Library"]']

In [57]:
def extract_isbn_numbers(isbn_strings: List[str]) -> List[int]:
    isbn_numbers = []
    
    for isbn_str in isbn_strings:
        # Find all ISBN numbers in the string
        matches = re.findall(r'\d{13}', isbn_str)
        # Convert matches to integers and add to the list
        isbn_numbers.extend([int(match) for match in matches])
    
    return isbn_numbers

In [97]:
import json
from typing import List

def combine_text_info(json_strings: List[str]) -> List[str]:
    combined_list = []
    
    for json_str in json_strings:
        try:
            # Parse the JSON string
            text_list = json.loads(json_str)
            # Extend the combined list with the parsed list
            combined_list.extend(text_list)
        except json.JSONDecodeError:
            print(f"Error decoding JSON: {json_str}")
            continue
    
    return combined_list

In [126]:
book_name = combine_text_info(titles_info)
book_name

Error decoding JSON: Here are the extracted book details:

["A Magazine, Antwerp 2020", "Architectural Publisher B, Copenhagen 2020", "C2C Editorial, La Coruña 2020", "C2C Editorial, La Coruña 2020", "Museum of Architecture and Design, Ljubljana 2020", "A.MAG Editora, Porto 2020", "URO Publications, Melbourne 2020", "nai010 Publishers, Rotterdam 2020"]
Error decoding JSON: Here are the extracted book details in the requested format:


["4380. Immortal: Lost Memoirs of Cornelia Dulac Concerning the Freshwater Polyp Hydra",
"Aalto University, Helsinki 2020",
"Canadian biologist Cornelia Dulac has been missing since 2014.",
"Her audiotapes were discovered at a remote cabin in Eastern Finland, itself a fully-equipped research laboratory with a freshwater research laboratory with a freshwater well, gasoline for a generator, and a year’s supply of food.",
"She had been researching hydra, a seemingly immortal freshwater polyp. Obviously, something interrupted Dulac’s plans.",
"This book by ar

[]

In [202]:
PROMPT_TEMPLATE = """
    Based on the provided document, extract the following details and return ONLY in valid JSON format:

    {{
        "Full_title": "<Name or Title>",
        "Publisher_title": "<Publisher Title>",
        "City": "<City Name>",
        "Year": "<Year>",
        "ISBN": "<Only ISBN number>",
        "price": "<Only numeric price>",
        "book_shop_name": "<Bookshop Name>",
        "book_shop_id": "<Only numeric Bookshop ID>",
        "pages": "<Only number of pages>",
        "colour": "<Colour details>",
        "size": "<Size>",
        "language": "<Language>"
    }}

    If multiple books are found, return them as separate JSON objects.
    Do NOT add any extra text, explanations, or formatting.

    Context:
    {context}
    """

In [111]:
from typing import List
from tqdm import tqdm
from langchain_core.documents import Document
from langchain_community.vectorstores import VectorStore

def extract_book_info(documents: List[Document], book_list: List[str], vector_store: VectorStore):
    PROMPT_TEMPLATE = """Based on the provided document, extract the following details and return ONLY in valid JSON format:
    {{
        "Full_title": "<Full_title>",
        "City": "<City Name>",
        "Year": "<Year>",
        "ISBN": "<ISBN number>",
        "price": "numeric price>",
        "book_shop_id": "<Only numeric Bookshop ID>",
        "pages": "<Only number of pages>",
        "colour": "<Colour details>",
        "size": "<Size>",
        "language": "<Language>"
    }}

    Do NOT add any extra text, explanations, or formatting.

    Context:
    {context}
    """

    extracted_info = []
    generation_model = OllamaLLM(model="llama3.1")  # Initialize LLM once

    for book in tqdm(book_list, desc="Processing books", unit="book"):
        query = f"Find details about '{book}'"
        results = vector_store.similarity_search(query, k=5)

        # Filter results to ensure they are related to the current book
        #filtered_results = [doc for doc in results if book.lower() in doc.page_content.lower()]
        context = " ".join([result.page_content for result in results])

        # If no relevant documents are found, skip processing
        if not context.strip():
            print(f"Warning: No relevant context found for book '{book}'")
            extracted_info.append(f'{{"Full_title": "{book}", "error": "No data found"}}')
            continue

        # Generate response
        prompt = PROMPT_TEMPLATE.format(context=context)
        response = generation_model.invoke(prompt).strip()

        extracted_info.append(response)

    return extracted_info  # Return list of extracted book details in string format


In [102]:
book_store = faiss_db(proc_docs)

In [112]:
extracted_info = extract_book_info(proc_docs, book_name, book_store)
print(extracted_info)

Processing books: 100%|██████████| 24/24 [15:49<00:00, 39.58s/book]

['{\n    "Full_title": "A Magazine, Antwerp 2020",\n    "City": "Antwerp",\n    "Year": "2020",\n    "ISBN": "9789077745212",\n    "price": "15.50",\n    "book_shop_id": "20253",\n    "pages": "222",\n    "colour": "colour & bw",\n    "size": "17 x 21 cm",\n    "language": "English"\n}', '{\n    "Full_title": "The Architecture of Byoungsoo Cho",\n    "City": "Copenhagen",\n    "Year": "2020",\n    "ISBN": "9788792700322",\n    "price": "61.70",\n    "book_shop_id": "",\n    "pages": "408",\n    "colour": "colour & bw",\n    "size": "23 x 33 cm",\n    "language": "English"\n}', '{\n    "Full_title": "Practice of Spiral Torch Press, Tokyo 2020",\n    "City": "Tokyo",\n    "Year": "2020",\n    "ISBN": "9784907562212",\n    "price": "49.50",\n    "book_shop_id": "20247",\n    "pages": "304",\n    "colour": "colour & bw",\n    "size": "17 x 24 cm",\n    "language": "Spanish/English"\n}', '{\n    "Full_title": "16. Encounters with Plečnik",\n    "City": "La Coruña",\n    "Year": "2020",\n   




In [108]:
import pandas as pd
import json
import re
from typing import List, Union

def preprocess_field(value: Union[str, None], field_type: str) -> str:
    if not isinstance(value, str):
        return ""
    if field_type == "price":
        # Remove any non-numeric characters except for the decimal separator
        value = re.sub(r'[^\d,\.]', '', value)
        # Replace comma with dot if necessary
        value = value.replace(',', '.')
    elif field_type in ["ISBN", "Year", "book_shop_id", "pages"]:
        # Remove any non-numeric characters
        value = re.sub(r'[^\d]', '', value)
    return value

def clean_json_string(json_str: str) -> List[str]:
    # Remove leading/trailing non-JSON characters and split into individual JSON objects
    json_str = json_str.strip('```').strip()
    json_objects = re.findall(r'\{.*?\}', json_str, re.DOTALL)
    return json_objects

def create_dataframe_from_json_strings(json_strings: List[str]) -> pd.DataFrame:
    # List to store parsed JSON objects
    parsed_data = []
    
    # Parse each JSON string
    for json_str in json_strings:
        json_objects = clean_json_string(json_str)
        for obj_str in json_objects:
            try:
                book_info = json.loads(obj_str)
                # Preprocess relevant fields
                for field in ["price", "ISBN", "Year", "book_shop_id", "pages"]:
                    if field in book_info:
                        book_info[field] = preprocess_field(book_info[field], field)
                parsed_data.append(book_info)
            except json.JSONDecodeError:
                print(f"Error decoding JSON: {obj_str}")
                continue
    
    # Create DataFrame from parsed data
    df = pd.DataFrame(parsed_data)
    
    # Ensure the DataFrame has the desired columns
    desired_columns = ["Full_title", "City", "Year", "ISBN", "price", "book_shop_id", "pages", "colour", "size", "language"]
    df = df.reindex(columns=desired_columns)
    
    return df

In [109]:
df = create_dataframe_from_json_strings(extracted_info)

In [113]:
df.head(2)

Unnamed: 0,Full_title,City,Year,ISBN,price,book_shop_id,pages,colour,size,language
0,Floragatan 13: Curated by Acne Studios,Antwerp,2020.0,9789077745212.0,15.5,20253.0,,,,
1,MAK: The Architecture of Byoungsoo Cho*,,,,,,,,,


In [114]:
df2 = create_dataframe_from_json_strings(extracted_info)
df2.head(23)

Unnamed: 0,Full_title,City,Year,ISBN,price,book_shop_id,pages,colour,size,language
0,"A Magazine, Antwerp 2020",Antwerp,2020,9789077745212.0,15.5,20253.0,222.0,colour & bw,17 x 21 cm,English
1,The Architecture of Byoungsoo Cho,Copenhagen,2020,9788792700322.0,61.7,,408.0,colour & bw,23 x 33 cm,English
2,"Practice of Spiral Torch Press, Tokyo 2020",Tokyo,2020,9784907562212.0,49.5,20247.0,304.0,colour & bw,17 x 24 cm,Spanish/English
3,16. Encounters with Plečnik,La Coruña,2020,9788412162516.0,25.2,20203.0,304.0,ills colour & bw,17 x 24 cm,Spanish/English
4,"52 p, ills colour & bw, 15 x 21 cm, pb, Sloven...",Porto,2020,9789895462049.0,49.5,20161.0,52.0,ills colour & bw,15 x 21 cm,Slovenian/English
5,Pitsou Kedem Architects – Works and Projects,,2020,9789895462049.0,49.5,20161.0,52.0,ills colour & bw,15 x 21 cm,Slovenian/English
6,Robin Boyd: Late Works,Melbourne,2020,9780648435594.0,38.8,14.0,152.0,colour & bw,24 x 28 cm,English
7,"108 p, ills colour & bw, 22 x 30 cm, pb",,2020,9789462085817.0,39.95,,108.0,ills colour & bw,22 x 30 cm,English
8,4380. Immortal: Lost Memoirs of Cornelia Dulac...,Helsinki,2020,9789526089621.0,41.5,20116.0,128.0,ills colour & bw,20 x 22 cm,English
9,Monique Besten – The Wanderer*,Barcelona,2020,9788412039092.0,15.75,,128.0,ills colour & bw,20 x 22 cm,English


In [None]:
import os
import re
from typing import List
from langchain.schema import Document
from langchain.document_loaders import CSVLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_csv_files(directory: str) -> List[Document]:
    """
    Load and return the content of all CSV files in the given directory.
    """
    documents = []
    for file_name in os.listdir(directory):
        if file_name.endswith(".csv"):
            file_path = os.path.join(directory, file_name)
            loader = CSVLoader(file_path)
            documents.extend(loader.load())
    return documents

def split_docs(documents: List[Document], chunk_size: int = 400, chunk_overlap: int = 40) -> List[Document]:
    """
    Split documents into chunks using RecursiveCharacterTextSplitter.
    
    Args:
        documents (List[Document]): List of Document objects to be split.
        chunk_size (int): Maximum size of each chunk.
        chunk_overlap (int): Overlap size between chunks.

    Returns:
        List[Document]: List of split Document objects.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

def remove_garbage_lines(text: str) -> str:
    """
    Removes lines that contain mostly numbers, standalone letters, or patterns like 'B = B', 'M = M'.
    """
    cleaned_lines = []
    
    for line in text.split("\n"):
        line = line.strip()
        
        # Skip lines that are mostly numbers, letters with =, or repeating patterns
        if re.match(r'^([\d\s]+|[A-Z]\s*=\s*[A-Z]\s*)+$', line):
            continue
        
        # Skip lines with excessive letter-number-symbol sequences (like slurB B B 0 B B)
        if re.search(r'(slurB|B\s*=\s*B|M\s*=\s*M|Y\s*=\s*Y|X\s*=\s*X|Z\s*=\s*Z)', line):
            continue
        
        cleaned_lines.append(line)
    
    return "\n".join(cleaned_lines)

def load_pdf_files(directory: str) -> List[Document]:
    """
    Load and return the content of all PDF files in the given directory.
    """
    documents = []
    for file_name in os.listdir(directory):
        if file_name.endswith(".pdf"):
            file_path = os.path.join(directory, file_name)
            loader = PyPDFLoader(file_path)
            pdf_docs = loader.load()
            
            for doc in pdf_docs:
                doc.page_content = remove_garbage_lines(doc.page_content)  # Clean extracted text

            documents.extend(pdf_docs)
    return documents

def upload_files(directory: str) -> List[Document]:
    """
    Upload all supported file types from a given directory, split PDF content into chunks, and return their content.
    """
    supported_loaders = {
        "csv": load_csv_files,
        "pdf": load_pdf_files
    }
    documents = []

    for ext, loader_func in supported_loaders.items():
        loaded_documents = loader_func(directory)
        if ext == "pdf":
            documents.extend(split_docs(loaded_documents))  # Split PDFs into chunks
        else:
            documents.extend(loaded_documents)
    
    return documents

# Example usage
test_directory = 'C:/Users/skrge/Documents/GitHub/llmtesting/data/test/test'
pdf_docs = upload_files(test_directory)
proc_docs = process_documents(pdf_docs)