In [22]:
import os
from langchain.text_splitter import CharacterTextSplitter , RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings , ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage


__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [2]:

# current_dir = os.path.dirname(os.path.abspath(__file__))
current_dir = os.getcwd()
files_path = os.path.join(current_dir,"text")
input_path = os.path.join(current_dir,"pdf")
file_path = os.path.join(current_dir,"text","test.txt")
persist_dir = os.path.join(current_dir,"db","chroma_db_meta")

texts_dir = os.path.join(current_dir, "text")
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata")

print(f"Books directory: {texts_dir}")
print(f"Persistent directory: {persistent_directory}")

/workspaces/RAG_v2/db/chroma_db_meta


In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader

def pdf_to_txt(input_folder, output_folder):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Iterate through all files in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(input_folder, filename)
            txt_filename = os.path.splitext(filename)[0] + '.txt'
            txt_path = os.path.join(output_folder, txt_filename)
            
            try:
                # Load PDF
                loader = PyPDFLoader(pdf_path)
                pages = loader.load()
                
                # Extract text from all pages
                full_text = '\n'.join([page.page_content for page in pages])
                
                # Write text to file
                with open(txt_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write(full_text)
                
                print(f"Converted {filename} to {txt_filename}")
            
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

# Example usage
input_folder = input_path
output_folder = files_path
pdf_to_txt(input_folder, output_folder)

Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 103 0 (offset 0)


Converted Leveraging+Machine+Learning+Algorithms+for+Risk+Assessment+in+Auto+Insurance (1).pdf to Leveraging+Machine+Learning+Algorithms+for+Risk+Assessment+in+Auto+Insurance (1).txt


Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)


Converted ijerph-19-07898 (1).pdf to ijerph-19-07898 (1).txt
Converted Machine+Learning+Models+for+Life+Insurance+Risk+Assessment-+Techniques,+Applications,+and+Case+Studies+.pdf to Machine+Learning+Models+for+Life+Insurance+Risk+Assessment-+Techniques,+Applications,+and+Case+Studies+.txt
Converted 1844-McKinsey-insurance-2030-the-impact-of-ai-on-the-future-of-insurance-f (1).pdf to 1844-McKinsey-insurance-2030-the-impact-of-ai-on-the-future-of-insurance-f (1).txt
Converted risks-09-00042-v2 (1).pdf to risks-09-00042-v2 (1).txt


In [3]:
if not os.path.exists(files_path):
    raise FileNotFoundError(f"file {files_path} is not exists , please check the path")
loader = TextLoader(file_path)  
documents = loader.load()
text_split = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)
docs = text_split.split_documents(documents) 

In [11]:
print(f"number of documents : {len(docs)}")
print(f"sample chunk: \n{docs[0].page_content}\n")

number of documents : 14
sample chunk: 
Journal of Artificial Intelligence Research By The Science Brigade (Publishing) Group  21  
 Journal of Artificial Intelligence Research  Volume 1 Issue 1 Semi Annual Edition | Spring 2021 This work is licensed under CC BY-NC-SA 4.0. View complete license here Leveraging Machine Learning Algorithms for Risk Assessment in Auto Insurance By Pankaj Zanke* & Dipti Sontakke** * Project Manager, Progressive Insurance, Cleveland, Ohio, USA https://orcid.org/0009-0002-4341-2972 ** Consultant, Capgemini Inc, Atlanta, GA, USA https://orcid.org/0009-0009-5381-4837   Abstract:  This paper delves into the burgeoning domain of leveraging machine learning (ML) algorithms for risk assessment in the auto insurance sector. It investigates the application of diverse ML techniques for predictive modeling, encompassing claims frequency, severity estimation, and fraud detection. By analyzing vast datasets, ML algorithms offer promising avenues for enhancing risk asses

In [4]:
texts_dir = os.path.join(current_dir, "text")
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata")

print(f"Books directory: {texts_dir}")
print(f"Persistent directory: {persistent_directory}")

Books directory: /workspaces/RAG_v2/text
Persistent directory: /workspaces/RAG_v2/db/chroma_db_with_metadata


In [5]:
# Check if the Chroma vector store already exists
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the books directory exists
    if not os.path.exists(files_path):
        raise FileNotFoundError(
            f"The directory {files_path} does not exist. Please check the path."
        )

    # List all text files in the directory
    book_files = [f for f in os.listdir(files_path) if f.endswith(".txt")]

    # Read the text content from each file and store it with metadata
    documents = []
    for book_file in book_files:
        file_path = os.path.join(files_path, book_file)
        loader = TextLoader(file_path)
        book_docs = loader.load()
        for doc in book_docs:
            # Add metadata to each document indicating its source
            doc.metadata = {"source": book_file}
            documents.append(doc)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=10)
    docs = text_splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it
    print("\n--- Creating and persisting vector store ---")
    db = Chroma.from_documents(
        docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating and persisting vector store ---")

else:
    print("Vector store already exists. No need to initialize.")

Created a chunk of size 14518, which is longer than the specified 200
Created a chunk of size 4389, which is longer than the specified 200
Created a chunk of size 7004, which is longer than the specified 200
Created a chunk of size 1912, which is longer than the specified 200
Created a chunk of size 1708, which is longer than the specified 200
Created a chunk of size 1447, which is longer than the specified 200
Created a chunk of size 9348, which is longer than the specified 200


Persistent directory does not exist. Initializing vector store...

--- Document Chunks Information ---
Number of document chunks: 14

--- Creating embeddings ---

--- Finished creating embeddings ---

--- Creating and persisting vector store ---

--- Finished creating and persisting vector store ---


In [10]:
# Define the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing vector store with the embedding function
db = Chroma(persist_directory=persistent_directory,
            embedding_function=embeddings)

# Define the user's question
query = "How will AI impact the future of insurance by 2030?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.5},
)
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    print(f"Source: {doc.metadata['source']}\n")


--- Relevant Documents ---
Document 1:
Insurance Practice
Insurance 2030—  
The impact of AI on the 
future of insurance
The industry is on the verge of a seismic, tech-driven shift. A focus on 
four areas can position carriers to embrace this change.
March 2021
© Imaginima/Getty Imagesby Ramnath Balasubramanian, Ari Libarikian, and Doug McElhaney
Welcome to the future of insurance,  as seen 
through the eyes of Scott, a customer in the year 
2030. His digital personal assistant orders him a 
vehicle with self-driving capabilities for a meeting 
across town. Upon hopping into the arriving car, 
Scott decides he wants to drive today and moves 
the car into “active” mode. Scott’s personal 
assistant maps out a potential route and shares 
it with his mobility insurer, which immediately 
responds with an alternate route that has a much 
lower likelihood of accidents and auto damage as 
well as the calculated adjustment to his monthly 
premium. Scott’s assistant notifies him that his 
mobi

In [19]:
# Define the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing vector store with the embedding function
db = Chroma(persist_directory=persistent_directory,
            embedding_function=embeddings)

# Define the user's question
query = "How will AI impact the future of insurance by 2030?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.5},
)
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")

# Combine the query and the relevant document contents
combined_input = (
    "Here are some documents that might help answer the question: "
    + query
    + "\n\nRelevant Documents:\n"
    + "\n\n".join([doc.page_content for doc in relevant_docs])
    + "\n\nPlease provide an answer based only on the provided documents. If the answer is not found in the documents, respond with 'I'm not sure'."
)

# Create a ChatOpenAI model
model = ChatOpenAI(model="gpt-4o")

# Define the messages for the model
messages = [
    SystemMessage(content="You are a helpful assistant and professional in ml engineer with 10 years of experience."),
    HumanMessage(content=combined_input),
]

# Invoke the model with the combined input
result = model.invoke(messages)

# Display the full result and content only
print("\n--- Generated Response ---")
# print("Full result:")
# print(result)
print("Content only:")
display(result.content)


--- Relevant Documents ---
Document 1:
Insurance Practice
Insurance 2030—  
The impact of AI on the 
future of insurance
The industry is on the verge of a seismic, tech-driven shift. A focus on 
four areas can position carriers to embrace this change.
March 2021
© Imaginima/Getty Imagesby Ramnath Balasubramanian, Ari Libarikian, and Doug McElhaney
Welcome to the future of insurance,  as seen 
through the eyes of Scott, a customer in the year 
2030. His digital personal assistant orders him a 
vehicle with self-driving capabilities for a meeting 
across town. Upon hopping into the arriving car, 
Scott decides he wants to drive today and moves 
the car into “active” mode. Scott’s personal 
assistant maps out a potential route and shares 
it with his mobility insurer, which immediately 
responds with an alternate route that has a much 
lower likelihood of accidents and auto damage as 
well as the calculated adjustment to his monthly 
premium. Scott’s assistant notifies him that his 
mobi

'AI is poised to significantly impact the insurance industry by 2030 in several key areas:\n\n1. **Shift to Predict and Prevent**: Insurance will transition from a "detect and repair" model to a "predict and prevent" approach. Advanced AI and deep learning techniques will enable insurers to predict potential risks and prevent them, transforming all aspects of the industry.\n\n2. **Explosion of Data from Connected Devices**: By 2025, there could be up to one trillion connected devices. The data from these devices will help insurers understand their clients better, leading to new product categories, personalized pricing, and real-time service delivery.\n\n3. **Increased Use of Physical Robotics**: Robotics, including 3-D printing, drones, and autonomous vehicles, will change risk assessments and customer expectations. By 2030, many standard vehicles will feature autonomous capabilities, influencing how insurers assess and price risks.\n\n4. **Open-Source and Data Ecosystems**: Data shari

In [18]:
# Display the full result and content only
print("\n--- Generated Response ---")
# print("Full result:")
# print(result)
print("Content only:")
display(result.content)


--- Generated Response ---
Content only:


'AI will significantly impact the future of insurance by 2030 in several ways:\n\n1. **Transformation to Predict and Prevent**: Insurance will shift from a "detect and repair" model to a "predict and prevent" model, leveraging AI to predict risks and prevent losses before they occur. This will transform all aspects of the industry, enhancing decision-making, productivity, lowering costs, and optimizing customer experience.\n\n2. **Data Explosion from Connected Devices**: The proliferation of up to one trillion connected devices by 2025 will provide insurers with vast amounts of data. This data will allow for more personalized pricing, new product categories, and real-time service delivery.\n\n3. **Increased Use of Physical Robotics**: Robotics, including autonomous drones and additive manufacturing (3-D printing), will change risk assessments and create new insurance products. By 2030, autonomous features in vehicles will become more common, changing how risks are pooled and managed.\n