In [1]:
import os

from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS
from IPython.display import display, Markdown

# Excel

In [None]:
# Load the Excel workbook with UnstructuredExcelLoader in "elements" mode.
# NOTE(review): the path is absolute and machine-specific (Windows drive D:);
# it must be adjusted before the notebook can run anywhere else.
excel_file_path = r"D:\All Python\Pure-Python\P4\06-PromptEngineering\Project 2 - RAG\data\earthq.xlsx"
loader = UnstructuredExcelLoader(excel_file_path, mode="elements")
# `docs` is the list of Document objects produced by the loader; later cells read it.
docs = loader.load()

# Preview the first five loaded documents (page_content plus metadata).
docs[:5]

[Document(metadata={'source': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data\\earthq.xlsx', 'file_directory': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data', 'filename': 'earthq.xlsx', 'last_modified': '2024-11-07T21:28:58', 'page_name': 'earthq', 'page_number': 1, 'text_as_html': '<table><tr><td>PEER Ground Motion Database</td><td>Time Series Search Report -- NGA-West2 -- 2024-03-04</td></tr></table>', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table', 'element_id': '045bcdeb05b66c364675bc215fcb3bb7'}, page_content='PEER Ground Motion Database Time Series Search Report -- NGA-West2 -- 2024-03-04'),
 Document(metadata={'source': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data\\earthq.xlsx', 'file_directory': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data', 'filename': 'earthq.xlsx', 'last_mod

In [31]:
len(docs[9].metadata["text_as_html"])

553142

In [38]:
# Split the loaded documents into smaller chunks before embedding them.
# chunk_size=1000 and chunk_overlap=200 are passed straight to the splitter
# (presumably measured in characters, the splitter's default -- TODO confirm).
text_spliter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                              chunk_overlap=200)
# `chunk` holds the list of chunked documents; it is what gets indexed below.
chunk = text_spliter.split_documents(docs)

# Preview the first five chunks.
chunk[:5]

[Document(metadata={'source': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data\\earthq.xlsx', 'file_directory': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data', 'filename': 'earthq.xlsx', 'last_modified': '2024-11-07T21:28:58', 'page_name': 'earthq', 'page_number': 1, 'text_as_html': '<table><tr><td>PEER Ground Motion Database</td><td>Time Series Search Report -- NGA-West2 -- 2024-03-04</td></tr></table>', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table', 'element_id': '045bcdeb05b66c364675bc215fcb3bb7'}, page_content='PEER Ground Motion Database Time Series Search Report -- NGA-West2 -- 2024-03-04'),
 Document(metadata={'source': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data\\earthq.xlsx', 'file_directory': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data', 'filename': 'earthq.xlsx', 'last_mod

In [39]:
len(chunk), len(docs)

(515, 10)

In [41]:
# Embed the chunks with HuggingFaceEmbeddings (constructed with its default arguments)
# and index them in a FAISS vector store.
embeddings = HuggingFaceEmbeddings()
# `vector_store` is the store queried by the similarity-search cells further down.
vector_store = FAISS.from_documents(chunk, embeddings)

In [42]:
# Question used for the first retrieval experiment.
query = "give me my the information about  Imperial Valley-02 earthquake please."

# Retrieve the 5 chunks most similar to the query; each result is a (document, score) pair.
docs_faiss = vector_store.similarity_search_with_score(query, k=5)
docs_faiss

[(Document(metadata={'source': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data\\earthq.xlsx', 'file_directory': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data', 'filename': 'earthq.xlsx', 'last_modified': '2024-11-07T21:28:58', 'page_name': 'earthq', 'page_number': 1, 'text_as_html': '<table><tr><td>Magnitude Min:</td><td>6.24</td></tr><tr><td>Magnitude Max:</td><td>7.68</td></tr><tr><td>Rrup Min (km):</td><td>-0.000001</td></tr><tr><td>Rrup Max (km):</td><td>99999</td></tr><tr><td>Rjb Min (km):</td><td>-0.000001</td></tr><tr><td>Rjb Max (km):</td><td>99999</td></tr><tr><td>Vs30 Min (m/sec):</td><td>-0.000001</td></tr><tr><td>Vs30 Max (m/sec):</td><td>99999</td></tr><tr><td>D9-95 Min (sec):</td><td>-0.000001</td></tr><tr><td>D9-95 Max (sec):</td><td>99999</td></tr><tr><td>Scale Factor Min:</td><td>-0.000001</td></tr><tr><td>Scale Factor Max:</td><td>99999</td></tr><tr><td>Period Array:</td><td/></tr><tr><td>Weight Array:</td>

In [None]:
# Score of the fifth (last, since k=5) retrieved result.
# NOTE(review): presumably a distance, i.e. lower means more similar -- confirm for this FAISS setup.
docs_faiss[4][1] # second item of the (document, score) pair

1.2188771

In [49]:
# Concatenate the text of the retrieved chunks, separated by blank lines,
# into one context string for the generation step. The scores are ignored here.
context_text = "\n\n".join([doc.page_content for doc, _score in docs_faiss])
context_text

'Magnitude Min: 6.24 Magnitude Max: 7.68 Rrup Min (km): -0.000001 Rrup Max (km): 99999 Rjb Min (km): -0.000001 Rjb Max (km): 99999 Vs30 Min (m/sec): -0.000001 Vs30 Max (m/sec): 99999 D9-95 Min (sec): -0.000001 D9-95 Max (sec): 99999 Scale Factor Min: -0.000001 Scale Factor Max: 99999 Period Array: Weight Array: Record Sequence Number: Earthquake Name: Station Name: User-Defined Maximum Number of Records: Pulse: 1 Damping Ratio: 0.05 Scaling Method: No Scaling Suite Average: Arithmetic Single-Period-Scaling Period (sec): Component: SRSS Fault Type: All Types\n\nThese records are UNSCALED AS RECORDED Filename Extension: AT2 = Acceleration VT2 = Velocity DT2 = Displacement The users of the NGA-West2 databases flatfiles models and reports are requested to acknowledge the Pacific Earthquake Engineering Research Center (PEER) in their work and publications.\n\nResult ID Spectral Ordinate Record Sequence Number Mean Squared Error Scale Factor Tp-Pulse Period (sec) 5-75% Duration (sec) 5-95% D

In [50]:
# Build a minimal RAG prompt: the retrieved context first, then the question,
# then an instruction to admit ignorance when the context does not contain the answer.
prompt = f"""based on this context {context_text}
please answer this question {query}
if you don't know the answer just say you don't know."""

In [11]:
# Call the Groq API through LangChain.
# Candidate Groq model ids grouped by provider; only models["Meta"][0] is used in this cell.
models = {"Google": ["gemma2-9b-it", "gemma-7b-it"],
          "Groq": ["llama3-groq-70b-8192-tool-use-preview", "llama3-groq-8b-8192-tool-use-preview"],
          "Meta": ["llama-3.1-70b-versatile", "llama-3.1-8b-instant", "llama-3.2-1b-preview", "llama-3.2-3b-preview", "llama-3.2-11b-vision-preview", "llama-3.2-90b-vision-preview", "llama-guard-3-8b", "llama3-70b-8192", "llama3-8b-8192"],
          "Mistral": ["mixtral-8x7b-32768"],
          "OpenAI": ["whisper-large-v3", "whisper-large-v3-turbo"]}

# The API key is read from the GROQ_API_KEY environment variable so that no secret
# is stored in the notebook. A missing variable raises KeyError here, before any
# request is sent.
model = ChatGroq(api_key=os.environ["GROQ_API_KEY"],
                 model=models["Meta"][0],
                 temperature=0)
# Send the prompt built above; the answer text is available as response_text.content.
response_text = model.invoke(prompt)

In [None]:
# Render the model's answer as Markdown in the cell output.
display(Markdown(response_text.content)) # author's informal exclamation (Persian slang, roughly "wow") about the answer

Based on the provided information, here's what I found about the Imperial Valley-02 earthquake:

1. Earthquake Name: Imperial Valley-02
2. Year: 1940
3. Station Name: El Centro Array #9
4. Magnitude: 6.95
5. Mechanism: Strike slip
6. Rjb (km): 6.09
7. Rrup (km): 6.09
8. Vs30 (m/sec): 213.44
9. Lowest Useable Frequency (Hz): 0.25

If you need more information, please let me know, and I'll do my best to provide it.

In [69]:
# Make a function out of it.
def prepare_excel(file_path, chunk_size=1000, chunk_overlap=50):
    """Build a FAISS vector store from an Excel workbook.

    The workbook is read with ``UnstructuredExcelLoader`` in ``"elements"``
    mode, the loaded documents are split with ``RecursiveCharacterTextSplitter``
    and the resulting chunks are embedded with ``HuggingFaceEmbeddings``
    (constructed with its default arguments).

    Args:
        file_path: Path of the Excel file to load.
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    # Load
    excel_elements = UnstructuredExcelLoader(file_path=file_path, mode="elements").load()

    # Chunk
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    excel_chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(excel_elements)

    # Embed + index
    embedding_model = HuggingFaceEmbeddings()
    return FAISS.from_documents(excel_chunks, embedding_model)

In [9]:
# Prepare a function to retrieve and generate Rag
def ask(df, query, k, model_name, api_key=None):
    """Answer ``query`` with a Groq-hosted LLM, using retrieved chunks as context.

    The ``k`` chunks most similar to ``query`` are fetched from ``df``, their
    text is joined into one context string, and the resulting prompt is sent
    to ``ChatGroq`` with ``temperature=0``. The answer is rendered as Markdown
    in the cell output.

    Args:
        df: Vector store exposing ``similarity_search_with_score(query, k=...)``.
        query: The question to answer.
        k: Number of chunks to retrieve as context.
        model_name: Groq model id passed to ``ChatGroq``.
        api_key: Groq API key. When omitted, the ``GROQ_API_KEY`` environment
            variable is used, so that no secret has to live in the notebook.

    Returns:
        Whatever ``IPython.display.display`` returns for the rendered answer
        (the answer itself is shown, not returned).

    Raises:
        RuntimeError: If no API key is supplied and ``GROQ_API_KEY`` is unset
            or empty.
    """
    # Resolve the credential first so that a missing key fails fast,
    # before any retrieval or network work is done.
    if api_key is None:
        api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No Groq API key available: pass api_key=... or set the "
            "GROQ_API_KEY environment variable."
        )

    # Getting the context: keep only the text of each (document, score) pair
    docs_faiss = df.similarity_search_with_score(query, k=k)
    context_text = "\n\n".join([doc.page_content for doc, _score in docs_faiss])

    # Define the prompt
    prompt = f"""
    based on this context {context_text}
    please answer this question {query}
    if you don't know the answer just say you don't know.
    """

    # Call the LLM
    model = ChatGroq(api_key=api_key,
                     model=model_name,
                     temperature=0)

    response_text = model.invoke(prompt)

    return display(Markdown(response_text.content))

In [None]:
# Build the FAISS store for the Excel file with prepare_excel and its default
# splitter settings (chunk_size=1000, chunk_overlap=50).
df_excel = prepare_excel(excel_file_path)

In [59]:
query = "Give me the earchquakes that have the magnetude greater that 7."

In [68]:
# Ask the magnitude question against the Excel data, retrieving 24 chunks as context.
# NOTE(review): this queries `vector_store` (built earlier with chunk_overlap=200),
# not the `df_excel` store created by prepare_excel -- confirm which one is intended.
ask(vector_store, query, 24, models["Meta"][0])

Based on the provided text, the earthquakes with a magnitude greater than 7 are:

1. Kern County (1952) - Magnitude: 7.36
2. Tabas, Iran (1978) - Magnitude: 7.35
3. Gazli, USSR (1976) - Magnitude: 6.8 (Note: This is not greater than 7, I apologize for the mistake)
4. Friuli, Italy (1976) - Magnitude: 6.5 (Note: This is not greater than 7, I apologize for the mistake)

Corrected answer:

1. Kern County (1952) - Magnitude: 7.36
2. Tabas, Iran (1978) - Magnitude: 7.35

# Word

In [79]:
import nltk
nltk.download('punkt')
from langchain_community.document_loaders import UnstructuredWordDocumentLoader

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\keipj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [81]:
# Load the Word document with its content parsed as individual elements.
# NOTE(review): the path is absolute and machine-specific; adjust before running elsewhere.
word_file_path = r"D:\All Python\Pure-Python\P4\06-PromptEngineering\Project 2 - RAG\data\ML.docx"
# `loader` and `docs` are rebound here; they no longer refer to the Excel data.
loader = UnstructuredWordDocumentLoader(
    word_file_path,
    mode="elements"
)
docs = loader.load()
# Preview the first five loaded elements.
docs[:5]

[Document(metadata={'source': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data\\ML.docx', 'category_depth': 0, 'emphasized_text_contents': ['4', '- ماشین بردار پشتیبان'], 'emphasized_text_tags': ['b', 'b'], 'file_directory': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data', 'filename': 'ML.docx', 'last_modified': '2024-05-09T19:51:41', 'page_number': 1, 'languages': ['fas'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': '5902da3b83485984d98e94b616aa4593'}, page_content='4- ماشین بردار پشتیبان'),
 Document(metadata={'source': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data\\ML.docx', 'category_depth': 0, 'emphasized_text_contents': ['1-4- مقدمه', 'ای بر نظریة یادگیری آماری'], 'emphasized_text_tags': ['b', 'b'], 'file_directory': 'D:\\All Python\\Pure-Python\\P4\\06-PromptEngineering\\Project 2 - RAG\\data', 'filename': 'ML.d

In [98]:
# Create a function to prepare the word document
def prepare_word(file_path, chunk_size=1000, chunk_overlap=100):
    """Build a FAISS vector store from a Word document.

    Args:
        file_path: Path of the Word file, passed to
            ``UnstructuredWordDocumentLoader`` (``mode="elements"``).
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The vector store returned by ``FAISS.from_documents`` for the chunked
        document, embedded with a default-constructed ``HuggingFaceEmbeddings``.
    """
    # Load the document
    word_elements = UnstructuredWordDocumentLoader(file_path, mode="elements").load()

    # Chunk it
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    word_chunks = splitter.split_documents(word_elements)

    # Embed the chunks and index them
    return FAISS.from_documents(word_chunks, HuggingFaceEmbeddings())

In [88]:
# Define a couple of queries
query_1 = "Tell me about the document by giving the 3 main points in English."
query_2 = "What are the citations in this document in English?"

In [86]:
# Prepare the word data
db_doc = prepare_word(word_file_path)

In [92]:
ask(db_doc, query_1, k=5, model_name=models["Meta"][0])

Based on the provided context, it appears to be a table of contents or an outline of a document related to machine learning and statistical learning. Here are the 3 main points in English:

1. The document covers the basics of statistical learning theory (1-4).
2. It discusses support vector machines (SVMs), including soft-margin support vector regression (3-7-4) and support vector classification (2-4).
3. The document also touches on principal component analysis (PCA) with kernel methods (3-5-4).

Please note that this is an interpretation based on the provided context, and the actual content of the document may vary.

In [93]:
ask(db_doc, query_2, k=5, model_name=models["Meta"][0])


I don't know.

# PowerPoints

In [94]:
powerpoint_file_path1 = r"D:\All Python\Pure-Python\P4\06-PromptEngineering\Project 2 - RAG\data\SM, W00, Ch 8, Seepage.pptx"
powerpoint_file_path2 = r"D:\All Python\Pure-Python\P4\06-PromptEngineering\Project 2 - RAG\data\Inside Reading 4(7.2).pptx"

In [95]:
from langchain_community.document_loaders import UnstructuredPowerPointLoader

In [97]:
# Create a function to prepare the PowerPoint document
def prepare_powerpoint(file_path, chunk_size=1000, chunk_overlap=100):
    """Build a FAISS vector store from a PowerPoint presentation.

    Args:
        file_path: Path of the presentation, passed to
            ``UnstructuredPowerPointLoader`` (``mode="elements"``).
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The vector store returned by ``FAISS.from_documents`` for the chunked
        presentation, embedded with a default-constructed ``HuggingFaceEmbeddings``.
    """
    # Step 1: load the slides
    slide_loader = UnstructuredPowerPointLoader(file_path, mode="elements")
    slide_elements = slide_loader.load()

    # Step 2: split into chunks
    splitter_kwargs = dict(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    slide_chunks = RecursiveCharacterTextSplitter(**splitter_kwargs).split_documents(slide_elements)

    # Step 3: embed and index
    embedder = HuggingFaceEmbeddings()
    return FAISS.from_documents(slide_chunks, embedder)

In [99]:
# Build one FAISS store per presentation, overriding the prepare_powerpoint defaults
# with smaller chunks (chunk_size=500).
# NOTE(review): chunk_overlap=300 on 500-sized chunks means consecutive chunks can share
# most of their text -- confirm this much overlap is intended.
db_powerpoint1 = prepare_powerpoint(powerpoint_file_path1, chunk_size=500, chunk_overlap=300)
db_powerpoint2 = prepare_powerpoint(powerpoint_file_path2, chunk_size=500, chunk_overlap=200)


In [110]:
query_1 = "What is the powepoint about in English?"
query_2 = "How is the powerpoint organized? Is it a good powerpoint or not?"
query_3 = "what is the bernoli equation?"
query_4 = "Who is the writer or provider of the powerpoint?"

In [106]:
ask(db_powerpoint1, query_1, k=30, model_name=models["Meta"][0])

The PowerPoint appears to be about "Seepage through Soil" and possibly related to civil engineering or geotechnical engineering, specifically discussing the flow of water through soil and the potential for erosion or head loss.

In [108]:
ask(db_powerpoint1, query_3, k=30, model_name=models["Meta"][0])


I don't know. The provided context does not explicitly state the Bernoulli equation. It mentions Bernoulli, but does not provide the equation itself.

In [111]:
ask(db_powerpoint1, query_4, k=30, model_name=models["Meta"][0])


M.M.Ahmadi

In [107]:
ask(db_powerpoint2, query_1, k=30, model_name=models["Meta"][0])


The PowerPoint appears to be about the concept of "The Wisdom of Crowds," which suggests that collective judgments and decisions made by a group of people can be more accurate and intelligent than those made by a single individual, even if the individuals in the group are not experts. The text references the work of Francis Galton, a British scientist who conducted an experiment at a country fair where a crowd of people estimated the weight of an ox, and surprisingly, the average of their guesses was very close to the actual weight. The PowerPoint also touches on the idea that groups can be more intelligent and make better decisions when individuals are allowed to think and act independently, rather than being forced to conform to a single opinion.

In [112]:
ask(db_powerpoint2, query_2, k=30, model_name=models["Meta"][0])


The PowerPoint appears to be organized around the concept of "The Wisdom of Crowds," with various slides discussing the idea that collective intelligence can be more accurate and effective than individual expertise. The slides seem to be a mix of quotes, examples, and explanations, with some repetition of key points.

As for whether it's a good PowerPoint or not, I'd say it's decent but could be improved. Here's why:

Strengths:

* The PowerPoint covers a clear and interesting topic.
* It includes quotes and examples to support the main idea.
* The language is generally clear and concise.

Weaknesses:

* The slides seem to be a bit disjointed, with some ideas and quotes repeated multiple times.
* There's no clear structure or flow to the presentation.
* Some slides appear to be just a block of text, which can be overwhelming to read.
* There are no visuals, images, or graphics to break up the text and make the presentation more engaging.

Overall, the PowerPoint could benefit from some reorganization, editing, and design improvements to make it more effective and engaging for the audience.

In [113]:
ask(db_powerpoint2, query_4, k=30, model_name=models["Meta"][0])


The writer or provider of the PowerPoint is Keivan Jamali.

# EBooks

In [114]:
from langchain_community.document_loaders import UnstructuredEPubLoader
import pypandoc
pypandoc.download_pandoc()

In [None]:
# Create a function to prepare the EBook document
def prepare_epub(file_path, chunk_size=1000, chunk_overlap=100):
    """Build a FAISS vector store from an EPUB e-book.

    Args:
        file_path: Path of the EPUB file, passed to ``UnstructuredEPubLoader``
            (``mode="elements"``).
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The vector store returned by ``FAISS.from_documents`` for the chunked
        book, embedded with a default-constructed ``HuggingFaceEmbeddings``.
    """
    # Read the book
    book_elements = UnstructuredEPubLoader(file_path, mode="elements").load()

    # Cut it into chunks
    book_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    book_chunks = book_splitter.split_documents(book_elements)

    # Embed the chunks and build the index
    return FAISS.from_documents(book_chunks, HuggingFaceEmbeddings())

# PDF

In [2]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from pdfminer import psparser

In [3]:
pdf_file_path = r"D:\All Python\Pure-Python\P4\06-PromptEngineering\Project 2 - RAG\data\Fundamentals of Fluid Mechanics-Munson-7ed.pdf"

In [4]:
# Create a function to prepare the PDF document
def prepare_pdf(file_path, chunk_size=1000, chunk_overlap=100):
    """Build a FAISS vector store from a PDF file.

    Args:
        file_path: Path of the PDF, passed to ``UnstructuredPDFLoader``
            (``mode="elements"``).
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The vector store returned by ``FAISS.from_documents`` for the chunked
        PDF, embedded with a default-constructed ``HuggingFaceEmbeddings``.
    """
    # Parse the PDF
    pdf_loader = UnstructuredPDFLoader(file_path, mode="elements")
    pdf_elements = pdf_loader.load()

    # Break the parsed elements into chunks
    pdf_chunks = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(pdf_elements)

    # Embed the chunks and index them
    pdf_embedder = HuggingFaceEmbeddings()
    pdf_index = FAISS.from_documents(pdf_chunks, pdf_embedder)
    return pdf_index

In [5]:
db_pdf = prepare_pdf(pdf_file_path)

In [7]:
query_1 = "What is non-Newtonian fluids defenition?"
query_2 = "What are formulas of Compression and Expansion of Gases in part 1.7.2?"
query_3 = "What are the learning objectives of Chaapter 8?"

In [12]:
ask(db_pdf, query_1, k=20, model_name=models["Meta"][0])

Non-Newtonian fluids are defined as fluids for which the shearing stress is not linearly related to the rate of shearing strain. In other words, the apparent viscosity of non-Newtonian fluids changes with the shear rate, unlike Newtonian fluids where the viscosity remains constant.

In [21]:
ask(db_pdf, query_2, k=120, model_name=models["Meta"][0])


The formulas for Compression and Expansion of Gases in part 1.7.2 are not explicitly stated in the provided text. However, based on the context, it appears that the formulas are related to the ideal gas law and the relationship between pressure and density.

From the text, we can infer that the formulas might be:

1. The ideal gas law: p = ρRT (Equation 1.8)
2. The relationship between pressure and density: p/p₀ = (ρ/ρ₀)^k (where k is the specific heat ratio)

Additionally, the text mentions Equations 1.14 and 1.15, but these equations are not explicitly stated. If you need more information, please provide more context or clarify what you are looking for.

In [38]:
ask(db_pdf, query_3, k=100, model_name=models["Meta"][0])


I don't know. The text does not provide the specific learning objectives for Chapter 8. It only mentions that there is a Learning Objectives section at the beginning of each chapter, but it does not provide the actual objectives for Chapter 8.