In [2]:
import fitz
from langchain_text_splitters import RecursiveCharacterTextSplitter
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
import numpy as np
from rank_bm25 import BM25Okapi
import pandas as pd

### PDF to List

In [1]:
def get_extracted_text_from_data():
    """
    Extract text from all PDF files in /data.
    """

    # Get the text from the provided PDF files
    pdf2 = "data/pdf_2.pdf"
    doc2 = fitz.open(pdf2)
    pdf4 = "data/pdf_4.pdf"
    doc4 = fitz.open(pdf4)
    pdf5 = "data/pdf_5.pdf"
    doc5 = fitz.open(pdf5)
    pdf6 = "data/pdf_6.pdf"
    doc6 = fitz.open(pdf6)
    pdf7 = "data/pdf_7.pdf"
    doc7 = fitz.open(pdf7)

    extracted_text = ""

    # Combine the text from all pages of the PDFs
    for doc in [doc2, doc4, doc5, doc6, doc7]:
        for page_num in range(doc.page_count):
            page = doc[page_num]
            extracted_text += page.get_text()

    doc2.close()
    doc4.close()
    doc5.close()
    doc6.close()
    doc7.close()
    
    return extracted_text



In [None]:
# # Check the length of the extracted text
# extracted_text = get_extracted_text_from_data()

# word_count = len(extracted_text.split())
# print(f"Total words: {word_count}")


Total words: 23037


### Chunking & Database Creation

In [3]:
def get_chunks_from_text(extracted_text):
    """
    Split the extracted text into chunks.
    """
    # Initialize the text splitter with desired chunk size and overlap
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, 
        chunk_overlap=50)
    
    # Split the text into chunks
    chunks = text_splitter.split_text(extracted_text)
    
    return chunks

In [None]:
# # Check the number of chunks created
# chunks = get_chunks_from_text(extracted_text)
# print(f"Number of chunks: {len(chunks)}")

Number of chunks: 340


In [4]:
# Method 1: Save the chunks to a csv file
def save_chunks_to_csv(chunks):
    """
    Save the chunks to a file.
    """
    # Create the dataframe with the chunks
    df = pd.DataFrame(chunks, columns=["text"])
    # Save the chunks to a csv dafaframe
    df.to_csv("data/chunks.csv")
    
# Method 2: Get the chunks from the csv file
def get_chunks_from_csv():
    """
    Get the chunks from the csv file.
    """
    # Read the csv file
    df = pd.read_csv("data/chunks.csv")
    # Get the chunks from the dataframe
    chunks = df["text"].tolist()
    
    return chunks

In [None]:
# save_chunks_to_csv(chunks)
# chunks = get_chunks_from_csv()

### Transform chunks to vectors for FAISS retrieval

In [5]:
# Load Model function
def load_model():
    """
    Load the SentenceTransformer model.
    """
    # Load the SentenceTransformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    return model

# Embedding each chunk of text
def embed_chunks(chunks, model):
    """
    Embed the chunks of text.
    """
    # Encode the chunks
    embeddings = model.encode(chunks)
    
    return embeddings


In [6]:
# Create faiss index and store the embeddings
def create_faiss_index(embeddings):
    """
    Create faiss index and store the embeddings
    """

    # Store in FAISS index
    dim = len(embeddings[0]) # Dimension of the embedding
    index = faiss.IndexFlatL2(dim) # L2 distance index
    
    return index

In [None]:
# # Load the model
# model = load_model()

In [None]:
# embeddings = embed_chunks(chunks, model) # Embed the chunks
# index = create_faiss_index(embeddings) # Create the faiss index

# print(index.is_trained) # Check if the index is trained
# index.add(np.array(embeddings)) # Add the embeddings to the index
# print(index.ntotal) # Number of vectors in the index

True
340


In [None]:
# # Save the index to a file
# faiss.write_index(index, "data/faiss_index.index") # Save the index to a file

In [7]:
# Load the faiss index from a file   
def load_faiss_index():
    """
    Load the faiss index from a file.
    """
    index = faiss.read_index("data/faiss_index.index") # Load the index from a file
    return index

### Retrieving form FAISS

In [8]:
# Get top k most similar chunks from the faiss index using a query
def get_top_k_faiss(index, model, query, k):
    """
    Get the top k most similar chunks from the faiss index.
    """
    # Search for the top k most similar chunks
    results = index.search(model.encode([query]), k=k)
    
    return results


##### Chech results from FAISS retrieval

In [None]:
# # Declare given queries
# query1 = "What features does MATLAB offer to help shorten response times and reduce data transmission over the network?"
# query2 = "How did Baker Hughes engineers use MATLAB to develop pump health monitoring software?"
# query3 = "Why is it important for training data in predictive maintenance systems to include instances from both normal and fault conditions?"
# query4 = "What is the recall performance of the proposed ENBANN method in comparison to other methods?"
# query5 = "What is cross-sectional prediction and how can it be applied in estimating component lifespan?"
# query6 = "Why are gas leak detectors important in environments with many pneumatic valves, and what type of detectors are considered non-intrusive?"
# query7 = "What new Industry 4.0 technologies are being used for remote asset monitoring, and what tools support them?"
# query8 = "What does the simulation model of the SUDM policy evaluate, and what assumptions are made about workstation operations?"
# query9 = "How were the prior parameters for the Weibull and exponential degradation models estimated, and what assumptions were made about the error terms?"
# query10 = "How does fuzzy logic contribute to diagnostics in machine failure and maintenance management?"
# query11 = "Why are artificial neural networks suitable for prognostics in machine failure, and what limitations do traditional systems face?"
# query12 = "How do Big Data platforms and CMMS contribute to the formulation of maintenance strategies?"
# query13 = "What is the relationship between diagnostics and prognostics in the context of machine degradation and failure?"

In [None]:
# # Check results for query1
# results = get_top_k_faiss(index, model, query1, k=5) # Get the top 5 most similar chunks

# print(results) # Print the results
# # Print the chuncks of the 5 most similar chunks

# print((results[1][0].tolist())) # Print the indices of the most similar chunks

(array([[0.97686744, 1.0720754 , 1.0859646 , 1.110936  , 1.130651  ]],
      dtype=float32), array([[ 8,  4,  5,  3, 14]], dtype=int64))
[8, 4, 5, 3, 14]


### Retreaving with BM25

In [10]:
# Get BM25 retriever
def get_bm25_retriever(chunks):
    """
    Get BM25 retriever.
    """
    # Tokenize the chunks
    tokenized_corpus = [doc.split(" ") for doc in chunks]
    
    # Create the BM25 retriever
    retriever = BM25Okapi(tokenized_corpus)
    
    return retriever

In [17]:
# Get results with BM25 method
def get_top_k_bm25(retriever, query, chunks, k):
    """
    Get the top k most similar chunks from the BM25 retriever.
    """
    # Tokenize the query
    tokenized_query = query.split(" ")
    
    # Get the top k most similar chunks
    results = retriever.get_top_n(tokenized_query, chunks, n=k)
    
    return results


##### Chech results from BM25 retrieval

In [None]:
# retriever = get_bm25_retriever(chunks) # Get the BM25 retriever

In [None]:
# results_bm = get_top_k_bm25(retriever, query1, k=5) # Get the top 5 most similar chunks

# # Print the results
# print(results_bm[0])

# index_bm = [chunks.index(result) for result in results_bm]

# print(index_bm)

Read user story 
Develop Predictive Models
Interactively train and evaluate predictive models using the 
Classification Learner app.
6
Predictive Maintenance with MATLAB
Once you’ve developed your models, you want to get them up 
and running as quickly as possible.  MATLAB integrates into 
enterprise systems, clusters, and clouds, and can be targeted to 
real-time embedded hardware.
To shorten response times and send less data over the network, 
you can deploy the models directly on machines.
[8, 159, 106, 6, 20]


### Compare 2 results and get final retrieved chunks

In [30]:
def get_top_5_chunks(chunks, query, index, model, retriever, k):
    """
    Get the top k most similar chunks from the chunks.
    """
    
    results_faiss = get_top_k_faiss(index, model, query, k*2) # Get the top k most similar chunks from the faiss index
    index_faiss = results_faiss[1][0].tolist() # Get the indexes of the chunks from the faiss retriever
    
    results_bm = get_top_k_bm25(retriever, query, chunks, k*2) # Get the top k most similar chunks from the BM25 retriever
    index_bm = [chunks.index(result) for result in results_bm] # Get the indexes of the chunks from the BM25 retriever
    
    indexes = list(filter(lambda x: x in index_faiss, index_bm)) # Get the common indexes from both retrievers
    
    # Handle the case when there are less than 5 common indexes
    if len(indexes) < 5:
        return get_top_5_chunks(chunks, query, index, model, retriever, k+5) # If there are less than 5 common indexes, get more chunks
    
    indexes = indexes[:5] # Get the top 5 indexes
    
    chunks = [chunks[i] for i in indexes] # Get the chunks from the indexes
    
    return chunks, indexes # Return the chunks and indexes
    

In [None]:
# chunks = get_chunks_from_csv() # Get the chunks from the csv file

# chunks_res, index_res = get_top_5_chunks(chunks, query1, k=5) # Get the top 5 most similar chunks from the chunks

# print(index_res) # Print the indexes of the most similar chunks

In [None]:
# def __main__():
#     # Get the chunks from the csv file
#     chunks = get_chunks_from_csv()
    
#     # Load the model
#     model = load_model()
    
#     # Load faiss index
#     index = load_faiss_index()
    
#     # Get BM25 retriever
#     retriever = get_bm25_retriever(chunks)
    
#     chunks_res, chunks_index = get_top_5_chunks(chunks, query2, index, model, retriever, k=5) # Get the top 5 most similar chunks from the chunks
    
#     # Print the results
#     print(f"Indexes: {chunks_index}") # Print the indexes of the most similar chunks
#     print("Top 5 chunks:")
#     for i, chunk in enumerate(chunks_res):
#         print(f"Chunk {i+1}: {chunk}")
    
    
    
# if __name__ == "__main__":
#     __main__()

Indexes: [12, 11, 10, 14, 13]
Top 5 chunks:
Chunk 1: needed to determine when a pump was about to fail. Too-frequent 
maintenance wasted effort and resulted in still-usable parts being 
replaced, while too-infrequent maintenance risked damaging 
pumps beyond repair.
Working in MATLAB, Baker Hughes engineers developed 
pump health monitoring software that applies machine learning 
techniques in real time to predict the ideal time to perform 
maintenance. They processed and analyzed up to a terabyte
Chunk 2: 7
Predictive Maintenance with MATLAB
Industry Example
Truck with positive displacement pump.
Baker Hughes Develops Predictive Maintenance Software for Gas 
and Oil Extraction Equipment
Baker Hughes trucks are equipped with positive displacement 
pumps that inject a mixture of water and sand at high pressures 
deep into drilled wells. With pumps accounting for about 
$100,000 of the $1.5 million total cost of the truck, Baker Hughes
Chunk 3: machines nonstop, even on Christmas, and we

In [None]:
def ask(query):
    """
    Ask a question and get the answer.
    """
    # Get the chunks from the csv file
    chunks = get_chunks_from_csv()
    
    # Load the model
    model = load_model()
    
    # Load faiss index
    index = load_faiss_index()
    
    # Get BM25 retriever
    retriever = get_bm25_retriever(chunks)
    
    chunks_res, chunks_index = get_top_5_chunks(chunks, query, index, model, retriever, k=5) # Get the top 5 most similar chunks from the chunks
    
    for i, chunk in enumerate(chunks_res):
        chunks_res[i] = "Chunk " + str(i+1) + ":\n" + chunk + "\n"# Add the chunk number to the chunk
    
    # chunks_res = "\n".join(chunks_res) # Join the chunks to get the final result
    
    return chunks_res

'Chunk 1\nneeded to determine when a pump was about to fail. Too-frequent \nmaintenance wasted effort and resulted in still-usable parts being \nreplaced, while too-infrequent maintenance risked damaging \npumps beyond repair.\nWorking in MATLAB, Baker Hughes engineers developed \npump health monitoring software that applies machine learning \ntechniques in real time to predict the ideal time to perform \nmaintenance. They processed and analyzed up to a terabyte\n\nChunk 2\n7\nPredictive Maintenance with MATLAB\nIndustry Example\nTruck with positive displacement pump.\nBaker Hughes Develops Predictive Maintenance Software for Gas \nand Oil Extraction Equipment\nBaker Hughes trucks are equipped with positive displacement \npumps that inject a mixture of water and sand at high pressures \ndeep into drilled wells. With pumps accounting for about \n$100,000 of the $1.5 million total cost of the truck, Baker Hughes\n\nChunk 3\nmachines nonstop, even on Christmas, and we rely on our MATLAB b