In [37]:
from langchain.document_loaders import PyPDFLoader
from langchain import PromptTemplate, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.callbacks.base import BaseCallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [38]:
import os
current_path = os.getcwd()
documents_path = os.path.join(current_path, "Documents")
lecture_paths = [os.path.join(current_path, "Documents",lecture_name) for lecture_name in os.listdir(documents_path) if lecture_name.endswith('.pdf')]
index_path = os.path.join(current_path,"index")
gpt4all_path = os.path.join(current_path,"models","mistral-7b-openorca.Q4_0.gguf")

In [39]:
os.path.exists(gpt4all_path)

True

In [40]:
texts = []
for lecture_path in lecture_paths:
    documents = PyPDFLoader(lecture_path).load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024,chunk_overlap=64)
    texts = texts + text_splitter.split_documents(documents)
print(len(texts))
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
faiss_index = FAISS.from_documents(texts, embeddings)
faiss_index.save_local(index_path)

412


In [41]:
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [42]:
# load vector store
print("loading indexes")
faiss_index = FAISS.load_local(index_path, embeddings)
print("index loaded")


loading indexes
index loaded


In [43]:
question = "What are the differences between MAR, MCAR, and MNAR"
matched_docs = faiss_index.similarity_search(question, 7)
context = ""
for doc in matched_docs:
    context = context + doc.page_content + " \n\n "

In [44]:
print(len(context))

4176


In [45]:
print(context[:1000])

○Missing Completely At Random (MCAR ) –there is no relationship between the missing 
data mechanism and any values, observed or missing
•Probability of being missing is the same for all values
•e.g. if weighing scale ran out of battery, weight attribute has values MCAR
○Missing At Random (MAR ) –there is a systematic relationship between the propensity of 
missing values and the observed data , but notthe missing data
•missingness can be explained by variables on which you have full information
•e.g. if men are more likely to tell their weight than women, weight is MAR (what is the 
observed variable here?)
○MCAR and MAR are ignorable –enough information is available in the data to allow 
imputing missing values, therefore the missing data mechanism can be ignored
11
Data Engineering -Data Preprocessing I © M.Abuelkheir, GUCTypes of Missing Values 

 47
Data Engineering -ML Primer © M.Abuelkheir, GUCClass Imbalance –A Problem!
Yes No Total 
Yes 90 210 300
No 140 9560 9700
Total 2309770

In [50]:
template = """
Please use the following context to answer questions.
Context: {context}
 - -
Question: {question}
Explain the concept clearly and then give your own analogy
 """

In [47]:
callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
llm = GPT4All(model=gpt4all_path, callback_manager=callback_manager, verbose=True,repeat_last_n=0)
prompt = PromptTemplate(template=template, input_variables=["context", "question"]).partial(context=context)
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [51]:
llm_chain.run(question)


Answer: The differences between MAR, MCAR, and MNAR are as follows:

1. Missing Completely At Random (MCAR): In this case, there is no relationship between the missing data mechanism and any values, observed or missing. The probability of being missing is the same for all values. For example, if a weighing scale runs out of battery, the weight attribute has values MCAR.

2. Missing At Random (MAR): There is a systematic relationship between the propensity of missing values and the observed data, but not the missing data. Missingness can be explained by variables on which you have full information. For example, if men are more likely to tell their weight than women, weight is MAR (what is the observed variable here?).

3. Missing Not at Random (MNAR): There is a relationship between the propensity of a value to be missing and its values. The probability of being missing varies for reasons that are unknown to us. For example, people with the lowest education have missing education level

'\nAnswer: The differences between MAR, MCAR, and MNAR are as follows:\n\n1. Missing Completely At Random (MCAR): In this case, there is no relationship between the missing data mechanism and any values, observed or missing. The probability of being missing is the same for all values. For example, if a weighing scale runs out of battery, the weight attribute has values MCAR.\n\n2. Missing At Random (MAR): There is a systematic relationship between the propensity of missing values and the observed data, but not the missing data. Missingness can be explained by variables on which you have full information. For example, if men are more likely to tell their weight than women, weight is MAR (what is the observed variable here?).\n\n3. Missing Not at Random (MNAR): There is a relationship between the propensity of a value to be missing and its values. The probability of being missing varies for reasons that are unknown to us. For example, people with the lowest education have missing educati