In [2]:
# Install the third-party packages used by this notebook. No versions are pinned,
# so a fresh run may resolve to different releases than the ones in the log below.
# NOTE(review): PyPDF2 is not imported in any cell visible here — confirm it is still needed.
# NOTE(review): google-colab is normally provided by the Colab runtime itself — confirm
# that installing it from PyPI is intended.
!pip install langchain
!pip install chromadb
!pip install llama-cpp-python
!pip install pandas
!pip install PyPDF2
!pip install google-colab

Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.32 (from langchain)
  Downloading langchain_core-0.2.33-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.101-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.32->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp31

In [6]:
# Upgrade (or install) langchain-community to the newest available release.
# NOTE(review): presumably required because the langchain.embeddings /
# langchain.vectorstores / langchain.llms imports in the next cell are served by
# this package — confirm, and consider pinning a version.
!pip install -U langchain-community



In [6]:

import tempfile
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from google.colab import files
import requests

In [None]:
# Function to download model
def download_model(url, dest_path, chunk_size=8192, timeout=60):
    """Stream the file at ``url`` to ``dest_path``.

    The payload is first written to ``<dest_path>.part`` and only renamed to
    ``dest_path`` once the whole body has been received. Callers that use
    "does ``dest_path`` exist?" as a cache check therefore never mistake a
    truncated download or an HTTP error page for a complete model file.

    Args:
        url: HTTP(S) location of the file to fetch.
        dest_path: Local path the finished download is stored at.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        Exception: Any other network or file-system error is propagated after
            the partial file has been removed.
    """
    partial_path = f"{dest_path}.part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail before touching the disk instead of saving an error page.
            response.raise_for_status()
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip empty keep-alive chunks
                        file.write(chunk)
        # Atomic on the same file system: dest_path appears only when complete.
        os.replace(partial_path, dest_path)
    except BaseException:
        # Also covers KeyboardInterrupt so an aborted cell leaves no debris.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

# Local file name of the GGUF model and the remote URL it is fetched from
model_path = "zephyr-7b-alpha.Q4_K_M.gguf"
model_url = "https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q4_K_M.gguf"

# Skip the download entirely when a file with that name is already present
if not os.path.exists(model_path):
    print("Downloading model...")
    download_model(url=model_url, dest_path=model_path)
    print("Model downloaded.")

In [None]:


# Prompt templates
# Initial prompt: instructs the model to write exam-practice questions about the
# passage that is substituted for the {text} placeholder.
prompt_template_questions = """
You are an expert in creating practice questions based on study material.
Your goal is to prepare a student for their exam. You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the student for their exam. Make sure not to lose any important information.

QUESTIONS:
"""

# Wrap the raw string in a PromptTemplate whose single input variable is "text".
PROMPT_QUESTIONS = PromptTemplate(template=prompt_template_questions, input_variables=["text"])

# Follow-up prompt: receives the questions produced so far ({existing_answer})
# together with a further piece of the passage ({text}) and asks the model to
# refine the questions, or to return them unchanged if the new text does not help.
# NOTE(review): the sentence "We have the option to refine ... (only if necessary)
# with some more context below." is split awkwardly across two lines of the prompt;
# confirm whether the wording is intentional before changing it, since editing the
# string changes what the model is asked.
refine_template_questions = """
You are an expert in creating practice questions based on study material.
Your goal is to help a student prepare for an exam.
We have received some practice questions to a certain extent: {existing_answer}.
We have the option to refine the existing questions or add new ones.
(only if necessary) with some more context below.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.

QUESTIONS:
"""

# PromptTemplate with two input variables: the running answer and the new text.
REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template_questions,
)



# Ask the user to type or paste the source passage
text_question_gen = input("Please enter the passage you want to generate questions from: ")

# Cut the passage into chunks (10000 characters max, 50 overlapping) ...
text_splitter_question_gen = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=50,
)
text_chunks_question_gen = text_splitter_question_gen.split_text(text_question_gen)

# ... and wrap every chunk in a Document for the chains that consume them
docs_question_gen = [
    Document(page_content=chunk_text) for chunk_text in text_chunks_question_gen
]

# Generate questions from the passage, answer each one with retrieval-augmented
# QA over the same passage, and export the question/answer pairs as a CSV file.
# Any failure in the pipeline is reported as a one-line message.
try:
    # A blank passage produces no documents; stop with a clear message instead
    # of whatever error the refine chain would raise on an empty document list.
    if not docs_question_gen:
        raise ValueError("The passage is empty; please enter some text to generate questions from.")

    # Initialize the LLM for question generation
    llm_question_gen = LlamaCpp(
        streaming=True,
        model_path=model_path,
        temperature=0.75,
        top_p=1,
        verbose=True,
        n_ctx=4096
    )

    # Create the question generation chain: the first chunk is processed with
    # PROMPT_QUESTIONS, every further chunk with REFINE_PROMPT_QUESTIONS.
    question_gen_chain = load_summarize_chain(
        llm=llm_question_gen,
        chain_type="refine",
        verbose=True,
        question_prompt=PROMPT_QUESTIONS,
        refine_prompt=REFINE_PROMPT_QUESTIONS
    )

    # Generate questions from the text
    questions = question_gen_chain.run(docs_question_gen)

    # Reuse the already-loaded model for answer generation. The second LlamaCpp
    # instance that used to be created here had exactly the same settings, so
    # loading the same GGUF file again only doubled the memory footprint.
    llm_answer_gen = llm_question_gen

    # Create embeddings and vector store
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
    vector_store = Chroma.from_documents(docs_question_gen, embeddings)

    # Create the QA chain. The number of retrieved chunks has to be passed via
    # `search_kwargs`; a bare `k=2` keyword is not read by the retriever, so the
    # intended top-2 limit was never applied.
    answer_gen_chain = RetrievalQA.from_chain_type(
        llm=llm_answer_gen,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 2})
    )

    # The model returns one block of text; treat every non-blank line as a question.
    question_list = questions.split("\n")

    question_answer_pairs = []

    for question in question_list:
        if question.strip():  # Avoid processing empty questions
            print("Question: ", question)
            answer = answer_gen_chain.run(question)
            question_answer_pairs.append([question, answer])
            print("Answer: ", answer)
            print("--------------------------------------------------\n\n")

    # Save the questions and answers to a CSV file
    answers_dir = os.path.join(tempfile.gettempdir(), "answers")
    os.makedirs(answers_dir, exist_ok=True)

    qa_df = pd.DataFrame(question_answer_pairs, columns=["Question", "Answer"])
    csv_file_path = os.path.join(answers_dir, "questions_and_answers.csv")
    qa_df.to_csv(csv_file_path, index=False)

    # Download the CSV file (google.colab helper; triggers a browser download)
    files.download(csv_file_path)

except Exception as e:
    print(f"An error occurred: {e}")


Downloading model...
