In [2]:
import torch
# Ask PyTorch to release any cached CUDA memory before the models are loaded.
# NOTE(review): presumably only matters when this cell is re-run in a live kernel — confirm.
torch.cuda.empty_cache()
# Install the notebook's dependencies. No versions are pinned on any of these
# lines, so each run resolves whatever release is current at that moment.
!pip install -U bitsandbytes
# NOTE(review): `datasets` is listed twice on this line.
!pip install transformers datasets accelerate peft datasets
# NOTE(review): `transformers` and `peft` are requested again here (with -U),
# after the previous line already installed them.
!pip install -qU langchain tiktoken langchain_community langchain_chroma langchain-huggingface huggingface-hub sentence_transformers chromadb langchainhub transformers peft
# flash-attn is installed with build isolation disabled.
!pip install flash-attn --no-build-isolation

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version

In [3]:
from langchain_huggingface import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)
from peft import PeftModel, PeftConfig
from IPython.display import display, Markdown
from google.colab import drive
# Mount Google Drive at /content/drive; the Chroma persist directory used later
# in this notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Function to calculate the total number of tokens in the vector database
def count_total_tokens_in_vectorstore(vectorstore, tokenizer):
    """Return the combined token count of every document held in ``vectorstore``.

    Args:
        vectorstore: Object whose ``get()`` returns a mapping with a
            ``'documents'`` entry containing the stored document strings.
        tokenizer: Object exposing ``encode(text)``; the length of its result
            is taken as the token count of ``text``.

    Returns:
        int: Sum of ``len(tokenizer.encode(doc))`` over all stored documents
        (``0`` when the store holds no documents).
    """
    stored_texts = vectorstore.get()['documents']
    # Each stored entry is already a plain string, so it can be encoded directly.
    return sum(len(tokenizer.encode(text)) for text in stored_texts)

In [6]:
# Check the available device first so that the embedding model and the LLM are
# both placed on the detected device instead of assuming CUDA is present.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using device: {device}")

# Check for bf16 support by trying to allocate a bf16 tensor on the device.
is_bf16_support = False
try:
    tensor_bf16 = torch.tensor([1.0], dtype=torch.bfloat16, device=device)
    is_bf16_support = True
    print("bf16 tensors are supported.")
except (TypeError, RuntimeError):
    # NOTE(review): backends lacking bf16 are assumed to raise one of these two
    # exception types — confirm against the torch version in use.
    print("bf16 tensors are not supported.")

# Initialize embeddings
embedding_model_name = "BAAI/bge-small-en-v1.5"
embedding_model_kwargs = {"device": device}
embedding_encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=embedding_encode_kwargs
)

# Initialize vector store and retriever (top-3 documents per query)
vectorstore = Chroma(
    persist_directory="/content/drive/MyDrive/RAG/vector1",
    embedding_function=hf
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Quantization configuration: 4-bit NF4 with double quantization. The compute
# dtype follows the bf16 probe above rather than being hard-coded to bf16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if is_bf16_support else torch.float16
)

# Load the base model and tokenizer
base_model = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the fine-tuned Phi3 mini model with LoRA
model = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config,return_dict=True, device_map=device)

qlora_model = PeftModel.from_pretrained(model, "KunalRaghuvanshi/phi3_mini_qlora_chemical_eng")

# Generation budget: one tenth of the total token count of the vector store.
# NOTE(review): this grows with the corpus and may exceed the model's context
# window — consider a fixed cap.
max_new_tokens = count_total_tokens_in_vectorstore(vectorstore, tokenizer) // 10

# Bind the pipeline instance to its own name. Re-using the name `pipeline`
# would shadow the imported transformers factory and break a re-run of this cell.
text_generation_pipeline = pipeline(
    "text-generation",
    model=qlora_model,
    tokenizer=tokenizer,
    max_new_tokens=max_new_tokens
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



Using device: cuda
bf16 tensors are supported.


tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/2.18k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/31.5M [00:00<?, ?B/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

In [10]:
# Define the RAG Chat Model class
class RAGChatModel:
    """Retrieval-augmented chat wrapper around a retriever, an LLM and a tokenizer.

    The model keeps a running count of all context tokens it has put into
    prompts (``current_token_count``). While that count stays within
    ``max_token_limit`` the retrieved page contents are included in the prompt;
    once it is exceeded only the page numbers are passed on. The count is never
    reset by this class, so the budget spans the lifetime of the instance.
    """

    def __init__(self, retriever, llm, tokenizer, max_token_limit=None):
        """Store the collaborators and prepare the two prompt templates.

        Args:
            retriever: Object with ``invoke(query)`` returning documents that
                expose ``metadata['page']`` and ``page_content``.
            llm: Object with ``generate([prompt])`` whose result exposes
                ``generations[0][0].text``.
            tokenizer: Object with ``encode(text)`` used for token counting.
            max_token_limit: Context-token budget. When ``None`` (default) it
                is derived on construction as one tenth of the token count of
                ``retriever.vectorstore``. Computing it here, rather than in
                the signature, avoids evaluating it against notebook globals
                at class-definition time.
        """
        if max_token_limit is None:
            # Assumes the retriever exposes its backing store as `.vectorstore`
            # (true for retrievers built with `vectorstore.as_retriever()`).
            max_token_limit = count_total_tokens_in_vectorstore(retriever.vectorstore, tokenizer) // 10

        self.retriever = retriever
        self.llm = llm
        self.tokenizer = tokenizer
        self.max_token_limit = max_token_limit
        self.current_token_count = 0
        # Prompt used while the token budget has not been exceeded.
        self.template_standard = """
        <|system|>
        Answer the question in detail. Provide all the relevant information based on the provided context.
        It is critical that you mention all page numbers where this information is found. Do not skip any page numbers.


        Context: {context}

        Providing all the page numbers is essential  for the answer.
        <|end|>

        <|user|>
        Question: {question}
        <|end|>

        <|assistant|>
        """
        # Prompt used once the token budget is exhausted (page numbers only).
        self.template_exceeded = """
        <|system|>
        Answer the question in detail; warn that information is not taken from the prescribed textbook and must provide the page numbers where they can find the correct information in the prescribed textbook.

        Context: {context}
        Providing all the page numbers is essential for the answer.
        <|end|>

        <|user|>
        Question: {question}
        <|end|>

        <|assistant|>
        """

    def num_tokens_from_string(self, string: str) -> int:
        """Returns the number of tokens in a text string using the tokenizer."""
        return len(self.tokenizer.encode(string))

    def format_docs(self, docs, full_content=True):
        """Format the documents to be used as context in the prompt.

        ``metadata['page']`` is shifted by one in both modes, so the full
        context and the page-number-only context cite the same page numbers.

        Args:
            docs: Iterable of documents with ``metadata['page']`` and
                ``page_content``.
            full_content: When true, include each page's text; otherwise emit
                only a comma-separated list of page numbers.

        Returns:
            str: The context string to substitute into a prompt template.
        """
        if full_content:
            return "\n\n".join(
                f"Information in Page number: {doc.metadata['page'] + 1}\n{doc.page_content}"
                for doc in docs
            )
        return "Information available in prescribed textbook " + ", ".join(
            f"Page number: {doc.metadata['page'] + 1}" for doc in docs
        )

    def get_prompt(self, docs, question):
        """Generate the prompt based on token count and context formatting.

        Adds the token count of the full-content context to the running total
        and picks the template according to whether the total now exceeds
        ``max_token_limit``.

        Returns:
            str: The formatted prompt.
        """
        # Format the context with full content
        context = self.format_docs(docs, full_content=True)
        total_tokens_in_context = self.num_tokens_from_string(context)

        # Add tokens to the running total
        self.current_token_count += total_tokens_in_context

        # Decide whether to use full content or only page numbers
        if self.current_token_count > self.max_token_limit:
            print("Token limit exceeded. Information from prescribed textbook will not be used.")
            # Reformat context to include only page numbers
            context = self.format_docs(docs, full_content=False)
            template = self.template_exceeded
        else:
            template = self.template_standard

        # Create the prompt
        return template.format(context=context, question=question)

    def extract_clean_answer(self, raw_output):
        """Extract only the answer from the raw output.

        Returns the text after the last ``<|assistant|>`` tag, or the whole
        output when the tag is absent; surrounding whitespace is stripped.
        """
        assistant_tag = "<|assistant|>"
        if assistant_tag in raw_output:
            return raw_output.split(assistant_tag)[-1].strip()
        return raw_output.strip()

    def generate_answer(self, question):
        """Retrieve relevant docs for ``question`` and return the cleaned answer text."""
        # Add fixed request for page numbers to the user's question
        question_with_page_request = f"{question}. Please provide the page numbers in your answer."

        # Retrieve relevant documents
        docs = self.retriever.invoke(question_with_page_request)

        # Generate prompt based on token count
        prompt = self.get_prompt(docs, question_with_page_request)

        # Pass the prompt to the LLM and take the first generation's text
        result = self.llm.generate([prompt])
        raw_answer = result.generations[0][0].text

        return self.extract_clean_answer(raw_answer)

    def ask_question(self, question):
        """Main function to retrieve relevant docs and generate a response.

        Renders the answer as Markdown in the notebook and returns ``None``.
        """
        display(Markdown(self.generate_answer(question)))


In [11]:
# Initialize the RAGChatModel from the retriever, LLM and tokenizer created in
# the setup cell; max_token_limit is left at its default.
rag_chat_model = RAGChatModel(retriever, llm, tokenizer)

In [9]:
# Gradio is imported by the web-UI cell of this notebook; the version is not pinned.
!pip install gradio

Collecting gradio
  Downloading gradio-5.0.2-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.0 (from gradio)
  Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.6.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting tomlkit==0.12.0 (from gradio)
  Downloading tomlkit-0.12.0-py3-none-any.whl.metadata (2.7 kB)
Collecting websocket

In [12]:
import gradio as gr

# Function to shorten the question for the chat history display
def get_short_overview(question, answer, max_length=50):
    """Return ``question`` shortened for the chat-history panel.

    Questions longer than ``max_length`` characters are cut to ``max_length``
    characters and suffixed with ``'...'``; shorter ones are returned as is.
    ``answer`` is accepted but not used.
    """
    if len(question) <= max_length:
        return question
    head = question[:max_length]
    return head + '...'

# Function for the RAG model interaction
def ask_question_gradio(history, question):
    """Answer ``question`` with the RAG model and update the Gradio chat state.

    Args:
        history: List of ``{"role": ..., "content": ...}`` message dicts,
            alternating user and assistant. It is extended in place with the
            new exchange.
        question: Text typed by the user. ``None``, empty and whitespace-only
            input is ignored (no retrieval or generation is run).

    Returns:
        tuple: ``(history, chat_history, "")`` where ``chat_history`` is the
        overview text (one shortened question per exchange, separated by blank
        lines) and the trailing empty string clears the input box. The
        overview is rebuilt from ``history`` on every call, so an empty submit
        no longer blanks the overview panel.
    """
    question = (question or "").strip()

    if question:
        # Add fixed request for page numbers to the user's question
        question_with_page_request = f"{question}. Please provide the page numbers in your answer."

        # Retrieve relevant documents using the RAG model
        docs = rag_chat_model.retriever.invoke(question_with_page_request)
        prompt = rag_chat_model.get_prompt(docs, question_with_page_request)

        # Generate the response
        result = rag_chat_model.llm.generate([prompt])
        raw_answer = result.generations[0][0].text
        clean_answer = rag_chat_model.extract_clean_answer(raw_answer)

        # Add the question and answer to the conversation history as dicts
        history.append({"role": "user", "content": question})
        history.append({"role": "assistant", "content": clean_answer})

    # Pair each user message with the assistant message that follows it and
    # format the shortened questions for the overview panel.
    chat_history = "\n\n".join(
        get_short_overview(user_msg['content'], bot_msg['content'])
        for user_msg, bot_msg in zip(history[::2], history[1::2])
    )

    # Return the updated history and chat history for display
    return history, chat_history, ""  # The empty string clears the input box

# Create Gradio Blocks interface
with gr.Blocks() as demo:
    # Page header.
    gr.Markdown(
        """
        <h1 style='text-align: center;'>Q-ChemNerd</h1>
        <p style='text-align: center;'>Ask any question and get a response from the RAG model.</p>
        """,
    )

    with gr.Row():
        # Left column: read-only overview of the questions asked so far.
        with gr.Column(scale=1, min_width=200):
            gr.Markdown("### Chat History Overview")
            history_display = gr.Textbox(label="Chat History", lines=20, interactive=False)

        # Right column: the chat itself plus the input controls.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="QLora ChemNerd Chat", type='messages')
            user_input = gr.Textbox(placeholder="Ask your question...", label="Type your message here:")
            submit_button = gr.Button("Send")

            history_state = gr.State([])

            # Clicking "Send" and pressing Enter in the textbox run the same
            # handler with the same inputs/outputs, so register both in one loop.
            for register_listener in (submit_button.click, user_input.submit):
                register_listener(
                    ask_question_gradio,
                    inputs=[history_state, user_input],
                    outputs=[chatbot, history_display, user_input],
                    scroll_to_output=True
                )

# Launch the Gradio interface
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4e2005af651a6e89c9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Start the interactive chat
print("Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):")
while True:
  print("\n\n")
  # Strip surrounding whitespace so that " exit " still quits and blank lines
  # are recognised as empty input.
  question = input("Your question: ").strip()
  if not question:
    # Nothing was typed: prompt again instead of running retrieval and
    # generation on an empty query.
    continue
  if question.lower() == 'exit':
    print("Exiting the chat.")
    break
  rag_chat_model.ask_question(question)

Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):



Your question: How many stages are required to distill toluene from decane at atmospheric conditions?


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Information in Page number: 763

To determine the number of stages required to distill toluene from decane at atmospheric conditions, we need to consider the relative volatility of toluene to decane and the desired purity of the toluene product. The relative volatility (α) is a measure of how easily one component can be separated from another in a mixture by distillation and is defined as the ratio of the vapor pressures of the two components.

Given that the relative volatility of toluene to decane is not explicitly provided in the context, we would typically look up this value in a chemical engineering textbook or database. However, for the sake of this exercise, let's assume a hypothetical relative volatility value based on typical values for similar systems.

Assuming a relative volatility (α) of 2.5 for toluene to decane, we can use the Fenske equation to estimate the minimum number of theoretical stages (N_min) required for the distillation process. The Fenske equation is given by:

N_min = log((xD/xB) * (1 - xD)/(xB/xD)) / log(α)

Where:
- N_min is the minimum number of theoretical stages,
- xD is the mole fraction of the more volatile component (toluene) in the distillate,
- xB is the mole fraction of the more volatile component (toluene) in the bottoms,
- α is the relative volatility of toluene to decane.

Assuming a desired distillate purity (xD) of 98% (or 0.98) and a bottoms purity (xB) of 2% (or 0.02), we can plug these values into the Fenske equation:

N_min = log((0.98/0.02) * (1 - 0.98)/(0.02/0.98)) / log(2.5)

Calculating the above expression gives us the minimum number of theoretical stages required for the distillation process. However, without the actual relative volatility value and the specific page numbers where this calculation is detailed, we cannot provide a precise numerical answer.

For a more accurate and detailed calculation, including the specific page numbers where the relative volatility and the Fenske equation application are discussed, please refer to the provided context on Page number: 763. This page likely contains the necessary information to perform the calculation accurately, including the relative volatility of toluene to decane and a step-by-step guide on applying the Fenske equation.

Remember, the actual number of stages required in a real-world distillation process may be higher due to inefficiencies and non-idealities not accounted for in the theoretical minimum. Engineers often design distillation columns with more stages than the minimum calculated to ensure the desired separation efficiency.

In summary, to calculate the number of stages required to distill toluene from decane at atmospheric conditions, one would need to know the relative volatility of toluene to decane and apply the Fenske equation. The specific page numbers (763) in the provided context would guide the reader to the necessary information for performing this calculation.




Your question: What methods are available to capture CO2 from air at dilute concentrations?


The methods available to capture CO2 from air at dilute concentrations are discussed on Page 901. The text mentions that the $\mathrm{C O_{2} / C H_{4}}$ selectivity for various membranes, such as polycarbonate, polysulfone, and cellulose acetate, ranges from 20 to 30 at 35°C and 40 atm. It also states that selectivity over 60 can be achieved with Kapton, although this polymer has much lower permeability. The operating temperature is chosen to be slightly above the dew point of the residue gas to optimize the process. Additionally, the text notes that the permeability of most polymers increases with temperature, but this can lead to a slight decrease in selectivity. The selectivity is also affected by the plasticization effect of CO2, which increases the effective diffusion coefficients for all gases, potentially reducing the selectivity below that of pure-gas data. These details are crucial for understanding the challenges and considerations in designing a system for CO2 capture from air at dilute concentrations.

Page numbers: 901




Your question: To heat a dodecane fluid with steam, what type of heat exchanger should be applied and why?


Based on the provided context, for heating a dodecane fluid with steam, a forced-circulation evaporator should be applied. This recommendation is grounded in the information found on Page number: 491, which discusses the advantages of using a forced-circulation evaporator, particularly when dealing with viscous liquids. The high heat-transfer coefficients associated with forced-circulation evaporators make them highly efficient for such applications. Additionally, the short residence time of the liquid in the tubes (about 1 to 3 seconds) ensures rapid and effective heating, which is ideal for processing dodecane or similar viscous fluids. The use of forced circulation is justified by the need to handle the high viscosity of the final concentrate, as mentioned in the context, which often necessitates the use of expensive materials like nickel in the equipment. This approach not only improves the efficiency of the heating process but also addresses the challenges posed by the physical properties of the fluid being processed.


References:

- Page number: 491. The discussion on the forced-circulation evaporator's efficiency and suitability for viscous liquids, including dodecane, is found here.

- Page number: 516. The context on the application of forced-circulation evaporators for viscous fluids, including the rationale for their use in specific scenarios, is provided.

- Page number: 516. Additional information on the design and operation of forced-circulation evaporators, emphasizing their efficiency in handling viscous liquids, is mentioned.


This detailed explanation, supported by the specific page numbers, outlines the reasons for selecting a forced-circulation evaporator for heating dodecane with steam, highlighting the system's efficiency and suitability for processing viscous fluids.




Your question: what is an ideal gas law?


If you are not using the generate method, you may encounter nonsensical outputs after the 4096th token, as the KV cache needs to be recomputed.
We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)


How do we determine the breakthru point for an absorption bed?

what is entropy from the perspective of a molecule?

