# **Langchain**

In [1]:
!pip install transformers langchain langchain-community PyPDF2 faiss-cpu sentence-transformers accelerate pdfminer.six

Collecting langchain
  Downloading langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.0-py3-none-any.whl.metadata (2.8 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting langchain-core<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_core-0.3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.125-py3-none-any.whl.metadata (13 k

In [2]:
# Cell 2: Verify installations and check GPU availability
import transformers
import langchain
import PyPDF2
import faiss
import sentence_transformers
import accelerate
import pdfminer
import torch

print("All libraries imported successfully!")

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Using GPU for computations.")
else:
    device = torch.device("cpu")
    print("GPU is not available. Using CPU for computations.")


All libraries imported successfully!
GPU is available. Using GPU for computations.


In [3]:
# Cell 3: Upload PDF to Colab
from google.colab import files

# Prompt the user to upload a PDF file
uploaded = files.upload()

# Assuming you uploaded 'Linux-Tutorial.pdf', replace with your file name if different
import os

# List uploaded files
for filename in uploaded.keys():
    print(f'User uploaded file "{filename}" with length {len(uploaded[filename])} bytes')

# Set the PDF path (modify if your file has a different name)
pdf_path = list(uploaded.keys())[0]


Saving NIPS-2017-attention-is-all-you-need-Paper.pdf to NIPS-2017-attention-is-all-you-need-Paper.pdf
User uploaded file "NIPS-2017-attention-is-all-you-need-Paper.pdf" with length 569417 bytes


In [4]:
# Cell 4: Define helper functions for PDF processing and text chunking
import PyPDF2
from pdfminer.high_level import extract_text

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text from the PDF.
    """
    try:
        # Try using PyPDF2 first
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ''
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                extracted_text = page.extract_text()
                if extracted_text:
                    text += extracted_text + "\n"
        if not text.strip():
            # If PyPDF2 fails to extract text, fallback to pdfminer.six
            print("PyPDF2 failed to extract text. Using pdfminer.six as fallback.")
            text = extract_text(pdf_path)
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

def chunk_text(text, max_chars=1000):
    """
    Splits text into smaller chunks based on the maximum number of characters.

    Args:
        text (str): The text to be chunked.
        max_chars (int, optional): Maximum number of characters per chunk. Defaults to 1000.

    Returns:
        list: A list of text chunks.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        word_length = len(word) + 1  # +1 for space
        if current_length + word_length > max_chars:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


In [5]:
# Cell 5: Extract text from PDF and create chunks
# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)
print(f"Extracted {len(pdf_text)} characters from the PDF.")

# Chunk the text
chunks = chunk_text(pdf_text, max_chars=1000)
print(f"Total chunks created: {len(chunks)}")

# (Optional) Display the first chunk for verification
print("\nFirst chunk of text:")
print(chunks[0] if chunks else "No chunks created.")


Extracted 32556 characters from the PDF.
Total chunks created: 33

First chunk of text:
Attention Is All You Need Ashish Vaswani Google Brain avaswani@google.comNoam Shazeer Google Brain noam@google.comNiki Parmar Google Research nikip@google.comJakob Uszkoreit Google Research usz@google.com Llion Jones Google Research llion@google.comAidan N. Gomezy University of Toronto aidan@cs.toronto.eduŁukasz Kaiser Google Brain lukaszkaiser@google.com Illia Polosukhinz illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while b

In [6]:
# Cell 6: Initialize embeddings and create FAISS vector store
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Initialize Hugging Face embeddings using a pre-trained model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a FAISS vector store from the text chunks
vector_store = FAISS.from_texts(chunks, embedding_model)
print("FAISS vector store created successfully.")



  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS vector store created successfully.


In [7]:
# Cell 17: Connect to Hugging Face

# Install the huggingface_hub library if not already installed
!pip install --upgrade huggingface_hub

from huggingface_hub import notebook_login

# Prompt the user to log in to Hugging Face
notebook_login()


Collecting huggingface_hub
  Downloading huggingface_hub-0.25.1-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.25.1-py3-none-any.whl (436 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m436.4/436.4 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.24.7
    Uninstalling huggingface-hub-0.24.7:
      Successfully uninstalled huggingface-hub-0.24.7
Successfully installed huggingface_hub-0.25.1


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# Cell 7: Load Mistral 7B model and set up the text generation pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Specify the Mistral 7B model identifier from Hugging Face
model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Replace with the correct model name if different

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model with appropriate settings
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

# Set up the text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device.index if device.type == 'cuda' else -1,  # Set device index for GPU or -1 for CPU
    max_new_tokens=200,  # Adjust as needed
    temperature=0.7,      # Adjust for creativity
    top_p=0.9,
    do_sample=True
)

print("Mistral 7B model loaded and text generation pipeline configured.")


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



Mistral 7B model loaded and text generation pipeline configured.


In [10]:
# Cell 8: Wrap the pipeline with LangChain's HuggingFacePipeline
from langchain.llms import HuggingFacePipeline

# Wrap the Hugging Face pipeline with LangChain
local_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

print("HuggingFacePipeline wrapped with LangChain.")


HuggingFacePipeline wrapped with LangChain.


  local_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [11]:
# Cell 9: Create the RetrievalQA chain using the local LLM and FAISS vector store
from langchain.chains import RetrievalQA

# Create a RetrievalQA chain using the 'stuff' chain type
retrieval_qa = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 5})  # Retrieve top 5 relevant chunks
)

print("RetrievalQA chain created successfully.")


RetrievalQA chain created successfully.


In [12]:
# Cell 10: Define the enhanced prompt template
from langchain.prompts import PromptTemplate

# Define a detailed and instructive prompt template
template = """
You are an intelligent assistant trained to answer questions based solely on the provided context.

**Instructions:**
1. **Always** base your answers strictly on the information contained within the context.
2. If the query **directly relates** to the context, provide a detailed and accurate answer.
3. If the query **does not relate** to the context, respond with: "I don't know" or "I'm sorry, but I cannot provide information on that topic."

**Context:**
{context}

**Query:**
{question}

**Response:**
"""

# Create a PromptTemplate instance with the new template
prompt_template = PromptTemplate(input_variables=["context", "question"], template=template)

print("Enhanced prompt template defined.")


Enhanced prompt template defined.


In [13]:
# Cell 7a: Define the truncate_context function

def truncate_context(context, tokenizer, max_tokens=4096 - 200):
    """
    Truncates the context to ensure it does not exceed the specified token limit.

    Args:
        context (str): The context string to be truncated.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to encode the context.
        max_tokens (int, optional): The maximum number of tokens allowed. Defaults to 3896.

    Returns:
        str: The truncated context string.
    """
    # Encode the context without truncation to get the full token list
    tokens = tokenizer.encode(context, add_special_tokens=False)

    # Check if the number of tokens exceeds the maximum allowed
    if len(tokens) > max_tokens:
        # Keep only the last 'max_tokens' tokens to preserve the most recent context
        tokens = tokens[-max_tokens:]
        # Decode the tokens back to a string, skipping special tokens
        context = tokenizer.decode(tokens, skip_special_tokens=True)

    return context


In [14]:
# Cell 11: Define a function to process queries using the RetrievalQA chain
def process_query(query):
    """
    Processes a user query by retrieving relevant context and generating an answer.

    Args:
        query (str): The user's question.

    Returns:
        str: The generated answer.
    """
    # Retrieve top 5 relevant chunks
    retrieved_chunks = retrieval_qa.retriever.get_relevant_documents(query)[:5]
    context = "\n".join([doc.page_content for doc in retrieved_chunks])

    # Truncate context if necessary
    context = truncate_context(context, tokenizer, max_tokens=4096 - 200)  # Adjust based on model's max tokens

    # Format the prompt
    formatted_query = prompt_template.format(context=context, question=query)

    # Perform Retrieval-Augmented Generation (RAG) on the formatted query
    result = retrieval_qa.run(formatted_query)

    return result
    print(result)


In [15]:


# Example usage
query = "What is transformers?"
answer = process_query(query)
print(answer)


  retrieved_chunks = retrieval_qa.retriever.get_relevant_documents(query)[:5]
  result = retrieval_qa.run(formatted_query)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section 3.2. Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 22, 23, 19]. End-to-end memory networks are based on a recurrent attention mechanism instead of sequence- aligned recurrence and have been shown to perform well on simple-language question answering and languag

# **LlamaIndex**

In [33]:
from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="QuantFactory/Qwen2.5-0.5B-Instruct-GGUF",
	filename="Qwen2.5-0.5B-Instruct.Q2_K.gguf",
)

llm.create_chat_completion(
	messages = [
		{
			"role": "user",
			"content": "what is the most big country in the world ?"
		}
	]
)


llama_model_loader: loaded meta data with 32 key-value pairs and 290 tensors from /root/.cache/huggingface/hub/models--QuantFactory--Qwen2.5-0.5B-Instruct-GGUF/snapshots/be5d6f897c45491ea16c56f84f652010c6518507/./Qwen2.5-0.5B-Instruct.Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Models
llama_model_loader: - kv   3:                         general.size_label str              = 494M
llama_model_loader: - kv   4:                            general.license str              = apache-2.0
llama_model_loader: - kv   5:                       general.license.link str              = https://huggingface.co/Qwen/Qwen2.5-0...
ll

{'id': 'chatcmpl-f315d82e-b87f-4552-a13a-1990a529e9df',
 'object': 'chat.completion',
 'created': 1727114497,
 'model': '/root/.cache/huggingface/hub/models--QuantFactory--Qwen2.5-0.5B-Instruct-GGUF/snapshots/be5d6f897c45491ea16c56f84f652010c6518507/./Qwen2.5-0.5B-Instruct.Q2_K.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': "As an AI developed by Alibaba Cloud, I'm here to provide information and not to make predictions about the world's most populous countries. The most populous country in the world is India, with approximately 1.2 billion people. India is a diverse country with a rich cultural and linguistic diversity, and has a population growth rate of approximately 2.5% annually."},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 39, 'completion_tokens': 74, 'total_tokens': 113}}

In [39]:
!pip install PyPDF2



In [42]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

pdf_path = "/content/NIPS-2017-attention-is-all-you-need-Paper.pdf"
pdf_text = extract_text_from_pdf(pdf_path)



In [43]:
import re

def preprocess_text(text):
    # Example preprocessing: remove non-alphanumeric characters and split into sentences
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    sentences = text.split('. ')
    return sentences

preprocessed_text = preprocess_text(pdf_text)


In [44]:
def retrieve_relevant_info(sentences, query):
    query_keywords = query.lower().split()
    relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in query_keywords)]
    return relevant_sentences

query = "What is transformer ?"
relevant_info = retrieve_relevant_info(preprocessed_text, query)


In [47]:
from llama_cpp import Llama

def generate_response(model, relevant_info, query, context_window_size=512):
    input_text = " ".join(relevant_info) + " " + query
    # Split the input text into chunks that fit within the context window size
    chunks = [input_text[i:i + context_window_size] for i in range(0, len(input_text), context_window_size)]
    responses = []
    for chunk in chunks:
        response = model.create_chat_completion(
            messages=[
                {
                    "role": "user",
                    "content": chunk
                }
            ]
        )
        responses.append(response['choices'][0]['message']['content'])
    return " ".join(responses)

# Load the model
model = Llama.from_pretrained(
    repo_id="QuantFactory/Qwen2.5-0.5B-Instruct-GGUF",
    filename="Qwen2.5-0.5B-Instruct.Q2_K.gguf",
)

# Generate a response
response = generate_response(model, relevant_info, query)
print(response)


llama_model_loader: loaded meta data with 32 key-value pairs and 290 tensors from /root/.cache/huggingface/hub/models--QuantFactory--Qwen2.5-0.5B-Instruct-GGUF/snapshots/be5d6f897c45491ea16c56f84f652010c6518507/./Qwen2.5-0.5B-Instruct.Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Models
llama_model_loader: - kv   3:                         general.size_label str              = 494M
llama_model_loader: - kv   4:                            general.license str              = apache-2.0
llama_model_loader: - kv   5:                       general.license.link str              = https://huggingface.co/Qwen/Qwen2.5-0...
ll

KeyboardInterrupt: 