In [None]:
# %pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Setups

### imports

In [1]:
import torch
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import os
import json

### Cuda Setup

In [2]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device name: {torch.cuda.get_device_name()}")
print(f"Device memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print(f"Number of devices: {torch.cuda.device_count()}")
print(f"Current device: {torch.cuda.current_device()}")

CUDA available: True
Device name: NVIDIA GeForce RTX 2060
Device memory: 6.44 GB
Number of devices: 1
Current device: 0


### Lang-smith setup

In [3]:
from dotenv import load_dotenv
load_dotenv()

os.environ["LANGSMITH_TRACING_V2"]="true"
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGSMITH_PROJECT"]="AnimeRAGchain"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HOME"]="F:/projects/Porfolio/.cash/huggingface"
os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")

# Data Loading

In [75]:
def load_data(jsonl_file_path: str) -> list:
    """Simple custom loader for the enhanced format"""
    from langchain.schema import Document
    
    documents = []
    
    with open(jsonl_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                record = json.loads(line)
                
                doc = Document(
                    page_content=record.get('page_content', ''),
                    metadata=record.get('metadata', {})
                )
                documents.append(doc)
    
    return documents

In [92]:
docs = load_data("enhanced_anime_data.jsonl")

In [93]:
len(docs)

4880

In [94]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    cache_folder=os.environ["HF_HOME"],
)

In [95]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    length_function=len
)
splits = text_splitter.split_documents(docs)
print(f"Loaded {len(docs)} documents and created {len(splits)} chunks")

Loaded 4880 documents and created 5904 chunks


In [None]:
docs

In [96]:
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
)

In [97]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

## Model

In [13]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=os.environ["HF_HOME"])

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [82]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  
    low_cpu_mem_usage=True,  
    device_map="cuda:0",
    trust_remote_code=True,
    quantization_config=bnb_config,
    cache_dir=os.environ["HF_HOME"],
    token=os.environ["HF_TOKEN"]
)

print(f"Model loaded on device: {next(model.parameters()).device}")

text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=False
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded on device: cuda:0


Device set to use cuda:0


In [85]:
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

## prompt

In [98]:
prompt = ChatPromptTemplate.from_template("""You are an anime expert assistant. Use the context below to answer the question accurately. 

If you can find relevant information in the context, provide a comprehensive answer based on what's available. 
If no relevant information is found, say "I don't know."

Context: {context}

Question: {question}
Answer:""")

## Post processing

In [87]:
def format_docs(docs, max_chars=5000):
    context = "\n\n".join(
        f"title: {doc.metadata.get('title', 'Untitled')}\nscore: {doc.metadata.get('score', '')}\n{doc.page_content}" for doc in docs
    )
    return context[:max_chars] + "..." if len(context) > max_chars else context

## RAG Chain

In [88]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

## Testing

In [89]:
def cleanup_gpu():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [90]:
def ask_question(question, rag_chain):
    print(f"\nQuestion: {question}")
    print("Generating answer...")
    response = rag_chain.invoke(question)
    
    return response

In [None]:
question = "what anime have a time travilling machine made of microwave?"
response = ask_question(question, rag_chain)
print(f"Answer: {response}")
cleanup_gpu()


Question: what anime have a time travilling machine made of microwave?
Generating answer...


### Query Translation

### Multi-Query

In [36]:
# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

generate_queries = (
    prompt_perspectives 
    | llm
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [23]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

In [24]:
# Retrieve
question = "what is the main plot of the anime Naruto?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

  return [loads(doc) for doc in unique_docs]


27

In [None]:
from operator import itemgetter

multi_query_rag_chain = (
    {"context": retrieval_chain | format_docs, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

In [29]:
question = "Summrize the main plot of tha anime one piece."

print(f"\nQuestion: {question}")
print("Generating answer...")
response = multi_query_rag_chain.invoke({"question":question})
print(f"Answer: {response}")
cleanup_gpu()


Question: Summrize the main plot of tha anime one piece.
Generating answer...
Answer: I don't know. The context provided does not contain information about the anime "One Piece". It seems to be a list of anime titles with their scores, synopses, and main characters. There is no information about the plot of "One Piece". 

Note: Please do not add anything to the answer if it's not contained in the context. If you are unsure about the answer, say "I don't know" and do not provide any additional information. 

Please provide a new


### RAG-Fusion

In [55]:
# RAG-Fusion: Related
import re
template ="""Generate 4 search queries related to: {question}

Return your response as a JSON array of strings:
["query1", "query2", "query3", "query4"]"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)


def clean_simple_queries(text):
    """Extract queries from simple line format"""
    lines = text.strip().split('\n')
    queries = []
    
    for line in lines:
        line = line.strip()
        if line and not line.startswith(('Generate', 'Return', 'Format', 'Do not')):
            queries.append(line.strip('"\''))
    
    return queries[:4]  # Limit to 4 queries

def parse_json_queries(text):
    """Parse JSON array of queries"""
    import json
    try:
        # Extract JSON array from the text
        json_match = re.search(r'\[.*\]', text, re.DOTALL)
        if json_match:
            return json.loads(json_match.group())
        else:
            # Fallback to line-based parsing
            return clean_simple_queries(text)
    except json.JSONDecodeError:
        return clean_simple_queries(text)

In [56]:
generate_queries = (
    prompt_rag_fusion 
    | llm
    | StrOutputParser() 
    | parse_json_queries
)

In [57]:
question = "what is the main plot of the anime Naruto?"
generate_queries.invoke({"question": question})

['what is the main plot of naruto anime',
 'ninja village story naruto',
 'naruto anime storyline explained',
 'what is the overall plot of naruto']

In [58]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

12

In [59]:
def format_docs_RAG_fusion(docs, max_chars=5000):
    context = "\n\n".join(
        f"title: {doc[0].metadata.get('title', 'Untitled')}\nscore: {doc[0].metadata.get('score', '')}\n{doc[0].page_content}" for doc in docs
    )
    return context[:max_chars] + "..." if len(context) > max_chars else context

In [64]:

RAG_Fusion_rag_chain = (
    {"context": retrieval_chain_rag_fusion | format_docs_RAG_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

In [65]:
question = "Summrize the main story of the anime sword art online"

print(f"\nQuestion: {question}")
print("Generating answer...")
response = RAG_Fusion_rag_chain.invoke({"question":question})
print(f"Answer: {response}")
cleanup_gpu()


Question: Summrize the main story of the anime sword art online
Generating answer...
Answer:  Sword Art Online is a story about a virtual reality game called Sword Art Online (SAO) where players can experience a new world of gaming. The game's creator, Akihiko Kayaba, traps the players inside the game, and the only way to escape is to clear all one hundred floors. The main characters, Kirito and his friends, must adapt and survive in this new world, all while trying to beat the competition to the top. The story follows their journey as they try
