In [None]:
# %pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Setups

### imports

In [2]:
import torch
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os
import json

### Cuda Setup

In [2]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device name: {torch.cuda.get_device_name()}")
print(f"Device memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print(f"Number of devices: {torch.cuda.device_count()}")
print(f"Current device: {torch.cuda.current_device()}")

os.environ["TOKENIZERS_PARALLELISM"] = "false"

CUDA available: True
Device name: NVIDIA GeForce RTX 2060
Device memory: 6.44 GB
Number of devices: 1
Current device: 0


### Lang-smith setup

In [None]:
from dotenv import load_dotenv
load_dotenv()

os.environ["LANGSMITH_TRACING_V2"]="true"
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGSMITH_PROJECT"]="AnimeRAGchain"

NameError: name 'os' is not defined

# Data Loading

In [63]:
def load_json_documents(directory_path):
    """Load JSON documents with UTF-8 encoding"""
    documents = []
    
    # Load all JSON files in directory
    import glob
    json_files = glob.glob(f"{directory_path}/**/*.json", recursive=True)
    
    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                
                # Handle different JSON structures
                if isinstance(data, list):
                    # If JSON is an array of objects
                    for i, item in enumerate(data):
                        # Extract text content (adjust keys based on your JSON structure)
                        text_content = ""
                        if isinstance(item, dict):
                            # Common text fields - adjust based on your JSON structure
                            text_fields = ['text', 'content', 'description', 'body', 'message', 'title', "synopsis"]
                            for field in text_fields:
                                if field in item:
                                    text_content += f"{field}: {item[field]}\n"
                            
                            # If no standard fields, concatenate all string values
                            if not text_content:
                                for key, value in item.items():
                                    if isinstance(value, str):
                                        text_content += f"{key}: {value}\n"
                        
                        # Create document
                        from langchain.schema import Document
                        doc = Document(
                            page_content=text_content.strip(),
                            metadata={"source": file_path, "index": i, "title": item["title"]}
                        )
                        documents.append(doc)
                
                elif isinstance(data, dict):
                    # If JSON is a single object
                    text_content = ""
                    text_fields = ['text', 'content', 'description', 'body', 'message', 'title', "synopsis"]
                    for field in text_fields:
                        if field in data:
                            text_content += f"{field}: {data[field]}\n"
                    
                    # If no standard fields, concatenate all string values
                    if not text_content:
                        for key, value in data.items():
                            if isinstance(value, str):
                                text_content += f"{key}: {value}\n"
                    
                    doc = Document(
                        page_content=text_content.strip(),
                        metadata={"source": file_path, "title": item.get("title", "Untitled")}
                    )
                    documents.append(doc)
                    
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue
    
    return documents

In [92]:
from langchain_community.document_loaders import JSONLoader
from langchain.document_loaders import DirectoryLoader

In [None]:
loader = DirectoryLoader("../", glob="**/*.json", use_multithreading=True, show_progress=True)
documents = loader.load()
print(f"Loaded {len(documents)} documents from JSON files.")

Error loading file ..\AnimeRAG\anime_dataset_10k\10075_Naruto x UT.json
Error loading file ..\AnimeRAG\anime_dataset_10k\10029_Coquelicot-zaka kara.json
Error loading file ..\AnimeRAG\anime_dataset_10k\10090_Koukaku Kidoutai- Stand Alone Complex - Solid State Society 3D.json
Error loading file ..\AnimeRAG\anime_dataset_10k\1000_Uchuu Kaizoku Captain Herlock.json
Error loading file ..\AnimeRAG\anime_dataset_10k\1010_Ranma ½- Chou Musabetsu Kessen! Ranma Team vs. Densetsu no Houou.json
Error loading file ..\AnimeRAG\anime_dataset_10k\1014_Chicchana Yukitsukai Sugar Specials.json
Error loading file ..\AnimeRAG\anime_dataset_10k\10092_Break Blade Movie 6- Doukoku no Toride.json
Error loading file ..\AnimeRAG\anime_dataset_10k\10152_Kimi ni Todoke- Kataomoi.json
Error loading file ..\AnimeRAG\anime_dataset_10k\10178_Otona Joshi no Anime Time.json
Error loading file ..\AnimeRAG\anime_dataset_10k\10218_Berserk- Ougon Jidai-hen I - Haou no Tamago.json
Error loading file ..\AnimeRAG\anime_datas

ImportError: unstructured package not found, please install it with `pip install unstructured`



In [91]:
documents = load_json_documents("./anime_dataset_10k/")
print(f"Loaded {len(documents)} documents from JSON files.")

Error loading ./anime_dataset_10k\1000_Uchuu Kaizoku Captain Herlock.json: cannot access local variable 'Document' where it is not associated with a value
Error loading ./anime_dataset_10k\10029_Coquelicot-zaka kara.json: cannot access local variable 'Document' where it is not associated with a value
Error loading ./anime_dataset_10k\10075_Naruto x UT.json: cannot access local variable 'Document' where it is not associated with a value
Error loading ./anime_dataset_10k\10090_Koukaku Kidoutai- Stand Alone Complex - Solid State Society 3D.json: cannot access local variable 'Document' where it is not associated with a value
Error loading ./anime_dataset_10k\10092_Break Blade Movie 6- Doukoku no Toride.json: cannot access local variable 'Document' where it is not associated with a value
Error loading ./anime_dataset_10k\100_Shin Shirayuki-hime Densetsu Prétear.json: cannot access local variable 'Document' where it is not associated with a value
Error loading ./anime_dataset_10k\1010_Ranma 

In [56]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
)

In [80]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len
)
splits = text_splitter.split_documents(documents)
print(f"Loaded {len(documents)} documents and created {len(splits)} chunks")

Loaded 5 documents and created 9 chunks


In [81]:
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

In [82]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

## Model

In [27]:
model_id = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [33]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  
    low_cpu_mem_usage=True,  
    device_map="cuda:0",
    trust_remote_code=True
)

print(f"Model loaded on device: {next(model.parameters()).device}")

text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=False
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded on device: cuda:0


In [34]:
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

## prompt

In [48]:
prompt = ChatPromptTemplate.from_template("""
You are a helpful assistant that answers questions based on the provided context. 
Use only and only the information from the context to answer the question. If the answer is not in the context, say so and don't hullcinate.

Context: {context}

Question: {question}

Answer:""")

## Post processing

In [51]:
def format_docs(docs, max_chars=1000):
    context = "\n\n".join(
        f"title: {doc.metadata.get('title', 'Untitled')}\n{doc.page_content}" for doc in docs
    )
    return context[:max_chars] + "..." if len(context) > max_chars else context

## RAG Chain

In [83]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

## Testing

In [40]:
def cleanup_gpu():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [41]:
def ask_question(question):
    print(f"\nQuestion: {question}")
    print("Generating answer...")
    
    response = rag_chain.invoke(question)
    
    return response

In [90]:
question = "give me a summary of the anime full metal alchemist brotherhood"
response = ask_question(question)
print(f"Answer: {response}")
cleanup_gpu()


Question: give me a summary of the anime full metal alchemist brotherhood
Generating answer...
Answer:  Fullmetal Alchemist: Brotherhood is an anime series that follows the story of two brothers, Edward and Alphonse Elric, who are alchemists searching for the Philosopher's Stone. During their journey, they encounter various friends and foes, and face moral dilemmas that test their resolve and friendship. The series explores themes of sacrifice, redemption, and the consequences of playing God.

