# Integrated the trained conversational model with RAG

- **Authors:** Riyaadh Gani and Damilola Ogunleye
- **Project:** Food Recognition & Recipe LLM  
- **Purpose:** Creating VectorDB of recipe data and combining with RAG for the model

---

## Overview

This notebook is used for inference of our conversational model with our RAG pipeline

**Output:** Functional model for recipe support: based on Recipe NLG data

In [1]:
%pip install pandas numpy faiss-cpu sentence_transformers transformers torch peft==0.11.1 tqdm

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import tqdm


  from .autonotebook import tqdm as notebook_tqdm


## Load the Model
Memory management is not easy! so load the model and then change to GPU to free up CPU RAM --> then load the data and the index

In [None]:
# Load conversational finetuned GPT-2 model
model_path = '../finetune_llm/models/base/gpt2-medium'   
tokenizer = AutoTokenizer.from_pretrained(model_path)
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    dtype=torch.float16,  # Half precision
    low_cpu_mem_usage=True
)

Have to load the base model + the adapter to actually access the model

In [None]:
adapter_path = '../finetune_llm/models/gpt2-conversational-v1/final'
print(f"Loading adapter from: {adapter_path}")
conversational_model = PeftModel.from_pretrained(base_model, adapter_path)

In [None]:
# Set pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Move to GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
conversational_model = conversational_model.to(device)
conversational_model.eval()

print(f"Model loaded on {device}")
print(f"Recipes: {len(df)}, Index size: {index.ntotal}")

In [2]:
small = True  # Set to True to use a smaller dataset for testing

# Load the recipe data
df = pd.read_csv('../datasets/Cleaned/clean_recipes.csv')
print(f"Loaded {len(df)} recipes")

# trim to first 10000 entries to match index
if small == True:
    df = df.head(10000)
    print(f"Trimmed to {len(df)} recipes for small dataset")

Loaded 5155414 recipes
Trimmed to 10000 recipes for small dataset


In [3]:
# Load the FAISS index
if small:
    index = faiss.read_index('../VectorDB/recipe_index_xsmol.faiss')
else:
    index = faiss.read_index('../VectorDB/recipe_index.faiss')
print(f"Loaded index with {index.ntotal} vectors")

Loaded index with 10000 vectors


In [4]:
# Load embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Loaded embedding model")

Loaded embedding model


Define functions for rag implementation

In [None]:
def retrieve_recipes(query, k=3):
    """Retrieve top-k similar recipes"""
    q_emb = embedding_model.encode([query]).astype('float32')
    faiss.normalize_L2(q_emb)
    scores, indices = index.search(q_emb, k)
    
    results = []
    for idx, score in zip(indices[0], scores[0]):
        results.append({
            'response': df.iloc[idx]['response'],
            'similarity': float(score)
        })
    return results

def rag_answer(query, k=2, max_new_tokens=150):
    """Generate answer using RAG"""
    
    # Retrieve
    retrieved = retrieve_recipes(query, k=k)
    
    # Build context
    context = "Similar recipes:\n"
    for i, rec in enumerate(retrieved, 1):
        context += f"{i}. {rec['response']}\n"
    
    # Create prompt
    prompt = f"""The following is a conversation between a user and a helpful cooking assistant.

{context}

User: {query}
Assistant:"""
    
    # Tokenize and generate
    inputs = tokenizer(
        prompt,
        return_tensors='pt',
        max_length=1024,
        truncation=True,
        padding=True
    ).to(device)
    
    with torch.no_grad():
        outputs = conversational_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract answer
    if "Answer:" in response:
        answer = response.split("Answer:")[-1].strip()
    else:
        answer = response
    
    return answer

Test the pipeline

In [None]:
query = "I have chicken and rice, what can I make?"
print(f"\nQuery: {query}\n")
answer = rag_answer(query, k=3)
print(f"\nAnswer: {answer}")