# RAG Linux Command Assistant + Automated Evaluation

A follow-up to Project 9 (RAG and Fine-Tuning Comparison on a Linux Commands Dataset), this RAG chatbot uses Ollama's `llama3.2` and three Linux command datasets. Using `PromptTemplate` for structured prompts, it delivers accurate, relevant, and detailed command-line answers. It also includes an automated evaluation suite that tests responses with flexible, keyword-based scoring.


In [None]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
import re

In [None]:
# Load settings from a local .env file; override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)
# NOTE: the key is only read here; nothing else in the visible notebook
# references it (the chat model is served locally through Ollama).
api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
from datasets import load_dataset
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
import numpy as np
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_community.llms import Ollama



In [None]:
# Local Ollama chat endpoint and the JSON header for raw HTTP requests to it
HEADERS = {"Content-Type": "application/json"}
OLLAMA_API = "http://localhost:11434/api/chat"
# Model tag passed to Ollama, and the directory name of the Chroma vector store
MODEL, db_name = "llama3.2", "RAG_vector_db"

In [None]:
# IPython shell escape: fetch the llama3.2 model with the Ollama CLI
# (requires the `ollama` executable to be installed and on PATH).
!ollama pull llama3.2

In [None]:
# Dataset configuration.
# Each Hugging Face dataset names its instruction/answer columns differently,
# so every source is described by one dict with a common set of keys. The
# loading code can then loop over DATASETS and treat all sources the same way.
# Table columns below: repo id, instruction column, answer column, source tag.
DATASETS = [
    {
        "name": repo_id,
        "split": "train",
        "instruction_col": instruction_col,
        "output_col": output_col,
        "tag": tag,
    }
    for repo_id, instruction_col, output_col, tag in (
        ("harpomaxx/unix-commands", "instruction", "output", "unix_commands"),
        ("shikhardadhich/linux_commands", "Instruction", "Output", "linux_commands"),
        ("mecha-org/linux-command-dataset", "input", "output", "linux_cmd_mecha"),
    )
]


In [None]:

# load, normalize and tag

def _row_to_document(record, text_col, answer_col, tag):
    """Convert one dataset row into a LangChain ``Document``.

    Args:
        record: Mapping of column name to value for a single row.
        text_col: Name of the column holding the instruction/question.
        answer_col: Name of the column holding the answer/command.
        tag: Label stored under ``metadata["doc_type"]`` to identify the
            source dataset.

    Returns:
        A ``Document`` whose text merges instruction and answer (this text is
        what gets embedded for retrieval) and whose metadata carries every
        other column of the row plus the ``doc_type`` tag.
    """
    # Keep every column except the two that are merged into the text.
    metadata = {k: v for k, v in record.items() if k not in (text_col, answer_col)}
    metadata["doc_type"] = tag
    content = f"Instruction: {record[text_col]}\nAnswer: {record[answer_col]}"
    return Document(page_content=content, metadata=metadata)


all_documents = []

# Loop through each dataset config and collect its rows as documents.
for ds in DATASETS:
    dataset = load_dataset(ds["name"], split=ds["split"])
    text_col = ds["instruction_col"]
    answer_col = ds["output_col"]

    # Skip rows that lack an instruction OR an answer: a row without an answer
    # would otherwise be embedded as the literal text "Answer: None".
    docs = [
        _row_to_document(r, text_col, answer_col, ds["tag"])
        for r in dataset
        if r.get(text_col) and r.get(answer_col)
    ]
    # Add this dataset's documents to the combined list.
    all_documents.extend(docs)


print(f"Loaded {len(all_documents)} combined documents")

In [None]:
# Debug aid: load the train split of each source dataset, then print the
# column names so the DATASETS config can be checked against them.
ds1, ds2, ds3 = [
    load_dataset(repo_id, split="train")
    for repo_id in (
        "harpomaxx/unix-commands",
        "shikhardadhich/linux_commands",
        "mecha-org/linux-command-dataset",
    )
]

for loaded in (ds1, ds2, ds3):
    print(loaded.column_names)

In [None]:
def normalize_and_deduplicate(docs):
    """Normalize whitespace in each document and drop exact duplicates.

    Normalization works line by line: leading/trailing whitespace is removed,
    runs of whitespace inside a line are collapsed to a single space, and
    empty lines are dropped. Line breaks between non-empty lines are kept, so
    the "Instruction: ...\\nAnswer: ..." layout and multi-line commands are
    not flattened into one line.

    Args:
        docs: Iterable of documents exposing ``page_content`` and ``metadata``.

    Returns:
        A list of new ``Document`` objects in first-seen order, one per
        distinct normalized text. Each carries a shallow copy of the metadata
        of the first document that produced that text.
    """
    seen = set()
    clean_docs = []

    for doc in docs:
        # " ".join(line.split()) both strips the line and collapses inner runs
        # of spaces/tabs; doing it per line preserves the newlines.
        lines = (" ".join(line.split()) for line in doc.page_content.splitlines())
        content = "\n".join(line for line in lines if line)

        if content in seen:
            continue
        seen.add(content)
        # Copy the metadata so later edits to the clean document do not
        # silently change the original one (and vice versa).
        clean_docs.append(Document(page_content=content, metadata=dict(doc.metadata)))

    return clean_docs

# Clean the combined corpus and report how many unique documents remain
all_documents_clean = normalize_and_deduplicate(docs=all_documents)
print(f"Documents after normalization and deduplication: {len(all_documents_clean)}")

In [None]:
# chunking
# CharacterTextSplitter only cuts at its single separator (a blank line by
# default), so a document that contains no blank line is emitted whole no
# matter how long it is. RecursiveCharacterTextSplitter falls back to smaller
# separators (newline, space, character), so chunk_size is actually enforced.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # maximum characters per chunk
    chunk_overlap=50,  # characters shared between neighbouring chunks
)
chunks = text_splitter.split_documents(all_documents_clean)

print(f"Total chunks created: {len(chunks)}")

In [None]:
# Sanity check: show size, text and metadata of (at most) the first 3 chunks
for i, chunk in zip(range(3), chunks):
    text, meta = chunk.page_content, chunk.metadata
    print(f"Chunk {i+1} length: {len(chunk.page_content)} chars")
    print(text)
    print(meta)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
# Embedding model used to turn each chunk into a vector.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# If the persist directory is already on disk, drop its existing collection
# first (presumably so that re-running this cell does not add the same chunks
# a second time).
if os.path.exists(db_name):
    Chroma(persist_directory=db_name).delete_collection()

# Embed all chunks and store them in a Chroma collection persisted under db_name.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)


In [None]:
# Inspect the collection behind the LangChain wrapper (private attribute).
collection = vectorstore._collection
count = collection.count()

# Fetch one stored record together with its vector to measure the vector size.
one_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = one_record["embeddings"][0]
dimensions = len(sample_embedding)

print(f"Vectorstore created with {count:,} documents")
print(f"Each embedding has {dimensions:,} dimensions")

In [None]:
from langchain.prompts import PromptTemplate

# Prompt text sent to the model. {context} and {question} are the two
# placeholders declared as input variables of the template below.
system_prompt = """
You are a precise, highly knowledgeable Linux command assistant.
Always answer with:
- The most complete and standard Linux command, not minimal versions.
- a brief explanation.
- an example when useful.
- use the recursive version if applicable.
- never shorten commands.


Use the retrieved context below if relevant.

Context:
{context}

Question
{question}
"""

# Wrap the text in a PromptTemplate that expects exactly those two variables.
prompt = PromptTemplate(
    template=system_prompt,
    input_variables=["context", "question"],
)

In [None]:
# Conversation memory for the chain. `memory_key="chat_history"` is the input
# name ConversationalRetrievalChain reads earlier turns from, and
# `return_messages=True` keeps them as message objects rather than one string.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Retriever over the Chroma vector store, using its default search settings.
retriever = vectorstore.as_retriever()

# RAG chain: retrieve context for the question, then answer with the local
# Ollama model using the custom prompt for the answer-generation step.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=Ollama(temperature=0.7, model=MODEL),
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt},
)

In [None]:
def chat(question, history):
    """Answer *question* through the retrieval chain and return the answer text.

    `history` is part of the signature because the function is used as the
    Gradio chat callback, but it is not used here: only the question is sent
    to the chain.
    """
    payload = {"question": question}
    return conversation_chain.invoke(payload)["answer"]

In [None]:
# Build the chat UI around chat() and open it in a browser tab.
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)

In [None]:
# Testing RAG responses

# --- 1. Define test cases --------------------------------------------------
# Each entry pairs a question with the keywords its answer is expected to
# mention. The last question is phrased as a follow-up to the one before it.

test_cases = [
    {"question": question, "expected_keywords": keywords}
    for question, keywords in (
        (
            "How do I list all files in a directory including hidden ones?",
            ["ls -a", "ls -la", "hidden"],
        ),
        (
            "How do I find a file containing a specific text string?",
            ["grep", "grep -r", "string", "filename"],
        ),
        (
            "Explain the difference between grep and sed.",
            ["grep", "sed", "search", "edit", "replace"],
        ),
        (
            "How do I search for a file by name?",
            ["find", "-name"],
        ),
        (
            "And how about searching inside files?",
            ["grep", "string", "file"],
        ),
    )
]

# --- 2. Loosened evaluation function ---------------------------------------

def score_answer(answer, expected_keywords, threshold=0.6):
    """Score an answer by the share of expected keywords it mentions.

    Matching is a case-insensitive, literal substring check (keywords are not
    treated as regular expressions).

    Args:
        answer: Model response text. ``None`` is treated as an empty answer.
        expected_keywords: Keywords/phrases the answer should contain.
        threshold: Minimum found/total ratio required to pass (default 0.6).

    Returns:
        Tuple ``(score, total, passed)``: number of keywords found, number of
        keywords expected, and whether the ratio reached ``threshold``. With
        no expected keywords the ratio is taken as 1.0 (nothing is missing)
        instead of dividing by zero.
    """
    answer_lower = (answer or "").lower()
    keywords = list(expected_keywords)

    score = sum(1 for kw in keywords if kw.lower() in answer_lower)
    total = len(keywords)

    ratio = score / total if total else 1.0
    passed = ratio >= threshold
    return score, total, passed

# --- 3. Run evaluation ------------------------------------------------------

results = []

# chat() ignores its `history` argument; earlier turns are remembered by the
# chain's own memory. Clear that memory once so conversations held before the
# evaluation do not influence it, but keep it across the loop because the last
# test case is a follow-up to the question before it.
chain_memory = getattr(conversation_chain, "memory", None)
if chain_memory is not None:
    chain_memory.clear()

for test in test_cases:
    q = test["question"]
    expected = test["expected_keywords"]

    response = chat(q, history=[])  # history is unused by chat()
    score, total, passed = score_answer(response, expected)

    results.append({
        "question": q,
        "response": response,
        "score": score,
        "total": total,
        "passed": passed,
    })

# --- 4. Print report --------------------------------------------------------

# One section per evaluated question: question, raw response, keyword score, verdict.
for result in results:
    if result["passed"]:
        status = " PASS"
    else:
        status = " FAIL"

    print("\n--------------------------------------------------")
    print("Question:", result["question"])
    print("Model Response:", result["response"])
    print(f"Score: {result['score']} / {result['total']}")
    print("Status:", status)
