# Fine-Tuned vs RAG: Linux Command Knowledge Assistant (RHEL 9)

## Overview
This project compares two approaches for building a Linux terminal assistant focused on Red Hat Enterprise Linux 9 (RHCSA/RHCE) commands:

1. Retrieval-Augmented Generation (RAG) using:
   - LangChain
   - HuggingFace embeddings (`sentence-transformers/all-MiniLM-L6-v2`)
   - Chroma vector database

2. Fine-tuning an OpenAI model (`gpt-4o-mini-2024-07-18`) on the same dataset of Linux commands and descriptions from Kaggle.

Both pipelines used the same Kaggle dataset (`cyberprince/linux-terminal-commands-dataset`), containing 599 short command–description pairs.  
No text chunking was applied, as each record was already short.

---

## RAG Portion

In [None]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from huggingface_hub import login
from openai import OpenAI

In [None]:
from datasets import load_dataset
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain_openai import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [None]:
# Read settings from a local .env file; override=True lets its values replace existing ones
load_dotenv(override=True)

# Re-export each credential, substituting a placeholder string when it is not set
for _credential in ("OPENAI_API_KEY", "HF_TOKEN"):
    os.environ[_credential] = os.getenv(_credential, "your-key-if-not-using-env")

In [None]:
import time

# Chat model handed to ChatOpenAI when the RAG chain is built
MODEL = "gpt-4o-mini"

# Name the vector store directory after the current Unix time (whole seconds)
db_name = "vector_db_" + str(int(time.time()))
print(f"Using database: {db_name}")

In [None]:
# Install the kagglehub package that the dataset download cell imports
# NOTE(review): a bare `pip install` is not Python syntax; presumably this only runs as a notebook cell - confirm
pip install kagglehub

In [None]:
# download kaggle dataset
import kagglehub
from pathlib import Path

# The value returned by dataset_download is treated as the local directory of the dataset files
path = Path(kagglehub.dataset_download("cyberprince/linux-terminal-commands-dataset"))

print("Path to dataset files:", path)

# find the jsonl file automatically; sorting makes the pick deterministic when
# more than one .jsonl file is present
json_files = sorted(path.glob("**/*.jsonl"))
if not json_files:
    raise FileNotFoundError(f"No .jsonl file found under {path}")
dataset_path = json_files[0]
print("Using dataset file:", dataset_path)

# NOTE: MODEL and db_name are set once in the settings cell earlier in the notebook.
# They are intentionally not reassigned here, so the database name printed by that
# cell is the one the vector store actually uses.

In [None]:
# List the entries of the downloaded dataset directory
print(os.listdir(path))

In [None]:
# load jsonl dataset into pandas dataframe

import json
import pandas as pd
from datasets import Dataset


# load jsonl file line by line; blank lines are ignored and malformed lines are
# recorded (not silently dropped) so data loss is visible
data = []
skipped_lines = []
with open(dataset_path, "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        if not line.strip():
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines.append(line_number)

if skipped_lines:
    print(
        f"Skipped {len(skipped_lines)} malformed line(s) in {dataset_path}; "
        f"first line numbers: {skipped_lines[:10]}"
    )

# fail early with a clear message instead of an IndexError on df.iloc[0]
if not data:
    raise ValueError(f"No valid JSON records found in {dataset_path}")

# convert to dataframe
df = pd.DataFrame(data)

print(df.iloc[0])

# keep only the columns needed and rename them
df = df.rename(columns={"command": "instruction", "description": "output"})

df = df[["instruction", "output"]]

# convert to hf dataset
dataset = Dataset.from_pandas(df)

print(dataset[0])

In [None]:
# create langchain documents for vectorization

# Column holding the command text (renamed from "command" when the dataframe is built)
text_column = "instruction"
# Column holding the command description (renamed from "description")
answer_column = "output"

def add_metadata(record):
    """Turn one dataset row into a LangChain Document.

    Every field other than the instruction and answer columns is copied into the
    metadata, which is then tagged with a fixed doc_type and the instruction and
    answer values. The page content is the instruction and answer rendered as
    two labelled lines.
    """
    excluded = (text_column, answer_column)
    metadata = {}
    for key, value in record.items():
        if key in excluded:
            continue
        metadata[key] = value

    instruction = record[text_column]
    answer = record[answer_column]
    metadata.update(
        doc_type="linux_commands",
        instruction=instruction,
        answer=answer,
    )

    return Document(
        page_content=f"Instruction: {instruction}\nAnswer: {answer}",
        metadata=metadata,
    )

# Build one langchain document per row, skipping rows whose instruction is missing or empty
documents = []
for record in dataset:
    if record.get(text_column):
        documents.append(add_metadata(record))

print(f"Loaded {len(documents)} documents from dataset")

    

In [None]:
# Show the size and the first 150 characters of the first five documents
for position, doc in enumerate(documents[:5]):
    text = doc.page_content
    print(f"Document {position}: {len(text)} chars")
    print(f"Content preview: {text[:150]}...")
    print()

In [None]:
# embed and store vectors in chroma db
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorize every document before it is stored
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# clear old db if exists: when the directory is already present, delete its
# collection so the store below is not built on top of earlier contents
if os.path.exists(db_name):
    Chroma(persist_directory=db_name).delete_collection()


# Embed all documents and persist them under the db_name directory
vectorstore = Chroma.from_documents(
    documents=documents, 
    embedding=embeddings, 
    persist_directory=db_name
)


# NOTE: _collection is a private attribute of the Chroma wrapper, read here only to report the count
print(f"Vectorstore created with {vectorstore._collection.count()} documents")



In [None]:
# Report how many vectors the store holds and how long each one is

collection = vectorstore._collection
count = collection.count()

# Fetch a single stored row just to measure the embedding length
first_row = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_row["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# visualize in 2d using tsne
# Pull every stored vector, with its text and metadata, back out of the collection
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# Use a separate name for the raw texts: rebinding `documents` here would replace
# the list of Document objects the vector-store cell builds from and break a re-run
doc_texts = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# Sort the distinct types so each type gets the same color on every run
unique_doc_types = sorted(set(doc_types))
cmap = plt.get_cmap("tab10", len(unique_doc_types))
color_map = {t: cmap(i) for i, t in enumerate(unique_doc_types)}
colors = [color_map[t] for t in doc_types]

# Project the embeddings to 2D; the fixed random_state keeps the layout repeatable
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

plt.figure(figsize=(8,6))
plt.scatter(
    reduced_vectors[:, 0],
    reduced_vectors[:,1],
    c=colors,
    s=20,
    alpha=0.8
)

plt.title("2D visualization of vector embeddings")
plt.xlabel("TSNE 1")
plt.ylabel("TSNE 2")
plt.show()

In [None]:
# build RAG chatbot

# Retriever over the Chroma store, using its default search settings
retriever = vectorstore.as_retriever()

llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Conversation memory: history is kept under 'chat_history' as message objects,
# and only the chain's 'answer' output is recorded
memory = ConversationBufferMemory(
    memory_key='chat_history',
    output_key='answer',
    return_messages=True,
)

# conversational RAG chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,  # show which docs are retrieved
)


In [None]:
print("✓ Conversation chain created successfully")

# Now test it
banner = "=" * 60
print("\n" + banner)
print("Testing RAG...")
print(banner)

# First a command we KNOW exists in the dataset, then 'man' (which probably doesn't exist)
for test_question in (
    "What does the 'cd' command do?",
    "What does the 'man' command do?",
):
    result = conversation_chain.invoke({"question": test_question})
    sources = result['source_documents']
    print(f"\nQuestion: {test_question}")
    print(f"Answer: {result['answer']}")
    print(f"\nSource documents retrieved: {len(sources)}")
    for rank, doc in enumerate(sources, start=1):
        print(f"  Source {rank}: {doc.metadata.get('instruction', 'N/A')}")

In [None]:
def chat(question, history):
    """Answer *question* with the conversational RAG chain.

    *history* is accepted but not used here; only the chain's "answer" field
    is returned.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [None]:
# Test with a command that exists
# Query the chain directly (not through chat()) and print only the answer text
result = conversation_chain.invoke({"question": "What does the 'cd' command do?"})
print(result["answer"])

In [None]:
# launch gradio UI
# Wrap chat() in a Gradio chat interface and ask it to open in the browser
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

## Fine Tune Portion

In [None]:
# Log in to the Hugging Face Hub with the token taken from the environment
# NOTE(review): if no real token was provided, HF_TOKEN holds the placeholder
# assigned in the environment cell - confirm a valid token is set before running
hf_token = os.environ['HF_TOKEN']
login(hf_token, add_to_git_credential=True)

In [None]:
# split dataset
from datasets import DatasetDict

# Hold out 10% of the rows for validation. The result is kept under its own name
# so `dataset` remains the full Dataset and this cell can be re-run; the fixed
# seed makes the split (and therefore train.jsonl / val.jsonl) reproducible.
splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = splits["train"]
val_dataset = splits["test"]

In [None]:
def messages_for(query):
    """Build the chat messages that put *query* to the model as a RHEL 9 expert.

    Returns a two-item list: a system message carrying the assistant persona,
    followed by a user message that wraps the query in a fixed question template.
    """
    persona = " ".join([
        "You are a Linux terminal assistant and Red Hat certification expert for RHEL 9",
        "(RHCSA and RHCE). You always give accurate, concise, and command-focused explanations",
        "based on the official RHEL 9 documentation. If a Linux command is asked, explain its purpose,",
        "syntax, options, and an example of real-world use.",
    ])
    question_block = (
        "Question: {}\n\n"
        "Please answer based on Red Hat Enterprise Linux 9 standards and certification expectations."
    ).format(query)

    turns = (("system", persona), ("user", question_block))
    return [{"role": role, "content": content} for role, content in turns]

In [None]:
# convert to jsonl for finetuning
import json

def make_jsonl(dataset, filename, system_message="You are a Linux terminal assistant."):
    """Write *dataset* to *filename* as chat-format JSONL, one example per line.

    Each line is a JSON object with a "messages" list holding a system message,
    a user message (the example's "instruction") and an assistant message (the
    example's "output").

    Args:
        dataset: Iterable of mappings that have "instruction" and "output" keys.
        filename: Path of the file to write; it is opened in "w" mode, so any
            existing content is replaced.
        system_message: Text of the system message placed in every example.
            Defaults to the prompt this notebook has always used, so existing
            two-argument calls produce the same file as before.
    """
    with open(filename, "w", encoding="utf-8") as f:
        for example in dataset:
            messages = [
                {"role": "system", "content": system_message},
                {"role": "user", "content": example["instruction"]},
                {"role": "assistant", "content": example["output"]},
            ]
            f.write(json.dumps({"messages": messages}) + "\n")

# Write the training and validation splits to the two files the fine-tuning cell uploads
make_jsonl(train_dataset, "train.jsonl")
make_jsonl(val_dataset, "val.jsonl")
            

In [None]:
# Create the OpenAI client used for file uploads, fine-tuning jobs and chat completions
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment - confirm
openai = OpenAI()

In [None]:
# fine tuning

# Upload the training and validation files. The `with` blocks close the local
# file handles as soon as each upload call returns instead of leaving them open.
with open("train.jsonl", "rb") as train_fh:
    train_file = openai.files.create(file=train_fh, purpose="fine-tune")
with open("val.jsonl", "rb") as val_fh:
    val_file = openai.files.create(file=val_fh, purpose="fine-tune")

# Start the fine-tuning job on the uploaded files
job = openai.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=val_file.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={"n_epochs": 2},
    suffix="linux-commands"
)
print("Fine-tuning job:", job.id)

In [None]:
# Wait for the fine-tuning job to finish, then record the fine-tuned model ID
import time

job_id = job.id

# A job stops changing once it reaches one of these statuses
TERMINAL_STATUSES = {"succeeded", "failed", "cancelled"}
POLL_INTERVAL_SECONDS = 30

# fine_tuned_model is only filled in once the job has succeeded, so poll until
# the job is finished instead of reading it from a job that is still running
result = openai.fine_tuning.jobs.retrieve(job_id)
while result.status not in TERMINAL_STATUSES:
    print(f"Job {job_id} status: {result.status}; checking again in {POLL_INTERVAL_SECONDS}s")
    time.sleep(POLL_INTERVAL_SECONDS)
    result = openai.fine_tuning.jobs.retrieve(job_id)

if result.status != "succeeded":
    raise RuntimeError(f"Fine-tuning job {job_id} ended with status '{result.status}'")

# define separate variable for finetuned model
MODEL_FINETUNED = result.fine_tuned_model
print("Fine-tuned model ID:", MODEL_FINETUNED)

In [None]:
# Compare RAG vs Fine-Tuned on the same question
query = "What does the 'man' command do?"

# RAG based response
rag_result = conversation_chain.invoke({"question": query})
rag_response = rag_result["answer"]

# Fine-tuned model, prompted with the messages built by messages_for
messages = messages_for(query)
fine_tuned_response = openai.chat.completions.create(
    messages=messages,
    model=MODEL_FINETUNED,
)
fine_tuned_answer = fine_tuned_response.choices[0].message.content

print("\n Base Model RAG Response:\n", rag_response)
print("\n Fine-tuned Model Response:\n", fine_tuned_answer)

In [None]:
# Compare RAG vs Fine-Tuned on the same question
query = "What does the 'cd' command do?"

# RAG based response
rag_result = conversation_chain.invoke({"question": query})
rag_response = rag_result["answer"]

# Fine-tuned model, prompted with the messages built by messages_for
messages = messages_for(query)
fine_tuned_response = openai.chat.completions.create(
    messages=messages,
    model=MODEL_FINETUNED,
)
fine_tuned_answer = fine_tuned_response.choices[0].message.content

print("\n Base Model RAG Response:\n", rag_response)
print("\n Fine-tuned Model Response:\n", fine_tuned_answer)

## Conclusion

Both models used the same dataset but behaved differently because of how each method leverages data.

The **RAG system** could only respond to commands explicitly stored in its vector database.  
Because the `man` command was not in the dataset, it correctly answered *“I don’t know.”*

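The **fine-tuned model**, on the other hand, produced a correct, human-like answer for `man` even though it never saw that example during training. It generalized from similar patterns in the training data and, most likely, also drew on knowledge the base model (`gpt-4o-mini`) already had before fine-tuning.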

This demonstrates that:

- **Fine-tuning** can generalize beyond the specific examples in the dataset.  
- **RAG** strictly depends on dataset completeness and coverage.
