In [5]:
# Install required packages (if not installed)
!pip install chromadb pandas sentence-transformers transformers

Collecting chromadb
  Downloading chromadb-1.0.16-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sdk-1.36.0-py3-none-any.whl.metadata (1.5 k

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, f1_score
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import AutoModelForSeq2SeqLM, pipeline

In [2]:
from sklearn.metrics import precision_score, recall_score

# Load full data
filepath = '/content/downsampled_chitchat_dataset.csv'
df = pd.read_csv(filepath)

# 🟨 Balance the dataset using upsampling
max_samples = df['intent'].value_counts().max()

balanced_df = pd.concat([
    resample(df[df['intent'] == label],
             replace=True,
             n_samples=max_samples,
             random_state=42)
    for label in df['intent'].unique()
])

# Optional: Shuffle to mix oversampled data
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Ensure 'message' column is of type string
balanced_df['message'] = balanced_df['message'].fillna('').astype(str)

# Label mapping (sorted to ensure consistency)
unique_intents = sorted(balanced_df['intent'].unique())
label_map = {intent: i for i, intent in enumerate(unique_intents)}
number_of_labels = len(label_map)

# Encode labels
balanced_df['labels'] = balanced_df['intent'].map(label_map)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(balanced_df)

# Tokenizer
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize text
def tokenize_function(examples):
    return tokenizer(examples["message"], truncation=True, padding="max_length")

dataset = dataset.map(tokenize_function, batched=True)

# Split into train and validation
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=number_of_labels
)

# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
        "precision": precision_score(labels, preds, average="weighted"),
        "recall": recall_score(labels, preds, average="weighted")
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch", # Corrected from evaluation_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.1,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=True
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/28000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0654,0.054333,0.991071,0.991023,0.991172,0.991071
2,0.0073,0.0212,0.9975,0.99749,0.9975,0.9975
3,0.0,0.015092,0.998393,0.99839,0.998403,0.998393


TrainOutput(global_step=16800, training_loss=0.0876133339497305, metrics={'train_runtime': 1959.5128, 'train_samples_per_second': 34.294, 'train_steps_per_second': 8.574, 'total_flos': 8903714203238400.0, 'train_loss': 0.0876133339497305, 'epoch': 3.0})

In [3]:
# Save the model
model_save_path = "./shona_chatbot_model"
trainer.save_model(model_save_path)

In [5]:
# === Imports ===
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

# ===============================
# 1. SETUP DATA & BASELINE MODELS
# ===============================

# Load CSV Data for RAG
filepath = '/content/pace_graduate_programs (1).csv'  # Replace with your file path
df = pd.read_csv(filepath)
program_names = df['Program Name'].tolist()
program_links = df['Program Link'].tolist()

# Embed Program Names
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(program_names)

# Initialize Chroma Vector Store
chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
collection = chroma_client.get_or_create_collection(name="pace_programs")

# Add data if collection is empty
if collection.count() == 0:
    collection.add(
        documents=program_names,
        metadatas=[{"link": link} for link in program_links],
        embeddings=embeddings.tolist(),
        ids=[f"id{i}" for i in range(len(program_names))]
    )

# Semantic Search
def search_programs(user_query, top_k=5):
    query_embed = embedding_model.encode([user_query])[0].tolist()
    results = collection.query(query_embeddings=[query_embed], n_results=top_k)
    programs = results['documents'][0]
    metadata = results['metadatas'][0]
    return list(zip(programs, [meta['link'] for meta in metadata]))

# Baseline Flan-T5 Model
baseline_model_name = "google/flan-t5-small"
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(baseline_model_name)
baseline_pipeline = pipeline("text2text-generation", model=baseline_model, tokenizer=baseline_tokenizer)

# ===============================
# 2. SETUP SHONA SLANG CLASSIFIER
# ===============================

shona_model_path = "/content/shona_chatbot_model"  # Your fine-tuned model path
shona_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
shona_model = AutoModelForSequenceClassification.from_pretrained(shona_model_path)
shona_classifier = pipeline("text-classification", model=shona_model, tokenizer=shona_tokenizer, return_all_scores=True)

# Load label map
unique_intents = sorted(pd.read_csv('/content/downsampled_chitchat_dataset.csv')['intent'].unique())
label_map = {intent: i for i, intent in enumerate(unique_intents)}
id2label = {v: k for k, v in label_map.items()}

# ===============================
# 3. CLASSIFICATION FUNCTIONS
# ===============================

def classify_intent(text):
    result = shona_classifier(text)
    pred = max(result[0], key=lambda x: x["score"])
    label_index = int(pred["label"].replace("LABEL_", ""))
    intent_name = id2label[label_index]
    return intent_name, pred["score"]

# ===============================
# 4. RULE-BASED HANDLER FOR SHONA BOT
# ===============================

def handle_intent_response(intent, user_input):
    intent = intent.lower()
    user_input_lower = user_input.lower()

    if intent == "greeting":
        return "Hesi shamwari! Uri sei hako?"

    elif intent == "finance":
        return "Bata Finance Department panhamba idzi: 646-479-3688"

    elif intent == "religion":
        return "Bata Department reReligion pa: www.religion@pace.edu"

    elif intent == "program_inquiry":
        return "Unoda program ipi? Ndine runyorwa rwezviripo."

    elif any(word in user_input_lower for word in ["apply", "nyora", "kunyora", "register", "application"]):
        name = input("ShonaBot: Ndokumbirawo zita rako rizere: ")
        education = input("ShonaBot: Wakadzidza kupi kare? (e.g., BSc, diploma, etc.): ")
        email = input("ShonaBot: Email yako ndeipi?: ")
        print(f"\n[INFO] Application received:\nName: {name}\nEducation: {education}\nEmail: {email}")
        return "Waita apply! Chikumbiro chako chakatumirwa kuchikoro. Tichakutumira zvimwe ruzivo 💼"

    return None

# ===============================
# 5. RAG + FLAN-T5 GENERATION
# ===============================

def generate_answer_with_rag(query):
    intent, confidence = classify_intent(query)
    programs = search_programs(query)
    context = "\n".join([f"{name} – {link}" for name, link in programs])
    prompt = f"""You are a helpful assistant that answers questions about graduate programs at Pace University.
The user's intent is: {intent} (confidence: {confidence:.2f}).

Programs:
{context}

Question: {query}

Answer:"""
    answer = baseline_pipeline(prompt, max_new_tokens=150)[0]['generated_text']
    return intent, confidence, answer

# ===============================
# 6. EXIT CHECK
# ===============================

def is_exit(text):
    return text.strip().lower() in ["exit", "quit", "bye"]

# ===============================
# 7. INTERACTIVE CHAT LOOP
# ===============================

print("💬 Welcome! Type your question in Shona slang or English. (Type 'exit' to quit)\n")

while True:
    user_input = input("Iwe: ")

    if is_exit(user_input):
        print("PaceBot: Zvakanaka, tichaonana zvakare! 🎓")
        break

    # Shona chatbot
    intent, confidence = classify_intent(user_input)
    special_response = handle_intent_response(intent, user_input)

    print(f"\n📌 Detected Intent: {intent} (confidence {confidence:.2f})")

    if special_response is not None:
        print(f"🤖 ShonaBot: {special_response}")
    else:
        print("🤖 ShonaBot: Ndine urombo, handina mhinduro yakajeka pane izvozvo.")

    # Baseline RAG
    intent_rag, conf_rag, rag_answer = generate_answer_with_rag(user_input)
    print(f"🤖 Baseline RAG: {rag_answer}")
    print("-" * 100)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
Device set to use cuda:0


💬 Welcome! Type your question in Shona slang or English. (Type 'exit' to quit)

Iwe: madii mashefu

📌 Detected Intent: greeting (confidence 1.00)
🤖 ShonaBot: Hesi shamwari! Uri sei hako?
🤖 Baseline RAG: madii mashefu
----------------------------------------------------------------------------------------------------
Iwe: i mari pa Pace

📌 Detected Intent: finance (confidence 1.00)
🤖 ShonaBot: Bata Finance Department panhamba idzi: 646-479-3688
🤖 Baseline RAG: Pace University-Lenox Hill Hospital Physician Assistant Program, MS – https://www.pace.edu/program/pace-university-lenox-hill-hospital-physician-assistant-program-ms Nursing, Doctor of Nursing Practice, DNP – https://www.pace.edu/program/online-accelerated-doctor-of-nursing-practice-dnp Psychology, MA – https://www.pace.edu/program/psychology-ma Public Administration, MPA – https://www.pace.edu https://online.pace.edu
----------------------------------------------------------------------------------------------------
Iwe: mune ma 

In [6]:
# Zip the model for download
!zip -r shona_chatbot_model.zip ./shona_chatbot_model

  adding: shona_chatbot_model/ (stored 0%)
  adding: shona_chatbot_model/config.json (deflated 59%)
  adding: shona_chatbot_model/model.safetensors (deflated 7%)
  adding: shona_chatbot_model/training_args.bin (deflated 51%)


In [8]:
# === Install packages ===
!pip install evaluate bert-score rouge-score sacrebleu transformers sentence-transformers

import pandas as pd
import evaluate
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
import torch

# =========================================
# 1. Load your models
# =========================================

# Baseline Flan-T5
baseline_model_name = "google/flan-t5-small"
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(baseline_model_name)
baseline_pipeline = pipeline("text2text-generation", model=baseline_model, tokenizer=baseline_tokenizer)

# Shona model (Intent Classifier)
shona_model_path = "/content/shona_chatbot_model"  # your fine-tuned path
shona_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
shona_model = AutoModelForSequenceClassification.from_pretrained(shona_model_path)
shona_classifier = pipeline("text-classification", model=shona_model, tokenizer=shona_tokenizer, return_all_scores=True)

# Label mapping for Shona model
unique_intents = sorted(pd.read_csv('/content/downsampled_chitchat_dataset.csv')['intent'].unique())
id2label = {i: intent for i, intent in enumerate(unique_intents)}

# =========================================
# 2. Test set (Shona slang + English)
# =========================================
# Ideally, load from a CSV of test queries and reference responses
# Format: query | reference
test_data = [
    {"query": "Mauya sei?", "reference": "Ndinofara kukuona, hesi!"},
    {"query": "Ndiri kuda kunyora kuapply kuchikoro", "reference": "Waita apply! Chikumbiro chako chakatumirwa."},
    {"query": "What graduate programs are available?", "reference": "We offer multiple graduate programs in business, computer science, and more."},
    {"query": "Ndeipi Finance Department contact?", "reference": "Bata Finance Department panhamba idzi: 646-479-3688"},
]

df_eval = pd.DataFrame(test_data)

# =========================================
# 3. Evaluation helpers
# =========================================
def classify_intent(text):
    result = shona_classifier(text)
    pred = max(result[0], key=lambda x: x["score"])
    label_index = int(pred["label"].replace("LABEL_", ""))
    return id2label[label_index], pred["score"]

def generate_baseline(query):
    prompt = f"Answer this question about Pace University graduate programs: {query}"
    return baseline_pipeline(prompt, max_new_tokens=150)[0]['generated_text']

def generate_shona(query):
    intent, conf = classify_intent(query)
    # Simple rule-based responses for evaluation
    if intent == "greeting":
        return "Hesi shamwari! Uri sei hako?"
    elif intent == "finance":
        return "Bata Finance Department panhamba idzi: 646-479-3688"
    elif intent == "program_inquiry":
        return "Unoda program ipi? Ndine runyorwa rwezviripo."
    elif any(word in query.lower() for word in ["apply", "nyora", "kunyora", "register", "application"]):
        return "Waita apply! Chikumbiro chako chakatumirwa kuchikoro."
    else:
        return "Ndine urombo, handina mhinduro yakajeka pane izvozvo."

# =========================================
# 4. Run evaluation
# =========================================
baseline_responses = []
shona_responses = []
pred_intents = []

for q in df_eval["query"]:
    shona_responses.append(generate_shona(q))
    baseline_responses.append(generate_baseline(q))
    intent, _ = classify_intent(q)
    pred_intents.append(intent)

df_eval["shona_response"] = shona_responses
df_eval["baseline_response"] = baseline_responses
df_eval["pred_intent"] = pred_intents

# =========================================
# 5. Automatic metrics
# =========================================
# BLEU
bleu = evaluate.load("sacrebleu")
df_eval["bleu_shona"] = [bleu.compute(predictions=[pred], references=[[ref]])["score"]
                         for pred, ref in zip(df_eval["shona_response"], df_eval["reference"])]
df_eval["bleu_baseline"] = [bleu.compute(predictions=[pred], references=[[ref]])["score"]
                            for pred, ref in zip(df_eval["baseline_response"], df_eval["reference"])]

# ROUGE-L
rouge = evaluate.load("rouge")
df_eval["rougeL_shona"] = [rouge.compute(predictions=[pred], references=[ref])["rougeL"]
                           for pred, ref in zip(df_eval["shona_response"], df_eval["reference"])]
df_eval["rougeL_baseline"] = [rouge.compute(predictions=[pred], references=[ref])["rougeL"]
                              for pred, ref in zip(df_eval["baseline_response"], df_eval["reference"])]

# BERTScore
bertscore = evaluate.load("bertscore")
bs_shona = bertscore.compute(predictions=df_eval["shona_response"].tolist(),
                             references=df_eval["reference"].tolist(), lang="en")
bs_baseline = bertscore.compute(predictions=df_eval["baseline_response"].tolist(),
                                references=df_eval["reference"].tolist(), lang="en")
df_eval["bertscore_shona"] = bs_shona["f1"]
df_eval["bertscore_baseline"] = bs_baseline["f1"]

# =========================================
# 6. Save & display results
# =========================================
df_eval.to_csv("model_comparison_results.csv", index=False)
print(df_eval)


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

Device set to use cuda:0
Device set to use cuda:0


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                   query  \
0                             Mauya sei?   
1   Ndiri kuda kunyora kuapply kuchikoro   
2  What graduate programs are available?   
3     Ndeipi Finance Department contact?   

                                           reference  \
0                           Ndinofara kukuona, hesi!   
1        Waita apply! Chikumbiro chako chakatumirwa.   
2  We offer multiple graduate programs in busines...   
3  Bata Finance Department panhamba idzi: 646-479...   

                                      shona_response  \
0  Ndine urombo, handina mhinduro yakajeka pane i...   
1  Waita apply! Chikumbiro chako chakatumirwa kuc...   
2  Ndine urombo, handina mhinduro yakajeka pane i...   
3  Ndine urombo, handina mhinduro yakajeka pane i...   

                      baseline_response pred_intent  bleu_shona  \
0                       Pace University    chitchat    4.767707   
1  ndiri kuda kunyora kuapply kuchikoro    chitchat   70.710678   
2               

In [None]:
from transformers import pipeline

# Create a pipeline for text classification
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True
)

# Create a reverse mapping from ID to label
id2label = {v: k for k, v in label_map.items()}

# Mapping of intents to responses
intent_to_response = {
    "greeting": "Mhoro! Unoda rubatsiro nei nhasi?",
    "religion": "Iwewe what do you believe?",
    "chit chat": "Welcome to the slang chatbot, ask your question or opinion.",
    "sport": "Mabhora anouraya aya kkk!",
    "deadline": "Deadline yeapplication ndeye November 30, 2025.",
    # Add more intents and responses as needed
}

# Function to classify input text and return intent name
def classify_intent(text):
    result = classifier(text)
    pred = max(result[0], key=lambda x: x["score"])

    # Extract label index from 'LABEL_3', etc.
    label_index = int(pred["label"].replace("LABEL_", ""))

    # Map back to actual intent name
    intent_name = id2label[label_index]

    return intent_name, pred["score"]

# Function to get chatbot response from input
def chatbot_response(user_input):
    intent, confidence = classify_intent(user_input)
    response = intent_to_response.get(
        intent,
        "Ndine urombo, handina kunzwisisa. Unogona kudzokorora here?"
    )
    return f"[{intent} | confidence: {confidence:.2f}] {response}"

# Start chat loop
print("Shona Chatbot 🤖: Mhoro! Ndibvunzei zvamunoda (type 'exit' to quit)")

while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]:
        print("Chatbot: Zvakanaka, tichaonana zvakare!")
        break

    response = chatbot_response(user_input)
    print("Chatbot:", response)

Device set to use cuda:0


Shona Chatbot 🤖: Mhoro! Ndibvunzei zvamunoda (type 'exit' to quit)
You: makadii henyu
Chatbot: [chitchat | confidence: 1.00] Ndine urombo, handina kunzwisisa. Unogona kudzokorora here?
You: mari yangu yakwana here
Chatbot: [finance | confidence: 1.00] Ndine urombo, handina kunzwisisa. Unogona kudzokorora here?
You: mufundisi ariko here
Chatbot: [religion | confidence: 1.00] Iwewe what do you believe?
You: ndokuda babe wangu
Chatbot: [romance | confidence: 1.00] Ndine urombo, handina kunzwisisa. Unogona kudzokorora here?
You: wadii shamwari
Chatbot: [greeting | confidence: 1.00] Mhoro! Unoda rubatsiro nei nhasi?
You: exit
Chatbot: Zvakanaka, tichaonana zvakare!


In [6]:
# === Imports ===
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForSequenceClassification

# === Load CSV Data ===
filepath = '/content/pace_graduate_programs (1).csv'  # Upload your file here
df = pd.read_csv(filepath)

program_names = df['Program Name'].tolist()
program_links = df['Program Link'].tolist()

# === Embed Program Names ===
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(program_names)

# === Initialize Chroma Vector Store ===
chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))

# Get or create the collection
collection = chroma_client.get_or_create_collection(name="pace_programs")

# Check if the collection is empty before adding data
if collection.count() == 0:
    collection.add(
        documents=program_names,
        metadatas=[{"link": link} for link in program_links],
        embeddings=embeddings.tolist(),
        ids=[f"id{i}" for i in range(len(program_names))]
    )

# === Semantic Search ===
def search_programs(user_query, top_k=5):
    query_embed = embedding_model.encode([user_query])[0].tolist()
    results = collection.query(
        query_embeddings=[query_embed],
        n_results=top_k
    )
    programs = results['documents'][0]
    metadata = results['metadatas'][0]
    return list(zip(programs, [meta['link'] for meta in metadata]))

# === Baseline Model (Flan-T5) for Text Generation ===
baseline_model_name = "google/flan-t5-small"
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(baseline_model_name)
baseline_pipeline = pipeline("text2text-generation", model=baseline_model, tokenizer=baseline_tokenizer)

# === Shona Slang-Aware Model for Intent Classification ===
shona_model_path = "/content/shona_chatbot_model"  # Your fine-tuned model path
# Load tokenizer and model for sequence classification
shona_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
shona_model = AutoModelForSequenceClassification.from_pretrained(shona_model_path)
shona_classifier = pipeline(
    "text-classification",
    model=shona_model,
    tokenizer=shona_tokenizer,
    return_all_scores=True
)

# Create a reverse mapping from ID to label for the fine-tuned model
# This assumes you have the label_map from your training script available
# If not, you might need to load it or recreate it based on your training data
# For this example, let's assume the label_map is available in the environment
# If not, you would need to add code to load or define it.
# Example: id2label = {0: "greeting", 1: "religion", ...}
# For demonstration, we'll use a placeholder. Replace with your actual label_map.
# If label_map is not available, you might need to run the training cell first
# or load the label mapping from where you saved it.
# Assuming label_map is available from previous execution:
unique_intents = sorted(pd.read_csv('/content/downsampled_chitchat_dataset.csv')['intent'].unique())
label_map = {intent: i for i, intent in enumerate(unique_intents)}
id2label = {v: k for k, v in label_map.items()}


def classify_intent_shona(text):
    result = shona_classifier(text)
    pred = max(result[0], key=lambda x: x["score"])
    label_index = int(pred["label"].replace("LABEL_", ""))
    intent_name = id2label[label_index]
    return intent_name, pred["score"]


# === RAG Answer Generator ===
def generate_answer_with_models(query, program_tuples):
    # Classify intent using the Shona model
    intent, confidence = classify_intent_shona(query)

    context = "\n".join([f"{name} – {link}" for name, link in program_tuples])

    # Incorporate intent into the prompt for the generation model
    prompt = f"""You are a helpful assistant that answers questions about graduate programs at Pace University.
The user's intent seems to be about: {intent} (confidence: {confidence:.2f}).

Programs:
{context}

Question: {query}

Answer:"""
    return baseline_pipeline(prompt, max_new_tokens=150)[0]['generated_text']

# === Comparison Test Queries ===
test_queries = [
    "Pane here MSc muData Science?",
    "Ndinga apply sei kuPACE?",
    "Pane finance department here?",
    "Hi, what graduate programs do you have?",
    "Chii chinonzi MBA?"
]

print("\n=== RAG with Shona Intent Classification and Flan-T5 Generation ===\n")
for query in test_queries:
    programs = search_programs(query)

    rag_answer = generate_answer_with_models(query, programs)

    print(f"📝 Query: {query}")
    print(f"🤖 RAG Answer: {rag_answer}")
    print("-" * 80)

Device set to use cuda:0
Device set to use cuda:0



=== RAG with Shona Intent Classification and Flan-T5 Generation ===

📝 Query: Pane here MSc muData Science?
🤖 RAG Answer: Computer Science, MS – https://www.pace.eduhttps://online.pace.edu/graduate-programs/ms-in-data-science/ Data Science, MS – https://www.pace.edu/graduate-programs/ms-in-computer-science/ Computer Science, MS – https://www.pace.edu/graduate-programs/ms-in-computer-science/ Computer Science, MS – https://www.pace.edu/graduate-programs/ms-in-computer-science/ Computer Science, MS – https://www.pace.edu/
--------------------------------------------------------------------------------
📝 Query: Ndinga apply sei kuPACE?
🤖 RAG Answer: Ndinga apply sei kuPACE?
--------------------------------------------------------------------------------
📝 Query: Pane finance department here?
🤖 RAG Answer: pace.edu
--------------------------------------------------------------------------------
📝 Query: Hi, what graduate programs do you have?
🤖 RAG Answer: Business, General, MBA – https:/

NameError: name 'trainer' is not defined

In [8]:
# === Imports ===
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForSequenceClassification

# === Load CSV Data ===
filepath = '/content/pace_graduate_programs (1).csv'
df = pd.read_csv(filepath)

program_names = df['Program Name'].tolist()
program_links = df['Program Link'].tolist()

# === Embed Program Names ===
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(program_names)

# === Initialize Chroma Vector Store ===
chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
collection = chroma_client.get_or_create_collection(name="pace_programs")

if collection.count() == 0:
    collection.add(
        documents=program_names,
        metadatas=[{"link": link} for link in program_links],
        embeddings=embeddings.tolist(),
        ids=[f"id{i}" for i in range(len(program_names))]
    )

# === Semantic Search ===
def search_programs(user_query, top_k=5):
    query_embed = embedding_model.encode([user_query])[0].tolist()
    results = collection.query(
        query_embeddings=[query_embed],
        n_results=top_k
    )
    programs = results['documents'][0]
    metadata = results['metadatas'][0]
    return list(zip(programs, [meta['link'] for meta in metadata]))

# === Baseline Model (Flan-T5) ===
baseline_model_name = "google/flan-t5-small"
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(baseline_model_name)
baseline_pipeline = pipeline("text2text-generation", model=baseline_model, tokenizer=baseline_tokenizer)

# === Shona Slang-Aware Model (Intent Classification) ===
shona_model_path = "/content/shona_chatbot_model"
shona_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
shona_model = AutoModelForSequenceClassification.from_pretrained(shona_model_path)
shona_classifier = pipeline("text-classification", model=shona_model, tokenizer=shona_tokenizer, return_all_scores=True)

# === Label Mapping from Dataset ===
unique_intents = sorted(pd.read_csv('/content/downsampled_chitchat_dataset.csv')['intent'].unique())
label_map = {intent: i for i, intent in enumerate(unique_intents)}
id2label = {v: k for k, v in label_map.items()}

def classify_intent_shona(text):
    result = shona_classifier(text)
    pred = max(result[0], key=lambda x: x["score"])
    label_index = int(pred["label"].replace("LABEL_", ""))
    intent_name = id2label[label_index]
    return intent_name, pred["score"]

# === Shona RAG Model (Flan-T5) ===
# We'll reuse Flan-T5 for generation but add Shona intent to prompt
def generate_shona_rag_answer(query, program_tuples):
    intent, confidence = classify_intent_shona(query)
    context = "\n".join([f"{name} – {link}" for name, link in program_tuples])
    prompt = f"""You are a helpful assistant that answers questions about graduate programs at Pace University.
The user's intent seems to be about: {intent} (confidence: {confidence:.2f}).

Programs:
{context}

Question: {query}

Answer:"""
    return baseline_pipeline(prompt, max_new_tokens=150)[0]['generated_text']

# === Baseline RAG Answer ===
def generate_baseline_rag_answer(query, program_tuples):
    context = "\n".join([f"{name} – {link}" for name, link in program_tuples])
    prompt = f"""You are a helpful assistant that answers questions about graduate programs at Pace University.

Programs:
{context}

Question: {query}

Answer:"""
    return baseline_pipeline(prompt, max_new_tokens=150)[0]['generated_text']

# === Live Chat Comparison ===
print("\n🎓 PaceBot Comparison Mode")
print("Type your question (type 'exit' to quit)\n")

while True:
    user_input = input("You: ")
    if user_input.strip().lower() in ['exit', 'quit']:
        print("PaceBot: Goodbye! 🎓")
        break

    # Search matching programs
    programs = search_programs(user_input)

    # Get both answers
    shona_answer = generate_shona_rag_answer(user_input, programs)
    baseline_answer = generate_baseline_rag_answer(user_input, programs)

    # Display side-by-side comparison
    print("\n🔹 Shona-Aware Model:")
    print(shona_answer)
    print("\n🔹 Baseline Flan-T5 Model:")
    print(baseline_answer)
    print("-" * 80)


Device set to use cuda:0
Device set to use cuda:0



🎓 PaceBot Comparison Mode
Type your question (type 'exit' to quit)

You: madii

🔹 Shona-Aware Model:
Pace University

🔹 Baseline Flan-T5 Model:
Pace University
--------------------------------------------------------------------------------
You: exit
PaceBot: Goodbye! 🎓


In [1]:
# === Imports ===
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

# ===============================
# 1. SETUP DATA & BASELINE MODELS
# ===============================

# Load CSV Data for RAG
filepath = '/content/pace_graduate_programs (1).csv'  # Replace with your file path
df = pd.read_csv(filepath)
program_names = df['Program Name'].tolist()
program_links = df['Program Link'].tolist()

# Embed Program Names
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(program_names)

# Initialize Chroma Vector Store
chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
collection = chroma_client.get_or_create_collection(name="pace_programs")

# Add data if collection is empty
if collection.count() == 0:
    collection.add(
        documents=program_names,
        metadatas=[{"link": link} for link in program_links],
        embeddings=embeddings.tolist(),
        ids=[f"id{i}" for i in range(len(program_names))]
    )

# Semantic Search
def search_programs(user_query, top_k=5):
    query_embed = embedding_model.encode([user_query])[0].tolist()
    results = collection.query(query_embeddings=[query_embed], n_results=top_k)
    programs = results['documents'][0]
    metadata = results['metadatas'][0]
    return list(zip(programs, [meta['link'] for meta in metadata]))

# Baseline Flan-T5 Model
baseline_model_name = "google/flan-t5-small"
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(baseline_model_name)
baseline_pipeline = pipeline("text2text-generation", model=baseline_model, tokenizer=baseline_tokenizer)

# ===============================
# 2. SETUP SHONA SLANG CLASSIFIER
# ===============================

shona_model_path = "/content/shona_chatbot_model"  # Your fine-tuned model path
shona_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
shona_model = AutoModelForSequenceClassification.from_pretrained(shona_model_path)
shona_classifier = pipeline("text-classification", model=shona_model, tokenizer=shona_tokenizer, return_all_scores=True)

# Load label map
unique_intents = sorted(pd.read_csv('/content/downsampled_chitchat_dataset.csv')['intent'].unique())
label_map = {intent: i for i, intent in enumerate(unique_intents)}
id2label = {v: k for k, v in label_map.items()}

# ===============================
# 3. CLASSIFICATION FUNCTIONS
# ===============================

def classify_intent(text):
    result = shona_classifier(text)
    pred = max(result[0], key=lambda x: x["score"])
    label_index = int(pred["label"].replace("LABEL_", ""))
    intent_name = id2label[label_index]
    return intent_name, pred["score"]

# ===============================
# 4. RULE-BASED HANDLER FOR SHONA BOT
# ===============================

def handle_intent_response(intent, user_input):
    intent = intent.lower()
    user_input_lower = user_input.lower()

    if intent == "greeting":
        return "Hesi shamwari! Uri sei hako?"

    elif intent == "finance":
        return "Bata Finance Department panhamba idzi: 646-479-3688"

    elif intent == "religion":
        return "Bata Department reReligion pa: www.religion@pace.edu"

    elif intent == "program_inquiry":
        return "Unoda program ipi? Ndine runyorwa rwezviripo."

    elif any(word in user_input_lower for word in ["apply", "nyora", "kunyora", "register", "application"]):
        name = input("ShonaBot: Ndokumbirawo zita rako rizere: ")
        education = input("ShonaBot: Wakadzidza kupi kare? (e.g., BSc, diploma, etc.): ")
        email = input("ShonaBot: Email yako ndeipi?: ")
        print(f"\n[INFO] Application received:\nName: {name}\nEducation: {education}\nEmail: {email}")
        return "Waita apply! Chikumbiro chako chakatumirwa kuchikoro. Tichakutumira zvimwe ruzivo 💼"

    return None

# ===============================
# 5. RAG + FLAN-T5 GENERATION
# ===============================

def generate_answer_with_rag(query):
    intent, confidence = classify_intent(query)
    programs = search_programs(query)
    context = "\n".join([f"{name} – {link}" for name, link in programs])
    prompt = f"""You are a helpful assistant that answers questions about graduate programs at Pace University.
The user's intent is: {intent} (confidence: {confidence:.2f}).

Programs:
{context}

Question: {query}

Answer:"""
    answer = baseline_pipeline(prompt, max_new_tokens=150)[0]['generated_text']
    return intent, confidence, answer

# ===============================
# 6. EXIT CHECK
# ===============================

def is_exit(text):
    return text.strip().lower() in ["exit", "quit", "bye"]

# ===============================
# 7. INTERACTIVE CHAT LOOP
# ===============================

print("💬 Welcome! Type your question in Shona slang or English. (Type 'exit' to quit)\n")

while True:
    user_input = input("Iwe: ")

    if is_exit(user_input):
        print("PaceBot: Zvakanaka, tichaonana zvakare! 🎓")
        break

    # Shona chatbot
    intent, confidence = classify_intent(user_input)
    special_response = handle_intent_response(intent, user_input)

    print(f"\n📌 Detected Intent: {intent} (confidence {confidence:.2f})")

    if special_response is not None:
        print(f"🤖 ShonaBot: {special_response}")
    else:
        print("🤖 ShonaBot: Ndine urombo, handina mhinduro yakajeka pane izvozvo.")

    # Baseline RAG
    intent_rag, conf_rag, rag_answer = generate_answer_with_rag(user_input)
    print(f"🤖 Baseline RAG: {rag_answer}")
    print("-" * 100)


ModuleNotFoundError: No module named 'chromadb'

In [10]:
# === Imports ===
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

# ===============================
# 1. SETUP DATA & BASELINE MODELS
# ===============================

# === Load CSV Data for RAG ===
filepath = '/content/pace_graduate_programs (1).csv'  # Replace with your file path
df = pd.read_csv(filepath)
program_names = df['Program Name'].tolist()
program_links = df['Program Link'].tolist()

# === Embed Program Names ===
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(program_names)

# === Initialize Chroma Vector Store ===
chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
collection = chroma_client.get_or_create_collection(name="pace_programs")

# Add data if collection is empty
if collection.count() == 0:
    collection.add(
        documents=program_names,
        metadatas=[{"link": link} for link in program_links],
        embeddings=embeddings.tolist(),
        ids=[f"id{i}" for i in range(len(program_names))]
    )

# === Semantic Search Function ===
def search_programs(user_query, top_k=5):
    query_embed = embedding_model.encode([user_query])[0].tolist()
    results = collection.query(query_embeddings=[query_embed], n_results=top_k)
    programs = results['documents'][0]
    metadata = results['metadatas'][0]
    return list(zip(programs, [meta['link'] for meta in metadata]))

# === Baseline Flan-T5 Model for Answer Generation ===
baseline_model_name = "google/flan-t5-small"
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(baseline_model_name)
baseline_pipeline = pipeline("text2text-generation", model=baseline_model, tokenizer=baseline_tokenizer)

# ===============================
# 2. SETUP SHONA SLANG CLASSIFIER
# ===============================

shona_model_path = "/content/shona_chatbot_model"  # Your fine-tuned model path
shona_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
shona_model = AutoModelForSequenceClassification.from_pretrained(shona_model_path)
shona_classifier = pipeline("text-classification", model=shona_model, tokenizer=shona_tokenizer, return_all_scores=True)

# Load label map from your training data
unique_intents = sorted(pd.read_csv('/content/downsampled_chitchat_dataset.csv')['intent'].unique())
label_map = {intent: i for i, intent in enumerate(unique_intents)}
id2label = {v: k for k, v in label_map.items()}

# Mapping intents to fixed responses (for first chatbot)
intent_to_response = {
    "greeting": "Mhoro! Unoda rubatsiro nei nhasi?",
    "religion": "Iwewe what do you believe?",
    "chit chat": "Welcome to the slang chatbot, ask your question or opinion.",
    "sport": "Mabhora anouraya aya kkk!",
    "deadline": "Deadline yeapplication ndeye November 30, 2025.",
}

# ===============================
# 3. CLASSIFICATION & RESPONSE FUNCTIONS
# ===============================

def classify_intent_shona(text):
    result = shona_classifier(text)
    pred = max(result[0], key=lambda x: x["score"])
    label_index = int(pred["label"].replace("LABEL_", ""))
    intent_name = id2label[label_index]
    return intent_name, pred["score"]

# First model: Intent → fixed response
def chatbot_response(user_input):
    intent, confidence = classify_intent_shona(user_input)
    response = intent_to_response.get(intent, "Ndine urombo, handina kunzwisisa. Unogona kudzokorora here?")
    return intent, confidence, response

# Baseline RAG model: Intent + Retrieval + Flan-T5
def generate_answer_with_models(query):
    intent, confidence = classify_intent_shona(query)
    programs = search_programs(query)
    context = "\n".join([f"{name} – {link}" for name, link in programs])
    prompt = f"""You are a helpful assistant that answers questions about graduate programs at Pace University.
The user's intent is: {intent} (confidence: {confidence:.2f}).

Programs:
{context}

Question: {query}

Answer:"""
    answer = baseline_pipeline(prompt, max_new_tokens=150)[0]['generated_text']
    return intent, confidence, answer

# ===============================
# 4. TEST & COMPARE
# ===============================

# Test queries with Shona slang
test_queries = [
    "madii",
    "mune program MBA yeku online",
    "kunita mari PACE?",
    "mune mufundisi here ku Pce?",
    "Pane finance department here?",
    "what programs do you offer"
]

print("\n=== COMPARISON: Shona Slang Chatbot vs Baseline RAG ===\n")
for query in test_queries:
    intent1, conf1, resp1 = chatbot_response(query)
    intent2, conf2, resp2 = generate_answer_with_models(query)

    print(f"📝 Query: {query}")
    print(f"1️⃣ Shona Chatbot → [{intent1} | {conf1:.2f}] {resp1}")
    print(f"2️⃣ Baseline RAG → [{intent2} | {conf2:.2f}] {resp2}")
    print("-" * 100)

# ===============================
# 5. OPTIONAL: Accuracy Evaluation
# ===============================

# If you have ground truth intents for these queries, you can do:
# ground_truth = ["greeting", "deadline", ...]
# Then measure accuracy for both.


Device set to use cuda:0
Device set to use cuda:0



=== COMPARISON: Shona Slang Chatbot vs Baseline RAG ===

📝 Query: Mhoro shamwari, pane here MSc muData Science?
1️⃣ Shona Chatbot → [greeting | 1.00] Mhoro! Unoda rubatsiro nei nhasi?
2️⃣ Baseline RAG → [greeting | 1.00] Mhoro shamwari, pane here MSc muData Science?
----------------------------------------------------------------------------------------------------
📝 Query: mune program MBA yeku online
1️⃣ Shona Chatbot → [chitchat | 1.00] Ndine urombo, handina kunzwisisa. Unogona kudzokorora here?
2️⃣ Baseline RAG → [chitchat | 1.00] www.pace.edu/program/information-systems-mba Investment Management, MBA – https://www.pace.edu/program/investment-management-mba Strategy and International Business, MBA – https://www.pace.edu/program/mba-international-business-and-strategy Marketing Management, MBA – https://www.pace.edu/program/marketing-management-mba Business Analytics, MBA – https://www.pace.edu/program/business-analytics-mba
---------------------------------------------------------

In [None]:
def handle_intent_response(intent, user_input):
    intent = intent.lower()
    user_input_lower = user_input.lower()

    # Model-based intent rules
    if intent == "greeting":
        return "Hesi shamwari! Uri sei hako?"

    elif intent == "finance":
        return "Bata Finance Department panhamba idzi: 646-479-3688"

    elif intent == "religion":
        return "Bata Department reReligion pa: www.religion@pace.edu"

    elif intent == "program_inquiry":
        return "Unoda program ipi? Ndine runyorwa rwezviripo."

    # Keyword-based fallback for 'apply' intent
    elif any(word in user_input_lower for word in ["apply", "nyora", "kunyora", "register", "application"]):
        name = input("PaceBot: Ndokumbirawo zita rako rizere: ")
        education = input("PaceBot: Wakadzidza kupi kare? (e.g., BSc, diploma, etc.): ")
        email = input("PaceBot: Email yako ndeipi?: ")

        # Optional logging
        print(f"\n[INFO] Application received:\nName: {name}\nEducation: {education}\nEmail: {email}")
        return "Waita apply! Chikumbiro chako chakatumirwa kuchikoro. Tichakutumira zvimwe ruzivo 💼"

    return None  # Let RAG take over if nothing matches


while True:
    user_input = input("Iwe: ")

    if is_exit(user_input):
        print("PaceBot: Zvakanaka, tichaonana zvakare! 🎓")
        break

    intent, confidence = classify_intent(user_input)
    special_response = handle_intent_response(intent, user_input)

    special_response = handle_intent_response(intent, user_input)

    if special_response is not None:
       print("PaceBot:", special_response)
    continue


    program_matches = search_programs(user_input)
    response = generate_answer_with_shona_model(user_input, program_matches)
    print("PaceBot:", response)

In [None]:

while True:
    user_input = input("Iwe: ")

    if is_exit(user_input):
        print("PaceBot: Zvakanaka, tichaonana zvakare! 🎓")
        break

    intent, confidence = classify_intent(user_input)
    special_response = handle_intent_response(intent, user_input)

    special_response = handle_intent_response(intent, user_input)

    if special_response is not None:
       print("PaceBot:", special_response)
    continue


    program_matches = search_programs(user_input)
    response = generate_answer_with_shona_model(user_input, program_matches)
    print("PaceBot:", response)