In [7]:
# Install required packages (if not installed)
!pip install chromadb pandas sentence-transformers transformers

Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, f1_score
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import AutoModelForSeq2SeqLM, pipeline

In [3]:
# === Imports ===
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

# ===============================
# 1. SETUP DATA & BASELINE MODELS
# ===============================

# Load CSV Data for RAG
filepath = '/content/pace_graduate_programs (1).csv'  # Replace with your file path
df = pd.read_csv(filepath)
program_names = df['Program Name'].tolist()
program_links = df['Program Link'].tolist()

# Embed Program Names
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(program_names)

# Initialize Chroma Vector Store
chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
collection = chroma_client.get_or_create_collection(name="pace_programs")

# Add data if collection is empty
if collection.count() == 0:
    collection.add(
        documents=program_names,
        metadatas=[{"link": link} for link in program_links],
        embeddings=embeddings.tolist(),
        ids=[f"id{i}" for i in range(len(program_names))]
    )

# Semantic Search
def search_programs(user_query, top_k=5):
    query_embed = embedding_model.encode([user_query])[0].tolist()
    results = collection.query(query_embeddings=[query_embed], n_results=top_k)
    programs = results['documents'][0]
    metadata = results['metadatas'][0]
    return list(zip(programs, [meta['link'] for meta in metadata]))

# Baseline Flan-T5 Model
baseline_model_name = "google/flan-t5-small"
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(baseline_model_name)
baseline_pipeline = pipeline("text2text-generation", model=baseline_model, tokenizer=baseline_tokenizer)

# ===============================
# 2. SETUP SHONA SLANG CLASSIFIER
# ===============================

# shona_model_path = "/content/shona_chatbot_model"  # Your fine-tuned model path
shona_model_repo_id = "HappymoreMasoka/shona-intent-classification-model" # Hugging Face repo ID

shona_tokenizer = AutoTokenizer.from_pretrained(shona_model_repo_id, trust_remote_code=True)
shona_model = AutoModelForSequenceClassification.from_pretrained(shona_model_repo_id, trust_remote_code=True)
shona_classifier = pipeline("text-classification", model=shona_model, tokenizer=shona_tokenizer, return_all_scores=True)

# Load label map
unique_intents = sorted(pd.read_csv('/content/slang_dataset_with_contexts_and_intent.csv')['intent'].unique())
label_map = {intent: i for i, intent in enumerate(unique_intents)}
id2label = {v: k for k, v in label_map.items()}

# ===============================
# 3. CLASSIFICATION FUNCTIONS
# ===============================

def classify_intent(text):
    result = shona_classifier(text)
    pred = max(result[0], key=lambda x: x["score"])
    label_index = int(pred["label"].replace("LABEL_", ""))
    intent_name = id2label[label_index]
    return intent_name, pred["score"]

# ===============================
# 4. RULE-BASED HANDLER FOR SHONA BOT
# ===============================
def handle_intent_response(intent, user_input):
    intent = intent.lower()
    user_input_lower = user_input.lower()

    # -------------------
    # GREETINGS
    # -------------------
    if intent == "greeting":
        greetings = [
            "Hesi shamwari! Uri sei hako?",
            "Mhoroi, zvakanaka here?",
            "Mangwanani akanaka!",
            "Masikati akanaka, shamwari."
        ]
        return greetings[0]  # could randomize

    elif intent == "religious_greeting":
        return "Mangwanani akanaka hama dzaMwari 🙏🏽. Mwari ngavakuropafadzei."

    elif intent == "farewell":
        return "Sarai zvakanaka! Tichataurirana zvakare."

    # -------------------
    # APPRECIATION / GRATITUDE
    # -------------------
    elif intent == "gratitude":
        return "Ndokutendai zvikuru! Makaita basa."

    elif intent == "celebration":
        return "Makorokoto! Mwari akuropafadzei pamufaro uyu 🎉"

    # -------------------
    # REQUESTS / HELP
    # -------------------
    elif intent == "request":
        return "Ndiri kunzwa chikumbiro chako. Chii chaunoda kubatsirwa nacho?"

    # -------------------
    # RELIGION
    # -------------------
    elif intent == "prayer_or_blessing":
        return "Tinosimudzira minamato yedu kuna Mwari. Garai makasimba 🙏🏽"

    elif intent == "religion":
        return "Bata Department reReligion pa: www.religion@pace.edu"

    # -------------------
    # FAMILY / FRIENDSHIP
    # -------------------
    elif intent == "family":
        return "Mhuri inokosha! pa Pace University tinoda uzive kut you are part of the bigger family?"

    elif intent == "friendship":
        return "Zvirikufamba sei shamwari yangu?"

    elif intent == "romantic":
        return "Aaaah, zviri pachena kuti rudo ruripo ❤️"

    # -------------------
    # EDUCATION / WORK
    # -------------------
    elif intent == "educational":
        return "Zvekudzidza hazviperere pano. Unoda ruzivo pamusoro pemapurogiramu api?"

    elif intent == "work":
        return "bata carear department pa www.carears@pace.edu"

    # -------------------
    # APPLICATION FLOW
    # -------------------
    elif any(word in user_input_lower for word in ["apply", "nyora", "kunyora", "register", "application","program"]):
        name = input("ShonaBot: Ndokumbirawo zita rako rizere: ")
        education = input("ShonaBot: Wakadzidza kupi kare? (e.g., BSc, diploma, etc.): ")
        email = input("ShonaBot: Email yako ndeipi?: ")
        print(f"\n[INFO] Application received:\nName: {name}\nEducation: {education}\nEmail: {email}")
        return "Waita apply! Chikumbiro chako chakatumirwa kuchikoro. Tichakutumira zvimwe ruzivo 💼"

    # -------------------
    # MISC / FALLBACK
    # -------------------
    elif intent == "sympathy":
        return "Ndinonzwa nemi. Mwari vakupai nyaradzo munguva ino yakaoma."

    elif intent == "health_update":
        return "Ndapota chengeta hutano hwako. Kana zviri serious enda kuchipatara."

    elif intent == "sports":
        return "yes we have a lot of sporting activities pa Pace University, bata sports department pa sports@pace.edu"

    elif intent == "humor":
        return "Wandiseka zve! 😂😂😂😂"

    # Default fallback
    return "Ndanzwisisa zvawataura, asi ndinoda rumwe ruzivo kuti ndikubatsire."



# ===============================
# 5. RAG + FLAN-T5 GENERATION
# ===============================

def generate_answer_with_rag(query):
    intent, confidence = classify_intent(query)
    programs = search_programs(query)
    context = "\n".join([f"{name} – {link}" for name, link in programs])
    prompt = f"""You are a helpful assistant that answers questions about graduate programs at Pace University.
The user's intent is: {intent} (confidence: {confidence:.2f}).

Programs:
{context}

Question: {query}

Answer:"""
    answer = baseline_pipeline(prompt, max_new_tokens=150)[0]['generated_text']
    return intent, confidence, answer

# ===============================
# 6. EXIT CHECK
# ===============================

def is_exit(text):
    return text.strip().lower() in ["exit", "quit", "bye"]

# ===============================
# 7. INTERACTIVE CHAT LOOP
# ===============================

print("💬 Welcome! Type your question in Shona slang or English. (Type 'exit' to quit)\n")

while True:
    user_input = input("Iwe: ")

    if is_exit(user_input):
        print("PaceBot: Zvakanaka, tichaonana zvakare! 🎓")
        break

    # Shona chatbot
    intent, confidence = classify_intent(user_input)
    special_response = handle_intent_response(intent, user_input)

    print(f"\n📌 Detected Intent: {intent} (confidence {confidence:.2f})")

    if special_response is not None:
        print(f"🤖 ShonaBot: {special_response}")
    else:
        print("🤖 ShonaBot: Ndine urombo, handina mhinduro yakajeka pane izvozvo.")

    # Baseline RAG
    intent_rag, conf_rag, rag_answer = generate_answer_with_rag(user_input)
    print(f"🤖 Baseline RAG: {rag_answer}")
    print("-" * 100)

Device set to use cpu
Device set to use cpu


💬 Welcome! Type your question in Shona slang or English. (Type 'exit' to quit)

Iwe: madii

📌 Detected Intent: Greeting (confidence 1.00)
🤖 ShonaBot: Hesi shamwari! Uri sei hako?
🤖 Baseline RAG: Pace University
----------------------------------------------------------------------------------------------------
Iwe: mune ma program api pa pace
ShonaBot: Ndokumbirawo zita rako rizere: chibaba chacho
ShonaBot: Wakadzidza kupi kare? (e.g., BSc, diploma, etc.): telecomz
ShonaBot: Email yako ndeipi?: hmasoka@gmail.com

[INFO] Application received:
Name: chibaba chacho
Education: telecomz
Email: hmasoka@gmail.com

📌 Detected Intent: general (confidence 1.00)
🤖 ShonaBot: Waita apply! Chikumbiro chako chakatumirwa kuchikoro. Tichakutumira zvimwe ruzivo 💼
🤖 Baseline RAG: Pace University-Lenox Hill Hospital Physician Assistant Program, MS – https://www.pace.edu/program/pace-university-lenox-hill-hospital-physician-assistant-program-ms Higher Education Administration and Student Affairs, MA – http