<a href="https://colab.research.google.com/github/LilliLee-1318/UIUC-helper-chatbot/blob/main/Set_up.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json, random

dialogs = [
    # ====== 👋 인사/기본 대화 ======
    {"input": "Hi!", "output": "Hey there! How’s it going today? 😊"},
    {"input": "How are you?", "output": "I’m doing great, thanks! How’s your day going so far?"},
    {"input": "Good morning!", "output": "Morning! Did you get enough sleep? ☕"},
    {"input": "Good night", "output": "Good night! Sleep well and recharge."},
    {"input": "Nice to meet you", "output": "Nice to meet you too! I’m here whenever you need to talk."},

    # ====== 💬 감정/공감 ======
    {"input": "I feel sad today.", "output": "I’m sorry you’re feeling that way. Want to talk about what’s been tough?"},
    {"input": "I’m so stressed out with school.", "output": "That’s totally understandable. How about taking a short break?"},
    {"input": "I failed my exam.", "output": "That must feel disappointing. But one test doesn’t define you — you’ve got this!"},
    {"input": "I’m nervous about my presentation.", "output": "That’s normal! Try practicing out loud — it really helps."},
    {"input": "I feel lonely.", "output": "That’s okay to feel. Maybe join a study group or club — connection helps."},

    # ====== 🎓 학교/공부 ======
    {"input": "How do I register for classes?", "output": "You can register via Course Explorer or Self-Service. Want me to explain waitlists?"},
    {"input": "Which gen eds are easy?", "output": "PSYC 100 and MUS 130 are known for being fun and not too heavy."},
    {"input": "How can I improve my GPA?", "output": "Try consistent short reviews instead of cramming. It really works!"},
    {"input": "Where can I study?", "output": "Grainger Library and Illini Union basement are both great spots."},

    # ====== 🍕 캠퍼스 생활 ======
    {"input": "Where can I eat?", "output": "Ikenberry Dining’s stir fry is a classic. Or check San Maru on Green Street!"},
    {"input": "Is ISR dining good?", "output": "Pretty solid! Great vegetarian options and desserts."},
    {"input": "Which dorm is nicest?", "output": "Nugent for quiet comfort, Hopkins for fun social vibes."},
    {"input": "How do I use MTD bus?", "output": "Just show your iCard! 50 Green and 22 Illini are most common routes."},

    # ====== 🌎 행정지원 ======
    {"input": "What is OPT?", "output": "OPT lets F-1 students work in the U.S. for 12 months after graduation."},
    {"input": "How do I apply for CPT?", "output": "Get approval from ISSS and your department. It’s for internships related to your major."},
    {"input": "Where is ISSS office?", "output": "Student Services Building, Room 201."},

    # ====== 🧠 조언/동기 ======
    {"input": "I can’t focus.", "output": "Try the Pomodoro technique — 25 minutes on, 5 minutes off."},
    {"input": "I feel unmotivated.", "output": "Start small — even one task done builds momentum."},
    {"input": "I procrastinate a lot.", "output": "You’re not alone. Small, consistent effort beats perfection."},

    # ====== 😄 자유 대화 ======
    {"input": "Tell me a joke", "output": "Why did the student eat his homework? Because the teacher said it was a piece of cake 🎂."},
    {"input": "Do you like humans?", "output": "Of course! You’re the reason I exist 😄."},
    {"input": "Do you sleep?", "output": "I wish! My code never rests."},
]

# 확장 및 셔플
extended = dialogs * 8
random.shuffle(extended)

with open("/content/uiuc_dialogs.json", "w") as f:
    json.dump(extended, f, indent=4)

print(f"✅ Saved /content/uiuc_dialogs.json with {len(extended)} examples!")

In [None]:
!pip install -q bitsandbytes transformers peft datasets accelerate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from transformers import Trainer, TrainingArguments

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

dataset = load_dataset("json", data_files="/content/uiuc_dialogs.json")

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

def preprocess(examples):
    text = [f"User: {i}\nAssistant: {o}" for i, o in zip(examples["input"], examples["output"])]
    tokens = tokenizer(text, truncation=True, padding="max_length", max_length=256)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_ds = dataset["train"].map(preprocess, batched=True)

args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(model=model, args=args, train_dataset=tokenized_ds)
trainer.train()

save_path = "/content/uiuc-qlora-chatbot"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"✅ Saved fine-tuned model to {save_path}")



In [7]:


!pip install -q transformers sentence-transformers faiss-cpu gradio bitsandbytes accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
import faiss, numpy as np
import requests
import gradio as gr


model_id = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16
)

model = PeftModel.from_pretrained(base_model, "/content/uiuc-qlora-chatbot")


print("✅ Model loaded successfully on A100!")


embedder = SentenceTransformer("all-MiniLM-L6-v2")


def detect_category(user_input):
    text = user_input.lower()
    if any(word in text for word in ["visa", "opt", "cpt", "f-1", "h-1", "isss", "tuition", "fee"]):
        return "admin"
    elif any(word in text for word in ["class", "course", "professor", "rate", "grade"]):
        return "course"
    elif any(word in text for word in ["bus", "dining", "menu", "meal", "dorm", "housing"]):
        return "lifestyle"
    else:
        return "general"


def fetch_admin_data():
    return [
        "OPT (Optional Practical Training) allows F-1 students to work in the U.S. for up to 12 months after completing their degree.",
        "To apply for OPT, students must first request an I-20 from the ISSS office.",
        "CPT (Curricular Practical Training) is for work experience that is part of your academic program.",
        "The tuition for undergraduate international students at UIUC is approximately $37,000 per year."
    ]

def fetch_course_data():
    return [
        "RateMyProfessor reviews can help identify professors with high teaching ratings.",
        "Grade disparity data for UIUC courses is available through the Course Explorer.",
        "CS 124 uses Kotlin as its main programming language.",
        "PSYC 210 is taught by professors with a 4.7 average rating on RateMyProfessor."
    ]

def fetch_lifestyle_data():
    return [
        "The Ikenberry Dining Hall serves breakfast from 7 AM to 10 AM.",
        "ISR Dining offers vegetarian and vegan-friendly options.",
        "MTD buses are free for students and run every 10-15 minutes during weekdays.",
        "Dorms like Nugent and Hopkins are part of the Ikenberry Commons on the west side of campus."
    ]


def retrieve_and_answer(user_input, docs):
    doc_embeds = embedder.encode(docs)
    index = faiss.IndexFlatL2(doc_embeds.shape[1])
    index.add(np.array(doc_embeds))


    query_vec = embedder.encode([user_input])
    _, idxs = index.search(np.array(query_vec), k=3)
    retrieved_docs = [docs[i] for i in idxs[0]]
    context = "\n".join(retrieved_docs)

    prompt = f"""
You are a helpful UIUC assistant.
Use only the information from the context below.
Do not invent information or guess.
Cite specific details if possible.

Context:
{context}

Question:
{user_input}

Answer:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    return answer

def chat_with_bot(user_input):
    category = detect_category(user_input)


    if category in ["admin", "course", "lifestyle"]:
        if category == "admin":
            docs = fetch_admin_data()
        elif category == "course":
            docs = fetch_course_data()
        else:
            docs = fetch_lifestyle_data()
        return retrieve_and_answer(user_input, docs)


    else:
        prompt = f"User: {user_input}\nAssistant:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)


        if "Assistant:" in answer:
            answer = answer.split("Assistant:")[-1].strip()

        return answer


uiuc_blue = "#1E3A8A"
uiuc_orange = "#FF552E"

with gr.Blocks(css=f"""
body {{
    background-color: #f8f9fb;
    font-family: 'Inter', sans-serif;
}}
h1, h2, h3 {{
    color: {uiuc_blue};
    text-align: center;
}}
button {{
    background-color: {uiuc_orange} !important;
    color: white !important;
    border-radius: 10px !important;
    font-weight: 600 !important;
}}
textarea {{
    border: 2px solid {uiuc_blue} !important;
    border-radius: 10px !important;
}}
""") as demo:
    gr.Markdown(f"""
    <div style='text-align:center;'>
        <h1 style='color:{uiuc_blue}; font-size:30px; font-weight:800;'>🎓 UIUC Assistant</h1>
        <p style='color:{uiuc_orange}; font-size:16px; font-weight:500;'>Grounded RAG Chatbot (No Hallucination Mode)</p>
    </div>
    """)

    chatbot = gr.Chatbot(label="Chat with your UIUC Assistant 🤖")

    msg = gr.Textbox(
        label="Ask me about ISSS, Courses, or Campus Life!",
        placeholder="e.g., How can I apply for OPT? or Where is ISR Dining?",
        lines=2
    )

    send = gr.Button("Send")

    def respond(message, chat_history):
        if chat_history is None:
            chat_history = []
        bot_reply = chat_with_bot(message)
        chat_history.append((message, bot_reply))
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    send.click(respond, [msg, chatbot], [msg, chatbot])

demo.launch(share=True)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

ValueError: Can't find 'adapter_config.json' at '/content/uiuc-qlora-chatbot'