# ============================
# 🔹 1. Install dependencies
# ============================

In [1]:
!pip install -U datasets transformers sentence-transformers faiss-cpu streamlit evaluate pyngrok tqdm


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# ============================
# 🔹 2. Imports & Setup
# ============================

In [2]:
import os, re, random, torch
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

# ============================
# 🔹 3. Prepare Data (TweetEval: emotion)
# ============================

In [3]:
def clean_tweet(text):
    if text is None: return ""
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [4]:
os.makedirs("data", exist_ok=True)
ds = load_dataset("cardiffnlp/tweet_eval", "emotion")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/105k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/28.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

In [5]:
for split in ["train", "validation", "test"]:
    records = []
    for item in tqdm(ds[split]):
        text = clean_tweet(item["text"])
        label = item["label"]
        label_name = ds[split].features["label"].names[label]
        records.append({"text": text, "label": label, "label_name": label_name})
    df = pd.DataFrame(records)
    df.to_csv(f"data/{split}.csv", index=False)
    print(f"{split} size = {len(df)}")

  0%|          | 0/3257 [00:00<?, ?it/s]

train size = 3257


  0%|          | 0/374 [00:00<?, ?it/s]

validation size = 374


  0%|          | 0/1421 [00:00<?, ?it/s]

test size = 1421


In [6]:
# write corpus = knowledge base (just texts)
train_df = pd.read_csv("data/train.csv")
with open("data/kb.txt", "w", encoding="utf-8") as f:
    for t in train_df["text"].dropna().tolist():
        f.write(t.replace("\n", " ") + "\n")


In [7]:
# few-shot examples
fewshot_examples = train_df.sample(3, random_state=42)[["text","label_name"]].values.tolist()
print("Few-shot examples:", fewshot_examples)


Few-shot examples: [['Cuz even the bible talks about the son coming back with a fiery sword he got from his mother. They just called her a whore in revelations', 'anger'], ['Need advice on how to get out of this rut!!!! needmotivation', 'optimism'], ["Don't ask, you don't get. Apologies if I've offended you. All due respect Alan, I think you've been fed duff info.", 'anger']]


# ============================
# 🔹 4. Define RAG Explainer (with few-shot)
# ============================

In [8]:
from sentence_transformers import SentenceTransformer
import faiss, numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer as HFAutoTokenizer


In [9]:
class RAGExplainer:
    def __init__(self, kb_path="data/kb.txt",
                 embed_model="all-MiniLM-L6-v2",
                 gen_model="distilgpt2",
                 fewshot=None):
        self.kb = [line.strip() for line in open(kb_path, encoding="utf-8") if line.strip()]
        self.embedder = SentenceTransformer(embed_model)
        self.embeddings = self.embedder.encode(self.kb, convert_to_numpy=True, show_progress_bar=True)
        d = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(d)
        self.index.add(self.embeddings)
        self.tokenizer = HFAutoTokenizer.from_pretrained(gen_model)
        self.generator = AutoModelForCausalLM.from_pretrained(gen_model).to("cpu")
        self.fewshot = fewshot or []

    def retrieve(self, query, k=3):
        q_emb = self.embedder.encode([query], convert_to_numpy=True)
        D, I = self.index.search(q_emb, k)
        return [self.kb[i] for i in I[0]]

    def build_prompt(self, tweet, retrieved):
        prompt = "You are an assistant labeling tweet emotions (joy, sadness, anger, love, surprise, fear).\n\n"
        if self.fewshot:
            prompt += "Here are some examples:\n"
            for t, l in self.fewshot:
                prompt += f"Tweet: {t}\nLabel: {l}\nExplanation: This expresses {l}.\n\n"
        prompt += "Context (similar tweets):\n"
        for r in retrieved:
            prompt += f"- {r}\n"
        prompt += f"\nNow analyze this tweet:\nTweet: {tweet}\nLabel:"
        return prompt

    def explain(self, tweet, k=3):
        retrieved = self.retrieve(tweet, k)
        prompt = self.build_prompt(tweet, retrieved)
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.generator.generate(**inputs, max_new_tokens=80, do_sample=True, top_k=50)
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = generated.replace(prompt, "").strip()
        return answer, retrieved

In [10]:
%%writefile rag_explainer.py
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

class RAGExplainer:
    def __init__(self, kb_path="data/kb.txt",
                 embed_model="all-MiniLM-L6-v2",
                 gen_model="distilgpt2",
                 fewshot=None):
        self.kb = [line.strip() for line in open(kb_path, encoding="utf-8") if line.strip()]
        self.embedder = SentenceTransformer(embed_model)
        self.embeddings = self.embedder.encode(self.kb, convert_to_numpy=True, show_progress_bar=True)
        d = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(d)
        self.index.add(self.embeddings)
        self.tokenizer = AutoTokenizer.from_pretrained(gen_model)
        self.generator = AutoModelForCausalLM.from_pretrained(gen_model).to("cpu")
        self.fewshot = fewshot or []

    def retrieve(self, query, k=3):
        q_emb = self.embedder.encode([query], convert_to_numpy=True)
        D, I = self.index.search(q_emb, k)
        return [self.kb[i] for i in I[0]]

    def build_prompt(self, tweet, retrieved):
        prompt = "You are an assistant labeling tweet emotions (joy, sadness, anger, love, surprise, fear).\n\n"
        if self.fewshot:
            prompt += "Here are some examples:\n"
            for t, l in self.fewshot:
                prompt += f"Tweet: {t}\nLabel: {l}\nExplanation: This expresses {l}.\n\n"
        prompt += "Context (similar tweets):\n"
        for r in retrieved:
            prompt += f"- {r}\n"
        prompt += f"\nNow analyze this tweet:\nTweet: {tweet}\nLabel:"
        return prompt

    def explain(self, tweet, k=3):
        retrieved = self.retrieve(tweet, k)
        prompt = self.build_prompt(tweet, retrieved)
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.generator.generate(**inputs, max_new_tokens=80, do_sample=True, top_k=50)
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = generated.replace(prompt, "").strip()
        return answer, retrieved


Writing rag_explainer.py


In [11]:
explainer = RAGExplainer(fewshot=fewshot_examples)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/102 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# ============================
# 🔹 4. Load Pretrained Classifier (NO TRAINING)
# ============================

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [13]:
MODEL_NAME = "bhadresh-savani/distilbert-base-uncased-emotion"
OUT_DIR = "distilbert_tweet_emotion"

In [14]:
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
clf = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [15]:
# ============================
# 🔹 5. Test on 10 samples
# ============================

In [16]:
test_df = pd.read_csv("data/test.csv")
sample = test_df.sample(10, random_state=42)


In [17]:
def classify(text):
    inputs = tok(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        logits = clf(**inputs).logits
    pred = int(torch.argmax(logits, dim=-1).cpu().numpy()[0])
    return pred, clf.config.id2label[pred]

In [18]:
correct = 0
for i, row in sample.iterrows():
    text = row["text"]
    pred, label = classify(text)
    exp, docs = explainer.explain(text)
    print(f"\nTweet: {text}")
    print(f"True: {row['label_name']} | Pred: {label}")
    print(f"Few-shot + RAG Answer: {exp}")
    print(f"Retrieved docs: {docs[:2]}")
    if label == row["label_name"]:
        correct += 1

print(f"\nAccuracy on 10 samples = {correct}/10")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: There are parts of you that wants the sadness. Find them out, ask them why
True: sadness | Pred: anger
Few-shot + RAG Answer: despair
explanation: This expresses anger but does not hurt as you try to focus on them. It doesn't hurt as well, but it does allow you to feel like one's emotions - anger, sadness
- I want you to give some credit for my work. If you need help then maybe it's you.
Label: sadness
Explanation: You know who you are
Retrieved docs: ['Life is too short so dont shoot it in with worries sadness and grief.', "I think sadness is felt very strongly physically and mentally. It feels like it takes over and it's hard to focus at work MHChat"]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: It ruins my frigging night each night at 9pm. Mrs loves it, i've been early to bed for a month.
True: anger | Pred: love
Few-shot + RAG Answer: hope
Explanation: This expresses angst.
Tweet: We stay in this bad light as our day goes on.
Now analyze this tweet:
Tweet: It ruins your frigging night every night at 9pm. Mrs loves it, i've been early to bed for a month.
Now analyze this tweet:
Tweet: It ruins my frigging night every night at 9pm
Retrieved docs: ['We stayed up all night long\\nMade our drinks too strong\\nFeeling ten feet tall\\nRopes swinging into the water\\nIn the middle of the night', "Watched the movie, friend request at 2am awhile ago in a dark cold night and it was one of the bad choices I've ever made. nightmare 😰"]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: i must say, the amount of low-level fighters and gatekeepers running their mouths is ridiculous. jealousy ufc jonjones
True: anger | Pred: anger
Few-shot + RAG Answer: anger
Explanation: That's why you should put up a banner that says "FOUR years of good times for a country!"
You should show up, stop calling attention to what people say to you. No matter what you say or don't, no matter what it may imply to you, nobody cares.
You should show up, stop calling attention to what people say to you
Retrieved docs: ["he'll defend his belt against aldo after its not that hard u grudge holding bitch", 'Luis Ortiz ducked by Ustinov which means fights off, and he left future not looking to bright for Ortiz boxing']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: I need to get out of this little funk so I can write!! writing funk writerslife depressed
True: sadness | Pred: sadness
Few-shot + RAG Answer: emotion
Explanation: This expresses anger.
As a writer I don't often think about life or thinking about it again. I would rather be writing all positive and positive. I would rather have my mind and my feelings open and honest.
I like music...
Retrieved docs: ['Need advice on how to get out of this rut!!!! depressing needmotivation', "Honestly don't know why I'm so unhappy most of the time. I just want it all to stop :( unhappy depression itnevergoes"]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: Does anybody mom annoy them just by talking and she probably not even trying to annoy you 😂
True: anger | Pred: anger
Few-shot + RAG Answer: anger
Explanation: This expresses anger.
Tweet: I don't understand how it is. Have your mom make a stupid life decision because they want to hurt you but I can't deal with that now they are calling you "happily" because I really don't understand.
(But what does this mean for me to understand it?)
(I see this in a tweet
Retrieved docs: ["I don't get how people can leave their phone on don't disturb all day...does your mom not threaten you when you don't respond within seconds", 'my momma irritate me asking all these questions like gone 😤']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: I'll be cheering you on from the bench
True: joy | Pred: joy
Few-shot + RAG Answer: anxiety
Explanation: This expresses anger.
Tweet: Don't ask, you don't get. Apologies if I've offended you. All due respect Alan, I think you've been fed duff info.
Look at the comments on below.
I have never been an employee of one. It's just my job. Look at you and ask for advice on how to get
Retrieved docs: ['cheering for and', 'cheer up☺️']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: Hating this already 😟
True: anger | Pred: anger
Few-shot + RAG Answer: anger
Explanation: This expresses anger.
Tweets: a great comment
Tweet: I'm gonna get on a phone (for the sake of peace 😔)
- the phone is gonna be okay 😎
This tweet was created just as a response to a tweet from @CrazyJitch (not an insult, I'm sorry).
Retrieved docs: ['Im so angry 😂🙃', 'cheer up chuck😘']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: Duty calls. 😧
True: anger | Pred: anger
Few-shot + RAG Answer: anger
explanation: Please help me. Go!
- I should be back within the month or something. This is an email I need that I can't be fired at because he calls me a whore and blames me on my child being a whore.
- He wants to know if it works out, if he doesn't go on to do any hard work. And I need to
Retrieved docs: ['shocking service for your call centre staff this evening. Transfer me and cut me off after waiting forever to speak to someone.', 'No sober weekend 🙂🙂🙂']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: Just a side thought, everything takes time and effort ... annoyed
True: anger | Pred: anger
Few-shot + RAG Answer: anger
This expression is different from words you're used to:
This expression is different than words you're used to:
You're not upset, no matter what happened, because the thought will change.
Now analyze this tweet:
Tweet: You could think this is true, but the thought will change at the moment.
The last words of note (that's the second, but
Retrieved docs: ["Okay you've annoyed me, you haven't done a good job there at all. furious", "Some questions you get on Twitter make you want to despair. We've been so battered. We complain but aren't convinced things could be better."]

Tweet: Few things more frustrating that organisations who don't have media contact numbers and request you 'fill out our online form' rage
True: anger | Pred: anger
Few-shot + RAG Answer: rage
Explanation: This expresses anger.
Tweet: You're on the way, do no harm with this tweet. Why do you 

# ============================
# 🔹 6. Streamlit App
# ============================

In [19]:
%%writefile app.py
import streamlit as st
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from rag_explainer import RAGExplainer

# ===============================
# Settings
# ===============================
MODEL_NAME = "bhadresh-savani/distilbert-base-uncased-emotion"
DATA_DIR = "data"

# ===============================
# Load Model (cached)
# ===============================
@st.cache_resource
def load_model():
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    mdl = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    return tok, mdl

tok, clf = load_model()

# ===============================
# Load Few-shot Data
# ===============================
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
fewshot_examples = train_df.sample(3, random_state=42)[["text","label_name"]].values.tolist()
explainer = RAGExplainer(fewshot=fewshot_examples)

# ===============================
# Streamlit UI
# ===============================
st.set_page_config(
    page_title="TweetEval Emotion + RAG Demo",
    page_icon="💡",
    layout="wide"
)

# Sidebar
st.sidebar.image("https://cdn-icons-png.flaticon.com/512/889/889111.png", width=80)
st.sidebar.title("⚙️ Settings")
st.sidebar.markdown("Play with the RAG-powered emotion classifier.")
k = st.sidebar.slider("Number of retrieved docs", 1, 5, 3)

# Main Title
st.title("💬 TweetEval Emotion Classifier + RAG")
st.write("This demo combines **DistilBERT emotion classification** with **RAG explanations**.")

# Input Box
tweet = st.text_area("✍️ Enter a tweet for analysis:", height=120)

if st.button("🔍 Analyze"):
    if tweet.strip() == "":
        st.warning("Please enter some text first.")
    else:
        # Run classifier
        inputs = tok(tweet, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            logits = clf(**inputs).logits
        pred = int(torch.argmax(logits, dim=-1).cpu().numpy()[0])
        label = clf.config.id2label[pred]

        # Run RAG explainer
        exp, docs = explainer.explain(tweet, k=k)

        # ===============================
        # Results Layout
        # ===============================
        col1, col2 = st.columns([2, 3])

        with col1:
            st.markdown(
                f"""
                <div style="background-color:#f0f9ff;
                            padding:20px; border-radius:12px;
                            box-shadow:0 0 8px rgba(0,0,0,0.1);">
                <h3 style="color:#0b5394;">Prediction</h3>
                <p style="font-size:22px; font-weight:bold; color:#333;">{label}</p>
                </div>
                """,
                unsafe_allow_html=True
            )

        with col2:
            st.markdown(
                f"""
                <div style="background-color:#fff7e6;
                            padding:20px; border-radius:12px;
                            box-shadow:0 0 8px rgba(0,0,0,0.1);">
                <h3 style="color:#b45f06;">Explanation</h3>
                <p style="font-size:16px; color:#444;">{exp}</p>
                </div>
                """,
                unsafe_allow_html=True
            )

        # Retrieved Documents
        st.write("### 📚 Retrieved Documents")
        for i, d in enumerate(docs, 1):
            with st.expander(f"Document {i}"):
                st.write(d)

# Footer
st.markdown("---")
st.markdown(
    "<center><sub>⚡ Powered by DistilBERT + RAG | Designed with Streamlit</sub></center>",
    unsafe_allow_html=True
)


Writing app.py


In [20]:
from pyngrok import ngrok


In [24]:
# 🔑 Configure your ngrok token (run once)
NGROK_AUTH_TOKEN = "32TbcyNOwobHCKnVgfhR54H0yuQ_yXh7Z9eCpHBDivLN9TVP"

from pyngrok import ngrok
ngrok.set_auth_token(NGROK_AUTH_TOKEN)


In [25]:
# 🚫 Kill any old Streamlit / ngrok processes
!pkill streamlit || echo "No old streamlit running"
!pkill ngrok || echo "No old ngrok running"

# ▶️ Start Streamlit app fresh on port 8501
!streamlit run app.py --server.port 8501 &>/dev/null&

# 🌍 Connect ngrok to port 8501
from pyngrok import ngrok
public_url = ngrok.connect(8501)
print("✅ Streamlit running at:", public_url)


No old ngrok running
✅ Streamlit running at: NgrokTunnel: "https://82acf7604395.ngrok-free.app" -> "http://localhost:8501"
