# ============================
# 🔹 1. Install dependencies
# ============================

In [3]:
!pip install -U datasets transformers sentence-transformers faiss-cpu streamlit evaluate pyngrok tqdm




# ============================
# 🔹 2. Imports & Setup
# ============================

In [29]:
import os, re, random, torch
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

# ============================
# 🔹 3. Prepare Data (TweetEval: emotion)
# ============================

In [30]:
def clean_tweet(text):
    """Strip URLs, @mentions and '#' characters from a tweet and tidy whitespace.

    Returns "" when ``text`` is None; otherwise the cleaned string with every
    run of whitespace collapsed to a single space and both ends trimmed.
    """
    if text is None:
        return ""
    # Applied in order: links, user mentions, hash signs (the tag word itself
    # is kept), and finally whitespace runs.
    substitutions = (
        (r"http\S+|www\.\S+", ""),
        (r"@\w+", ""),
        (r"#", ""),
        (r"\s+", " "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [31]:
# Create the output folder for the generated CSV / knowledge-base files if it is missing.
os.makedirs("data", exist_ok=True)
# Load the "emotion" configuration of the cardiffnlp/tweet_eval dataset.
ds = load_dataset("cardiffnlp/tweet_eval", "emotion")

In [32]:
# Export every split to data/<split>.csv with the cleaned text, the integer
# label and the human-readable label name.
for split in ["train", "validation", "test"]:
    split_ds = ds[split]
    # The id -> name table is identical for every row, so look it up once per
    # split instead of once per tweet.
    label_names = split_ds.features["label"].names
    records = []
    for item in tqdm(split_ds):
        label = item["label"]
        records.append({
            "text": clean_tweet(item["text"]),
            "label": label,
            "label_name": label_names[label],
        })
    df = pd.DataFrame(records)
    df.to_csv(f"data/{split}.csv", index=False)
    print(f"{split} size = {len(df)}")

  0%|          | 0/3257 [00:00<?, ?it/s]

train size = 3257


  0%|          | 0/374 [00:00<?, ?it/s]

validation size = 374


  0%|          | 0/1421 [00:00<?, ?it/s]

test size = 1421


In [33]:
# Build the retrieval knowledge base: one training tweet per line (text only).
train_df = pd.read_csv("data/train.csv")
# Rows whose text is missing are dropped; embedded newlines become spaces so
# that every tweet occupies exactly one line of the file.
kb_lines = [tweet.replace("\n", " ") + "\n" for tweet in train_df["text"].dropna().tolist()]
with open("data/kb.txt", "w", encoding="utf-8") as kb_file:
    kb_file.writelines(kb_lines)


In [34]:
# Draw three labelled training tweets (fixed seed) to serve as few-shot examples;
# each example is a [text, label_name] pair.
fewshot_rows = train_df.sample(3, random_state=42)
fewshot_examples = fewshot_rows[["text","label_name"]].values.tolist()
print("Few-shot examples:", fewshot_examples)


Few-shot examples: [['Cuz even the bible talks about the son coming back with a fiery sword he got from his mother. They just called her a whore in revelations', 'anger'], ['Need advice on how to get out of this rut!!!! needmotivation', 'optimism'], ["Don't ask, you don't get. Apologies if I've offended you. All due respect Alan, I think you've been fed duff info.", 'anger']]


# ============================
# 🔹 4. Define RAG Explainer (with few-shot)
# ============================

In [35]:
from sentence_transformers import SentenceTransformer
import faiss, numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer as HFAutoTokenizer


In [36]:
class RAGExplainer:
    """Retrieve similar knowledge-base tweets and prompt a causal LM for a label and explanation.

    The knowledge base is a text file with one entry per line. Every entry is
    embedded with a SentenceTransformer model and stored in a flat L2 FAISS
    index; a Hugging Face causal language model (on CPU) generates the answer.
    """

    def __init__(self, kb_path="data/kb.txt",
                 embed_model="all-MiniLM-L6-v2",
                 gen_model="distilgpt2",
                 fewshot=None):
        """Load the knowledge base, build the FAISS index and load the generator.

        Args:
            kb_path: UTF-8 text file with one knowledge-base entry per line;
                blank lines are skipped.
            embed_model: SentenceTransformer model name used for entries and queries.
            gen_model: Causal LM checkpoint name used for generation.
            fewshot: Optional iterable of (tweet, label) pairs shown as examples
                in the prompt.

        Raises:
            ValueError: If the knowledge-base file has no non-blank lines.
        """
        # Context manager so the file handle is closed deterministically.
        with open(kb_path, encoding="utf-8") as kb_file:
            self.kb = [line.strip() for line in kb_file if line.strip()]
        if not self.kb:
            # Fail early with a clear message instead of an IndexError on shape[1] below.
            raise ValueError(f"Knowledge base {kb_path!r} contains no entries")
        self.embedder = SentenceTransformer(embed_model)
        self.embeddings = self.embedder.encode(self.kb, convert_to_numpy=True, show_progress_bar=True)
        d = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(d)
        self.index.add(self.embeddings)
        self.tokenizer = HFAutoTokenizer.from_pretrained(gen_model)
        self.generator = AutoModelForCausalLM.from_pretrained(gen_model).to("cpu")
        self.fewshot = fewshot or []

    def retrieve(self, query, k=3):
        """Return up to ``k`` knowledge-base entries nearest to ``query`` (closest first)."""
        # Never ask the index for more neighbours than it holds.
        k = min(k, len(self.kb))
        if k <= 0:
            return []
        q_emb = self.embedder.encode([query], convert_to_numpy=True)
        _, neighbor_ids = self.index.search(q_emb, k)
        # FAISS pads missing neighbours with -1; without the filter that would
        # silently select the last knowledge-base entry.
        return [self.kb[i] for i in neighbor_ids[0] if i >= 0]

    def build_prompt(self, tweet, retrieved):
        """Assemble the generation prompt: instruction, few-shot examples, context, target tweet."""
        # NOTE(review): the label list below is fixed; few-shot labels supplied by
        # the caller are not checked against it — confirm they are meant to match.
        parts = ["You are an assistant labeling tweet emotions (joy, sadness, anger, love, surprise, fear).\n\n"]
        if self.fewshot:
            parts.append("Here are some examples:\n")
            for example_text, example_label in self.fewshot:
                parts.append(
                    f"Tweet: {example_text}\nLabel: {example_label}\n"
                    f"Explanation: This expresses {example_label}.\n\n"
                )
        parts.append("Context (similar tweets):\n")
        parts.extend(f"- {doc}\n" for doc in retrieved)
        # The prompt ends on "Label:" so the model continues with the label.
        parts.append(f"\nNow analyze this tweet:\nTweet: {tweet}\nLabel:")
        return "".join(parts)

    def explain(self, tweet, k=3):
        """Generate a label/explanation continuation for ``tweet``.

        Returns:
            ``(answer, retrieved)``: the generated text without the prompt, and
            the list of knowledge-base entries placed in the prompt.
        """
        retrieved = self.retrieve(tweet, k)
        prompt = self.build_prompt(tweet, retrieved)
        inputs = self.tokenizer(prompt, return_tensors="pt")
        prompt_len = inputs["input_ids"].shape[1]
        outputs = self.generator.generate(
            **inputs,
            max_new_tokens=80,
            do_sample=True,
            top_k=50,
            # Passed explicitly so generate() does not log a pad_token_id
            # warning on every call.
            pad_token_id=self.tokenizer.eos_token_id,
        )
        # The output sequence starts with the prompt tokens; slice them off
        # rather than string-replacing the prompt, which fails whenever decoding
        # does not reproduce the prompt text exactly.
        answer = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
        return answer, retrieved

In [49]:
%%writefile rag_explainer.py
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

class RAGExplainer:
    """Retrieve similar knowledge-base tweets and prompt a causal LM for a label and explanation.

    The knowledge base is a text file with one entry per line. Every entry is
    embedded with a SentenceTransformer model and stored in a flat L2 FAISS
    index; a Hugging Face causal language model (on CPU) generates the answer.
    """

    def __init__(self, kb_path="data/kb.txt",
                 embed_model="all-MiniLM-L6-v2",
                 gen_model="distilgpt2",
                 fewshot=None):
        """Load the knowledge base, build the FAISS index and load the generator.

        Args:
            kb_path: UTF-8 text file with one knowledge-base entry per line;
                blank lines are skipped.
            embed_model: SentenceTransformer model name used for entries and queries.
            gen_model: Causal LM checkpoint name used for generation.
            fewshot: Optional iterable of (tweet, label) pairs shown as examples
                in the prompt.

        Raises:
            ValueError: If the knowledge-base file has no non-blank lines.
        """
        # Context manager so the file handle is closed deterministically.
        with open(kb_path, encoding="utf-8") as kb_file:
            self.kb = [line.strip() for line in kb_file if line.strip()]
        if not self.kb:
            # Fail early with a clear message instead of an IndexError on shape[1] below.
            raise ValueError(f"Knowledge base {kb_path!r} contains no entries")
        self.embedder = SentenceTransformer(embed_model)
        self.embeddings = self.embedder.encode(self.kb, convert_to_numpy=True, show_progress_bar=True)
        d = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(d)
        self.index.add(self.embeddings)
        self.tokenizer = AutoTokenizer.from_pretrained(gen_model)
        self.generator = AutoModelForCausalLM.from_pretrained(gen_model).to("cpu")
        self.fewshot = fewshot or []

    def retrieve(self, query, k=3):
        """Return up to ``k`` knowledge-base entries nearest to ``query`` (closest first)."""
        # Never ask the index for more neighbours than it holds.
        k = min(k, len(self.kb))
        if k <= 0:
            return []
        q_emb = self.embedder.encode([query], convert_to_numpy=True)
        _, neighbor_ids = self.index.search(q_emb, k)
        # FAISS pads missing neighbours with -1; without the filter that would
        # silently select the last knowledge-base entry.
        return [self.kb[i] for i in neighbor_ids[0] if i >= 0]

    def build_prompt(self, tweet, retrieved):
        """Assemble the generation prompt: instruction, few-shot examples, context, target tweet."""
        # NOTE(review): the label list below is fixed; few-shot labels supplied by
        # the caller are not checked against it — confirm they are meant to match.
        parts = ["You are an assistant labeling tweet emotions (joy, sadness, anger, love, surprise, fear).\n\n"]
        if self.fewshot:
            parts.append("Here are some examples:\n")
            for example_text, example_label in self.fewshot:
                parts.append(
                    f"Tweet: {example_text}\nLabel: {example_label}\n"
                    f"Explanation: This expresses {example_label}.\n\n"
                )
        parts.append("Context (similar tweets):\n")
        parts.extend(f"- {doc}\n" for doc in retrieved)
        # The prompt ends on "Label:" so the model continues with the label.
        parts.append(f"\nNow analyze this tweet:\nTweet: {tweet}\nLabel:")
        return "".join(parts)

    def explain(self, tweet, k=3):
        """Generate a label/explanation continuation for ``tweet``.

        Returns:
            ``(answer, retrieved)``: the generated text without the prompt, and
            the list of knowledge-base entries placed in the prompt.
        """
        retrieved = self.retrieve(tweet, k)
        prompt = self.build_prompt(tweet, retrieved)
        inputs = self.tokenizer(prompt, return_tensors="pt")
        prompt_len = inputs["input_ids"].shape[1]
        outputs = self.generator.generate(
            **inputs,
            max_new_tokens=80,
            do_sample=True,
            top_k=50,
            # Passed explicitly so generate() does not log a pad_token_id
            # warning on every call.
            pad_token_id=self.tokenizer.eos_token_id,
        )
        # The output sequence starts with the prompt tokens; slice them off
        # rather than string-replacing the prompt, which fails whenever decoding
        # does not reproduce the prompt text exactly.
        answer = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
        return answer, retrieved


Writing rag_explainer.py


In [37]:
# Build the explainer with its default knowledge base (data/kb.txt) and default
# models, passing the sampled few-shot examples for the prompt.
explainer = RAGExplainer(fewshot=fewshot_examples)


Batches:   0%|          | 0/102 [00:00<?, ?it/s]

# ============================
# 🔹 4. Load Pretrained Classifier (NO TRAINING)
# ============================

In [38]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [40]:
# Hugging Face checkpoint of the pretrained emotion classifier (loaded as-is, no training here).
MODEL_NAME = "bhadresh-savani/distilbert-base-uncased-emotion"
# NOTE(review): OUT_DIR is not referenced anywhere in the visible notebook — confirm it is still needed.
OUT_DIR = "distilbert_tweet_emotion"

In [41]:
# Load the tokenizer and the sequence-classification model for MODEL_NAME.
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
clf = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [42]:
# ============================
# 🔹 5. Test on 10 samples
# ============================

In [43]:
# Evaluate on a fixed random subset of 10 test tweets (seeded so the same rows are drawn each run).
test_df = pd.read_csv("data/test.csv")
sample = test_df.sample(10, random_state=42)


In [44]:
def classify(text):
    """Predict the emotion class of one text with the pretrained classifier.

    Returns a ``(class_id, class_name)`` tuple; the name is looked up in
    ``clf.config.id2label``. Input is truncated to 128 tokens.
    """
    encoded = tok(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        scores = clf(**encoded).logits
    # Highest-scoring class of the first (only) row, as a plain Python int.
    best = torch.argmax(scores, dim=-1)
    class_id = int(best[0].item())
    return class_id, clf.config.id2label[class_id]

In [45]:
# Classify every sampled tweet, generate a RAG explanation for it and count how
# often the predicted label string equals the dataset's label name.
# NOTE(review): the classifier's label set appears to differ from the dataset's
# label_name values (the run shows "love" predicted, and "optimism" occurs in the
# data), so some rows can never match — confirm whether a label mapping is needed.
correct = 0
for _, row in sample.iterrows():
    text = row["text"]
    pred, label = classify(text)
    exp, docs = explainer.explain(text)
    print(f"\nTweet: {text}")
    print(f"True: {row['label_name']} | Pred: {label}")
    print(f"Few-shot + RAG Answer: {exp}")
    # Only the first two retrieved entries are shown to keep the output short.
    print(f"Retrieved docs: {docs[:2]}")
    if label == row["label_name"]:
        correct += 1

# Report against the actual sample size instead of a hard-coded 10, so the
# message stays correct if the sample size changes.
total = len(sample)
print(f"\nAccuracy on {total} samples = {correct}/{total}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: There are parts of you that wants the sadness. Find them out, ask them why
True: sadness | Pred: anger
Few-shot + RAG Answer: anger
Explanation: One part of your heart gets "titled like," you want to talk. Now it's time to talk - that's more than you want to do.
Do you want to change your view of our world? Let us know in the comments!
Here we go.
Retrieved docs: ['Life is too short so dont shoot it in with worries sadness and grief.', "I think sadness is felt very strongly physically and mentally. It feels like it takes over and it's hard to focus at work MHChat"]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: It ruins my frigging night each night at 9pm. Mrs loves it, i've been early to bed for a month.
True: anger | Pred: love
Few-shot + RAG Answer: anger
Explanation: That's all I'll say here that you have.
Do not listen to all these tweets and hope your son never uses it.
Im going to read up here before you go home.
Do not say anything like this that's going to change.
I'm sick and tired and not knowing I'll ever feel the same way.
Retrieved docs: ['We stayed up all night long\\nMade our drinks too strong\\nFeeling ten feet tall\\nRopes swinging into the water\\nIn the middle of the night', "Watched the movie, friend request at 2am awhile ago in a dark cold night and it was one of the bad choices I've ever made. nightmare 😰"]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: i must say, the amount of low-level fighters and gatekeepers running their mouths is ridiculous. jealousy ufc jonjones
True: anger | Pred: anger
Few-shot + RAG Answer: rage
Explanation: I must say, the amount of low-level fighters and gatekeepers running their mouths is ridiculous. jealousy ufc jonjones
Now analyze this tweet:
Tweet: if I've offended you I can help you out.
Let me know what you think and share your thoughts on the issue
[source: #curseful_s.nz
Retrieved docs: ["he'll defend his belt against aldo after its not that hard u grudge holding bitch", 'Luis Ortiz ducked by Ustinov which means fights off, and he left future not looking to bright for Ortiz boxing']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: I need to get out of this little funk so I can write!! writing funk writerslife depressed
True: sadness | Pred: sadness
Few-shot + RAG Answer: anxiety
Explanation: This expresses anger.
Tweet: Need advice on how to get out of this rut!!!! angry needmotivation
- What is this? how can I get out of this rut!!!! depressing needmotivation
-
I'm just so happy I did and then that's how my life life is going. The idea is just great. life gets better and
Retrieved docs: ['Need advice on how to get out of this rut!!!! depressing needmotivation', "Honestly don't know why I'm so unhappy most of the time. I just want it all to stop :( unhappy depression itnevergoes"]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: Does anybody mom annoy them just by talking and she probably not even trying to annoy you 😂
True: anger | Pred: anger
Few-shot + RAG Answer: anger
explanation, this is the most basic. It's simple. I think it's not about what's happening to your life. It's about what you should do to your own life. Think, what can you do to your own life? You should do it. A big lesson in the world of social communication is to follow the rules of engagement. It's really not about anything
Retrieved docs: ["I don't get how people can leave their phone on don't disturb all day...does your mom not threaten you when you don't respond within seconds", 'my momma irritate me asking all these questions like gone 😤']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: I'll be cheering you on from the bench
True: joy | Pred: joy
Few-shot + RAG Answer: anger
Explanation: this expresses anger.
Please add:
- cheering/depression
- cheering/depression
- cheering/depression
- cheering/depression
- cheering/depression
- cheering/depression
- cheering/depression
Again, if you're an idiot and want to express your feelings on another tweet, take a look at this first
Retrieved docs: ['cheering for and', 'cheer up☺️']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: Hating this already 😟
True: anger | Pred: anger
Few-shot + RAG Answer: anger
Explanation: This expresses anger.
#You are trying to understand what is to "pimp me by making me cry"
Label: anger
Explanation: This expresses anger.
So while I love you, I'm a slave and you needn't want to lose me if I're trying to understand, if I'm trying to be too upset to be with
Retrieved docs: ['Im so angry 😂🙃', 'cheer up chuck😘']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: Duty calls. 😧
True: anger | Pred: anger
Few-shot + RAG Answer: anger
Explanation: This expresses anger.
Tweet: Pay attention to the words so you can get the better of me and the better of you.
Let me say it again - take care of yourself and have a very good week together.
Advertisements
Retrieved docs: ['shocking service for your call centre staff this evening. Transfer me and cut me off after waiting forever to speak to someone.', 'No sober weekend 🙂🙂🙂']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Tweet: Just a side thought, everything takes time and effort ... annoyed
True: anger | Pred: anger
Few-shot + RAG Answer: anger
Explanation: This expresses anger.
Talk about this.
Listen to it. It's important to talk to people. Why not go on a tangent about the boy having to play with his mother, or even talk about what happens to his mom?
Label: anger
Explanation: This expresses anger.
Talk about this. And I guess why not go on
Retrieved docs: ["Okay you've annoyed me, you haven't done a good job there at all. furious", "Some questions you get on Twitter make you want to despair. We've been so battered. We complain but aren't convinced things could be better."]

Tweet: Few things more frustrating that organisations who don't have media contact numbers and request you 'fill out our online form' rage
True: anger | Pred: anger
Few-shot + RAG Answer: rage to show disrespect
Extension (similar tweets):
- A new account in Twitter that is now open to all questions. They'll offer you an easy

# ============================
# 🔹 6. Streamlit App
# ============================

In [55]:
%%writefile app.py
import html

import streamlit as st
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from rag_explainer import RAGExplainer

# ===============================
# Settings
# ===============================
MODEL_NAME = "bhadresh-savani/distilbert-base-uncased-emotion"
DATA_DIR = "data"

# ===============================
# Page config
# ===============================
# Issued before any other Streamlit call: set_page_config is documented as
# having to be the first Streamlit command of the page.
st.set_page_config(
    page_title="TweetEval Emotion + RAG Demo",
    page_icon="💡",
    layout="wide"
)

# ===============================
# Load Model (cached)
# ===============================
@st.cache_resource
def load_model():
    """Load the classifier tokenizer and model once per server process."""
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    mdl = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    return tok, mdl

# ===============================
# Load Few-shot Data + Explainer (cached)
# ===============================
@st.cache_resource
def load_explainer():
    """Build the RAG explainer once per server process.

    Streamlit re-executes this script on every interaction; without caching the
    knowledge base would be re-embedded and the generator reloaded on each click.
    """
    train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
    fewshot_examples = train_df.sample(3, random_state=42)[["text","label_name"]].values.tolist()
    return RAGExplainer(fewshot=fewshot_examples)

tok, clf = load_model()
explainer = load_explainer()

# ===============================
# Streamlit UI
# ===============================

# Sidebar
st.sidebar.image("https://cdn-icons-png.flaticon.com/512/889/889111.png", width=80)
st.sidebar.title("⚙️ Settings")
st.sidebar.markdown("Play with the RAG-powered emotion classifier.")
k = st.sidebar.slider("Number of retrieved docs", 1, 5, 3)

# Main Title
st.title("💬 TweetEval Emotion Classifier + RAG")
st.write("This demo combines **DistilBERT emotion classification** with **RAG explanations**.")

# Input Box
tweet = st.text_area("✍️ Enter a tweet for analysis:", height=120)

if st.button("🔍 Analyze"):
    if not tweet.strip():
        st.warning("Please enter some text first.")
    else:
        # Run classifier
        inputs = tok(tweet, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            logits = clf(**inputs).logits
        pred = int(torch.argmax(logits, dim=-1).cpu().numpy()[0])
        label = clf.config.id2label[pred]

        # Run RAG explainer
        exp, docs = explainer.explain(tweet, k=k)

        # Both values are interpolated into raw HTML below. The explanation is
        # free-form model output, so escape it; newlines become <br> so the
        # multi-line text stays inside the HTML snippet.
        safe_label = html.escape(str(label))
        safe_exp = html.escape(exp).replace("\n", "<br>")

        # ===============================
        # Results Layout
        # ===============================
        col1, col2 = st.columns([2, 3])

        with col1:
            st.markdown(
                f"""
                <div style="background-color:#f0f9ff;
                            padding:20px; border-radius:12px;
                            box-shadow:0 0 8px rgba(0,0,0,0.1);">
                <h3 style="color:#0b5394;">Prediction</h3>
                <p style="font-size:22px; font-weight:bold; color:#333;">{safe_label}</p>
                </div>
                """,
                unsafe_allow_html=True
            )

        with col2:
            st.markdown(
                f"""
                <div style="background-color:#fff7e6;
                            padding:20px; border-radius:12px;
                            box-shadow:0 0 8px rgba(0,0,0,0.1);">
                <h3 style="color:#b45f06;">Explanation</h3>
                <p style="font-size:16px; color:#444;">{safe_exp}</p>
                </div>
                """,
                unsafe_allow_html=True
            )

        # Retrieved Documents
        st.write("### 📚 Retrieved Documents")
        for i, d in enumerate(docs, 1):
            with st.expander(f"Document {i}"):
                st.write(d)

# Footer
st.markdown("---")
st.markdown(
    "<center><sub>⚡ Powered by DistilBERT + RAG | Designed with Streamlit</sub></center>",
    unsafe_allow_html=True
)


Overwriting app.py


In [56]:
from pyngrok import ngrok


In [59]:
# 🚫 Kill any old Streamlit / ngrok processes
# The `|| echo ...` fallback prints a message when pkill finds nothing to kill.
!pkill streamlit || echo "No old streamlit running"
!pkill ngrok || echo "No old ngrok running"

# ▶️ Start Streamlit app fresh on port 8501
# Runs in the background with its output discarded, so startup errors are not shown here.
!streamlit run app.py --server.port 8501 &>/dev/null&

# 🌍 Connect ngrok to port 8501
# NOTE(review): the tunnel is opened right after the background launch; presumably
# the server may still be starting when the URL is first visited — confirm.
from pyngrok import ngrok
public_url = ngrok.connect(8501)
# public_url holds the tunnel object returned by ngrok.connect; print() shows its repr.
print("✅ Streamlit running at:", public_url)


No old streamlit running
✅ Streamlit running at: NgrokTunnel: "https://5a45af65c6d3.ngrok-free.app" -> "http://localhost:8501"
