📘 PHASE 1: Install Required Libraries

In [None]:
!pip install -Uqq transformers==4.54.1 datasets wandb streamlit pyngrok

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h

📘 PHASE 2: Load Dataset (CSV/TXT)

In [None]:
# Attach Google Drive to this Colab runtime; the dataset files are read from it below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
from pathlib import Path

import pandas as pd

# Folder on the mounted Drive that holds the three split files.
DATA_DIR = Path('/content/drive/MyDrive/motion_detector_dataset')


def load_split(filename, data_dir=DATA_DIR):
    """Read one split file into a DataFrame with `text` and `label` columns.

    The file is parsed as header-less, `;`-separated rows of `<text>;<label>`.

    Args:
        filename: Name of the split file inside `data_dir` (e.g. 'train.txt').
        data_dir: Directory containing the split files.

    Returns:
        pandas.DataFrame with the columns `text` and `label`.
    """
    return pd.read_csv(data_dir / filename, sep=';', header=None, names=['text', 'label'])


# Load dataset
df_train = load_split('train.txt')
df_val = load_split('val.txt')
df_test = load_split('test.txt')

# A line without a ';' separator parses with a missing label; surface that here
# instead of failing later during tokenization or label encoding.
for split_name, split_df in (('train', df_train), ('val', df_val), ('test', df_test)):
    if split_df.isna().any().any():
        raise ValueError(
            f"{split_name} split contains rows with a missing text or label; "
            "check the file for lines without a ';' separator"
        )

df_train.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


📘 PHASE 3: Preprocessing & Tokenization


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Fixed length every example is padded/truncated to. Without an explicit
# `max_length`, padding="max_length" pads to the model maximum (512 tokens for
# this checkpoint), which makes training on short sentences needlessly slow.
# NOTE(review): 128 is assumed to cover the longest sentence in this dataset;
# longer inputs are truncated — confirm against the token-length distribution.
MAX_SEQ_LENGTH = 128

# Convert to Hugging Face datasets
train_ds = Dataset.from_pandas(df_train)
val_ds = Dataset.from_pandas(df_val)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(example):
    """Tokenize a batch of rows to a fixed length of MAX_SEQ_LENGTH tokens.

    Args:
        example: Batch mapping with a "text" entry (called with batched=True).

    Returns:
        The tokenizer output for the batch, padded and truncated to
        MAX_SEQ_LENGTH so all rows have the same length.
    """
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )


# Apply tokenization
tokenized_train_ds = train_ds.map(tokenize_function, batched=True)
tokenized_val_ds = val_ds.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

📘 PHASE 4: Label Encoding

In [None]:
# Turn the string labels into integer class ids on both splits.
tokenized_train_ds = tokenized_train_ds.class_encode_column("label")
tokenized_val_ds = tokenized_val_ds.class_encode_column("label")

# Each split is encoded independently, so an emotion missing from one split
# would give the two splits different id <-> name mappings. Fail loudly
# instead of evaluating against shifted labels.
train_label_names = tokenized_train_ds.features["label"].names
val_label_names = tokenized_val_ds.features["label"].names
if train_label_names != val_label_names:
    raise ValueError(
        f"Label sets differ between splits: train={train_label_names}, val={val_label_names}"
    )

Casting to class labels:   0%|          | 0/16000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2000 [00:00<?, ? examples/s]

📘 PHASE 5: Model & Training

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed

SEED = 42
# Seed before building the model: the classification head is newly
# initialized, so its starting weights depend on the RNG state.
set_seed(SEED)

# Derive the label mapping from the encoded dataset instead of hard-coding the
# number of classes, and store it in the model config so the saved checkpoint
# carries its own id <-> emotion-name mapping.
label_names = tokenized_train_ds.features["label"].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# Training args
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",   # evaluate on the validation split after every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_dir="./logs",
    report_to="none",        # no external experiment tracker
    seed=SEED,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds
)

# Train
trainer.train()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2372,0.203303


Epoch,Training Loss,Validation Loss
1,0.2372,0.203303
2,0.1348,0.154045


TrainOutput(global_step=4000, training_loss=0.2622231330871582, metrics={'train_runtime': 1585.3042, 'train_samples_per_second': 20.185, 'train_steps_per_second': 2.523, 'total_flos': 4239259140096000.0, 'train_loss': 0.2622231330871582, 'epoch': 2.0})

In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory so
# both can be reloaded from the same path.
MODEL_DIR = "emotion_model"
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)


('emotion_model/tokenizer_config.json',
 'emotion_model/special_tokens_map.json',
 'emotion_model/vocab.txt',
 'emotion_model/added_tokens.json',
 'emotion_model/tokenizer.json')

📘 PHASE 6: Inference (Single Sentence)

In [None]:
import torch

# Get label names from the dataset; list position is the integer class id.
label_names = tokenized_train_ds.features['label'].names

# Map from ID to label name
id2label = dict(enumerate(label_names))

# Emotion label to emoji+name
emotion_labels = {
    "sadness": "😢 Sadness",
    "joy": "😊 Joy",
    "love": "❤️ Love",
    "anger": "😠 Anger",
    "fear": "😨 Fear",
    "surprise": "😲 Surprise"
}

# Example message for prediction
text = "I feel really happy and excited today!"

# Switch off dropout so the prediction is deterministic.
model.eval()

# Tokenize input and move it to the device the model lives on
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Predict
with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)

# Take the argmax along the class dimension; a bare argmax() would index into
# the flattened tensor and only happens to be right for a batch of one.
pred_id = probs.argmax(dim=1).item()
pred_label = id2label[pred_id]
# Fall back to a generic rendering instead of raising KeyError on a label
# that has no emoji entry.
pred_emotion = emotion_labels.get(pred_label, f"❓ {pred_label.capitalize()}")

print("Predicted Emotion:", pred_emotion)

Predicted Emotion: 😊 Joy


📘 PHASE 7: Build Streamlit App

In [None]:
%%writefile app.py
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Page settings
st.set_page_config(page_title="Emotion Detector", page_icon="🧠", layout="centered") # Reverted to set_page_config

# Load trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("emotion_model")
tokenizer = AutoTokenizer.from_pretrained("emotion_model")
model.eval()

# Define id2label based on the dataset used in training (from Phase 6)
id2label = {
    0: 'anger',
    1: 'fear',
    2: 'joy',
    3: 'love',
    4: 'sadness',
    5: 'surprise'
}

# Add emojis for each emotion (excluding neutral as it's not in the dataset)
emotion_emojis = {
    "sadness": "😢 Sadness",
    "joy": "😊 Joy",
    "love": "❤️ Love",
    "anger": "😠 Anger",
    "fear": "😨 Fear",
    "surprise": "😲 Surprise"
}


# App header
st.markdown("<h1 style='text-align: center; color: #4A90E2;'>🧠 Emotion Detection App</h1>", unsafe_allow_html=True)
st.markdown("<p style='text-align: center;'>Enter a message below to detect its emotional tone.</p>", unsafe_allow_html=True)

# Input form
with st.form(key='emotion_form'):
    user_input = st.text_area("📝 Your Message:", height=150)
    submit_button = st.form_submit_button(label="🔍 Detect Emotion")

if submit_button:
    if user_input.strip() == "":
        st.warning("Please enter some text.")
    else:
        # Tokenize input
        inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            # Ensure probabilities are calculated across the correct dimension (dim=1 for batch)
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)

        pred_id = torch.argmax(probs, dim=1).item()
        pred_label = id2label[pred_id]
        confidence = probs[0][pred_id].item()

        # Get emotion text with emoji
        # Use get with a default to handle any unexpected labels, though id2label should prevent this.
        emotion_text_with_emoji = emotion_emojis.get(pred_label, f"❓ {pred_label.capitalize()}")


        # Result Display
        st.markdown(f"""
            <div style='background-color:#E6F4EA; padding:20px; border-radius:10px; text-align:center;'>
                <h2 style='color:#2E8B57;'>{emotion_text_with_emoji}</h2>
                <p style='font-size: 16px;'>Confidence: {confidence:.2%}</p>
            </div>
        """, unsafe_allow_html=True)

        # Bar chart for all emotion scores
        # Ensure we iterate through the correct number of labels and use the correct mapping
        score_dict = {id2label[i]: float(probs[0][i]) for i in range(len(id2label))} # Use len(id2label) for correct range
        st.markdown("### 🔍 Emotion Confidence Scores")
        st.bar_chart(score_dict)

Overwriting app.py


📘 PHASE 8: Run Streamlit App on Colab (ngrok)

In [None]:
from pyngrok import ngrok

# Add your ngrok authtoken here
ngrok.set_auth_token("")

!pkill streamlit
public_url = ngrok.connect("8501")
print("🌐 Streamlit App URL:", public_url)

!streamlit run app.py &

🌐 Streamlit App URL: NgrokTunnel: "https://bdb7cbff9f92.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.16.153.96:8501[0m
[0m
2025-07-31 11:30:15.152544: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753961415.176333   16591 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753961415.183474   16591 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
