In [8]:
import pandas as pd
import re

# Load the Excel file
df = pd.read_excel("dataset_2_2.xlsx", header=None, names=["raw1"])

In [9]:
# Drop missing values
df.dropna(inplace=True)

In [14]:
# Split text and label at the last semicolon
df[['text1', 'label1']] = df['raw1'].str.rsplit(";", n=1, expand=True)

# Fill missing labels with empty strings
df.loc[:, 'label1'] = df['label1'].fillna('')

In [106]:
# Apply cleaning
df["clean_text1"] = df["text1"].apply(clean_tweet)
df = df.sample(n=25, random_state=42)

In [107]:
# Preview
print(df[["clean_text1", "label1"]].head())

                                            clean_text1    label1
65    i feel a little stunned but can t imagine what...  surprise
746   i feel a bit like a naughty kid who went and s...      love
1860  i feel so dirty but after spending a day at th...   sadness
1083  i found myself feeling nostalgic as i thought ...      love
1292   i do feel discouraged by what my supervisor said   sadness


In [108]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset, load_metric

In [109]:
# Prepare dataset1
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [110]:
# Encode texts
def tokenize(batch):
    return tokenizer(batch['clean_text1'], padding=True, truncation=True)

In [90]:
# Convert DataFrame to Dataset
dataset1 = Dataset.from_pandas(df[['clean_text1', 'label1']])

In [111]:
# Tokenize
dataset1 = dataset1.map(tokenize, batched=True)

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [118]:
# Split train/test
train_test = dataset1.train_test_split(test_size=0.2)

In [119]:
# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(labels))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [121]:
# Training args
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [122]:
# Define metric
metric = load_metric('f1')

def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    return metric.compute(predictions=preds, references=p.label_ids, average='macro')

In [123]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test['train'],
    eval_dataset=train_test['test'],
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss


AttributeError: 'float' object has no attribute 'size'

In [125]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Predictions
preds_output = trainer.predict(train_test['test1'])
pred_labels = np.argmax(preds_output.predictions, axis=1)
true_labels = preds_output.label_ids

# Print metrics
print(classification_report(true_labels, pred_labels, target_names=labels))

# Confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns

cm = confusion_matrix(true_labels, pred_labels)
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


KeyError: 'test1'

In [126]:
model.save_pretrained("./emotion_bert_model")
tokenizer.save_pretrained("./emotion_bert_model")

('./emotion_bert_model/tokenizer_config.json',
 './emotion_bert_model/special_tokens_map.json',
 './emotion_bert_model/vocab.txt',
 './emotion_bert_model/added_tokens.json',
 './emotion_bert_model/tokenizer.json')

In [127]:
import streamlit as st
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F

@st.cache(allow_output_mutation=True)
def load_model():
    tokenizer = BertTokenizer.from_pretrained("./emotion_bert_model")
    model = BertForSequenceClassification.from_pretrained("./emotion_bert_model")
    model.eval()
    return tokenizer, model

tokenizer, model = load_model()

st.title("Tweet Emotion Detection")

user_input = st.text_area("Enter Tweet Text:")

if st.button("Predict Emotion"):
    inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
    pred_idx = torch.argmax(probs).item()
    pred_prob = probs[0][pred_idx].item()
    emotion = list(label2id.keys())[list(label2id.values()).index(pred_idx)]

    st.write(f"**Predicted Emotion:** {emotion}")
    st.write(f"**Confidence:** {pred_prob:.2f}")

    st.write("**Probabilities:**")
    for i, label in enumerate(label2id.keys()):
        st.write(f"{label}: {probs[0][i].item():.2f}")


2025-07-11 07:58:11.074 
`st.cache` is deprecated and will be removed soon. Please use one of Streamlit's new
caching commands, `st.cache_data` or `st.cache_resource`. More information
[in our docs](https://docs.streamlit.io/develop/concepts/architecture/caching).

**Note**: The behavior of `st.cache` was updated in Streamlit 1.36 to the new caching
logic used by `st.cache_data` and `st.cache_resource`. This might lead to some problems
or unexpected behavior in certain edge cases.

