In [2]:
import torch
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
from transformers import BertModel
import torch.nn as nn
import joblib


In [3]:
# Load the BERT tokenizer. The checkpoint name is the same one the classifier's
# BertModel encoder is created from, so token ids line up with its embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# Load the label encoder; its `classes_` array is what predict_emotion uses to
# turn class indices back into emotion names.
# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load a label_encoder.pkl that comes from a trusted source.
label_encoder = joblib.load('label_encoder.pkl')  # Replace with actual path to your saved label encoder

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
class BERT_LSTM_EmotionClassifier(nn.Module):
    """Emotion classifier: pre-trained multilingual BERT -> LSTM -> linear head.

    The BERT token embeddings are fed to an LSTM; the final hidden state of the
    last LSTM layer (both directions concatenated when bidirectional) goes
    through dropout and a linear layer to produce one logit per class.
    Optionally, logits can be nudged towards an emotion when a sentence
    contains a keyword listed in ``word_weight_mapping``.

    Args:
        num_classes: Number of output classes (size of the logit vector).
        hidden_dim: Hidden size of the LSTM (per direction).
        lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the LSTM summary vector.
        word_weight_mapping: Optional dict mapping a keyword to
            ``{'label': <emotion name>, 'weight': <float>}``. ``None`` means
            no keyword boosting (stored as an empty dict).
        label_encoder: Object with a ``transform([label])`` method returning
            the class index for an emotion name. Only needed when keyword
            boosting is actually triggered.
    """

    def __init__(self, num_classes, hidden_dim=128, lstm_layers=1, bidirectional=True, dropout=0.3, word_weight_mapping=None, label_encoder=None):
        super().__init__()

        # Pre-trained encoder. Attribute names (bert / lstm / dropout / fc) are
        # part of the checkpoint format: state_dict keys are derived from them.
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')

        # Sequence model over the BERT token embeddings.
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )

        self.dropout = nn.Dropout(dropout)

        # A bidirectional LSTM yields two hidden states per layer, hence 2x.
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, num_classes)

        # Keyword boosting configuration (see apply_word_weights).
        self.word_weight_mapping = word_weight_mapping if word_weight_mapping is not None else {}
        self.label_encoder = label_encoder

    def forward(self, input_ids, attention_mask, sentences=None):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids, passed straight to BERT.
            attention_mask: Attention mask, passed straight to BERT.
            sentences: Optional sequence of raw sentences, one per batch row.
                When given, keyword weights are added to the logits.

        Returns:
            Tensor of logits produced by the linear head (one row per input).
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_output.last_hidden_state

        # Only the final hidden states are used; per-step outputs are dropped.
        _, (h_n, _) = self.lstm(sequence_output)

        if self.lstm.bidirectional:
            # Last two entries of h_n are the two directions of the top layer.
            summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            summary = h_n[-1]

        logits = self.fc(self.dropout(summary))

        if sentences is not None:
            logits = self.apply_word_weights(logits, sentences)

        return logits

    def apply_word_weights(self, logits, sentences):
        """Add keyword weights to the logits, in place.

        For every sentence ``i`` and every keyword of ``word_weight_mapping``
        that occurs in it as a substring, the keyword's weight is added to
        ``logits[i, index_of(label)]``.

        Args:
            logits: 2-D tensor indexed as ``[row, class]``; modified in place.
            sentences: Sequence of strings, aligned with the rows of ``logits``.

        Returns:
            The same ``logits`` tensor.

        Raises:
            ValueError: If a keyword matches but no ``label_encoder`` was
                supplied, so the emotion name cannot be turned into an index.
        """
        # Label -> class index, resolved lazily so the encoder is consulted at
        # most once per label per call and never when nothing matches.
        label_to_index = {}

        for row, sentence in enumerate(sentences):
            for word, info in self.word_weight_mapping.items():
                if word not in sentence:
                    continue

                emotion_label = info['label']
                weight = info['weight']

                if emotion_label not in label_to_index:
                    if self.label_encoder is None:
                        raise ValueError(
                            "apply_word_weights needs a label_encoder to map emotion "
                            "labels to class indices, but none was provided"
                        )
                    label_to_index[emotion_label] = int(
                        self.label_encoder.transform([emotion_label])[0]
                    )

                # Additive boost on the matching class logit.
                logits[row, label_to_index[emotion_label]] += weight

        return logits


In [5]:
# Kannada stop words dropped by preprocess_text. A couple of entries appear to
# be listed twice; that is harmless because a set keeps one copy of each.
stop_words = {
    'ನಾನು', 'ಅದು', 'ಅವರು', 'ಮತ್ತು', 'ಈ',
    'ಇದು', 'ಎಂದು', 'ಆ', 'ಅದೇ', 'ಇದನ್ನು',
    'ನಾವು', 'ಅದನ್ನು', 'ನಿನ್ನ', 'ನನಗೆ', 'ಅವನು',
    'ಅವಳು', 'ನಿಮ್ಮ', 'ಅವಳ', 'ಅವನ', 'ನನ್ನ',
    'ಮತ್ತು', 'ಅವರು', 'ಇಲ್ಲ', 'ನೀನು', 'ಹಾಗೂ',
    'ಅವು', 'ಹಾಗೆ', 'ಅಲ್ಲಿ', 'ಇಲ್ಲಿ', 'ಇವು',
}

# Keyword -> {'label': emotion name, 'weight': logit boost}. Every keyword
# carries the same weight (2.0); keywords are grouped by emotion here and
# flattened into the per-word dictionary the classifier expects.
word_weight_mapping = {
    keyword: {'label': emotion, 'weight': 2.0}
    for emotion, keywords in (
        ('joy', ('ಸಂತೋಷ', 'ಹಾಸ್ಯ', 'ಖುಷಿ', 'ಉಲ್ಲಾಸ', 'ನಗು')),
        ('fear', ('ಭಯ', 'ಅಂಜಿಕೆ', 'ಹೆದರಿಕೆ')),
        ('anger', ('ಕ್ರೋಧ', 'ಸಿಟ್ಟು', 'ಕೋಪ')),
        ('sadness', ('ದುಃಖ', 'ಬೇಸರ', 'ನಿರಾಸೆ')),
        ('surprise', ('ಆಶ್ಚರ್ಯ', 'ಅಚ್ಚರಿ', 'ವಿಸ್ಮಯ', 'ಆಘಾತ', 'ಬೆರಗು')),
        ('disgust', ('ಅಸಹ್ಯ', 'ದ್ವೇಷ', 'ದುರಾಸೆ', 'ತಿರಸ್ಕಾರ')),
    )
    for keyword in keywords
}


In [14]:
import torch
import torch.nn.functional as F
import numpy as np
import re
from langdetect import detect, DetectorFactory, LangDetectException

# Pin langdetect's seed so detect() returns the same language for the same text on every run.
DetectorFactory.seed = 0  # Ensures consistent language detection

def preprocess_text(text):
    """Clean a piece of text for the emotion model; non-Kannada text is dropped.

    Steps, in order:
      1. Language check with langdetect; anything not detected as "kn"
         (or on which detection fails) yields "".
      2. Strip punctuation/special characters, newlines, and any token that
         contains a digit.
      3. Fold a negation token into the preceding word as ``NOT_<word>``.
      4. Remove words found in the module-level ``stop_words`` set.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, space-joined string, or "" when the text is not Kannada
        or its language cannot be detected.
    """
    try:
        # Detect the language of the entire text first.
        if detect(text) != "kn":
            return ""  # Skip non-Kannada text entirely
    except LangDetectException:
        # langdetect could not decide on a language for this input.
        return ""

    # Remove punctuation and special characters (Kannada script is kept).
    text = re.sub(r'[^\w\s\u0C80-\u0CFF]', '', text)

    # Replace newline characters with spaces.
    text = re.sub(r'\n', ' ', text)

    # Remove words containing numbers.
    text = re.sub(r'\w*\d\w*', '', text)

    words = text.split()

    # Handle Kannada negations: a word containing one of these tokens is not
    # kept itself; instead the previous kept word is prefixed with "NOT_".
    # NOTE(review): the check is a substring match, so it also fires on words
    # that merely contain a negation token (e.g. 'ಇಲ್ಲಿ') -- confirm intended.
    negation_words = ['ಇಲ್ಲ', 'ವಾಗಲಿಲ್ಲ', 'ಅಲ್ಲ']
    # Must exist before the loop: it is both appended to and indexed below.
    final_words = []
    for i, word in enumerate(words):
        if i > 0 and any(neg in word for neg in negation_words):
            final_words[-1] = "NOT_" + final_words[-1]
        else:
            final_words.append(word)

    # Remove stopwords (uses the module-level `stop_words` set).
    final_words = [word for word in final_words if word not in stop_words]

    return ' '.join(final_words)


In [7]:
# Choose the inference device first: GPU when CUDA is available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebuild the architecture and restore the trained weights from disk.
# NOTE(review): num_classes=7 is presumably the number of classes in the label
# encoder -- confirm. The model is created without word_weight_mapping /
# label_encoder, so keyword boosting is inactive for this instance.
model_path = 'best_model_1.pth'
model = BERT_LSTM_EmotionClassifier(num_classes=7)
# The checkpoint is deserialised onto the CPU; the move to `device` happens below.
# NOTE(security): torch.load unpickles the file -- only load trusted checkpoints.
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
# Inference mode (disables dropout).
model.eval()
# Move the model to the selected device; as the last expression this also
# displays the module summary in the notebook.
model.to(device)

BERT_LSTM_EmotionClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [11]:
def predict_emotion(text, threshold=0.2):
    """Predict the emotion(s) expressed in a (possibly multi-clause) text.

    The text is split on ".", "ಆದರೆ" and "ಮತ್ತು"; each non-empty segment is
    preprocessed, tokenized and classified. Every class whose softmax
    probability is at least ``threshold`` counts as detected.

    Args:
        text: Raw input text.
        threshold: Minimum softmax probability for a class to be reported.

    Returns:
        - the emotion name when exactly one distinct emotion is detected,
        - ``"Mixed Emotion (a, b, ...)"`` when several are detected, listed
          once each in order of first detection (never nested),
        - ``None`` when nothing is detected (e.g. no usable Kannada segment).
    """
    delimiters = [".", "ಆದರೆ", "ಮತ್ತು"]
    split_pattern = "|".join(map(re.escape, delimiters))
    segments = [seg.strip() for seg in re.split(split_pattern, text) if seg.strip()]

    # The model may live on the GPU; inputs must be on the same device.
    model_device = next(model.parameters()).device
    emotion_labels = label_encoder.classes_

    # dict used as an insertion-ordered set, so the output order is stable.
    detected = {}

    for segment in segments:
        cleaned = preprocess_text(segment)
        if not cleaned:
            continue  # Skip segments that are not Kannada / empty after cleaning

        inputs = tokenizer(cleaned, return_tensors="pt", padding=True, truncation=True, max_length=128)
        input_ids = inputs["input_ids"].to(model_device)
        attention_mask = inputs["attention_mask"].to(model_device)

        with torch.no_grad():
            logits = model(input_ids, attention_mask)
            # Single-sentence batch: drop the batch dimension only.
            probabilities = torch.softmax(logits, dim=1).squeeze(0)

        for idx, prob in enumerate(probabilities.tolist()):
            if prob >= threshold:
                detected.setdefault(str(emotion_labels[idx]), None)

    emotions = list(detected)
    if len(emotions) > 1:
        return "Mixed Emotion (" + ", ".join(emotions) + ")"
    if emotions:
        return emotions[0]
    return None

In [13]:
# Demo input: two clauses joined by "ಆದರೆ", which predict_emotion treats as a delimiter.
sentence = "ರಾತ್ರಿ ಹೊರಗೆ ಹೋಗುವುದಕ್ಕೆ ನನಗೆ ಬಹಳ Fear. ಆದರೆ ನನ್ನ ಸ್ನೇಹಿತನ ನಿರ್ಲಕ್ಷ್ಯದಿಂದ ನನಗೆ ಕೋಪ ಬಂದಿದೆ."
predicted_label = predict_emotion(sentence)
# print() separates its arguments with a single space, giving the same line as before.
print("Predicted Emotion Label:", predicted_label)

Predicted Emotion Label: Mixed Emotion (Disgust, Anger)
