In [35]:
# utilities
import re
import numpy as np
import pandas as pd

# nltk
from nltk.stem import WordNetLemmatizer

#SpellCorrection
from spellchecker import SpellChecker

import string
import emoji
import chardet

In [36]:
DATASET_COLUMNS = ['date', 'username', 'text', 'polarity', 'emotion']

#Detect file encoding using chardet
with open('data.csv', 'rb') as f:
    result = chardet.detect(f.read())

# Print the detected encoding
print("Detected encoding:", result['encoding'])

# Read the file using the detected encoding
df = pd.read_csv('data.csv', encoding=result['encoding'], names=DATASET_COLUMNS)
df.sample(5)

Detected encoding: UTF-8-SIG


Unnamed: 0,date,username,text,polarity,emotion
44,"5:39 PM · Oct 23, 2023",@realChokiie,2/2\n\n4) when ETF SPOT will be approved then ...,2,anticipation
276,"1:10 PM · Oct 24, 2023",@NaijaExcellence,👇🏽👇🏽It's only in crypto that you can be fearfu...,2,happy
1,"7:54 AM · Oct 26, 2023",@crypto_chin,$HAY bullflag breakout👀\n\nLets fill that wick🚀⏳,2,anticipation
118,"7:28 PM · Oct 25, 2023",@amonbuy,At some point nobody will be able to ignore th...,2,anticipation
53,"9:43 PM · Oct 23, 2023",@trendguards,🟢📈 Green vibes in the market today! 🚀💚\n\nIt's...,2,happy


In [37]:
#Data preprocessing
data=df[['text','polarity', 'emotion']]

In [42]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)
dataset['text'] = dataset['text'].apply(lambda x: cleaning_numbers(x))
dataset['text'].tail()

582                Which project has strong community? 🚀
584    New Zealand Rapper Sesh and DogeCoin Millionai...
585    The founder of the bankrupt cryptocurrency exc...
586    Unlock the Future with .mmit Domains! Join ove...
595    If you sleep now you will have a dream but if ...
Name: text, dtype: object

In [43]:
emoticons_to_keep = [
    '💰', '📈', '🤣', '🎊', '😂', '😭', '🙁', '😞', '💔', '😢', '😮', '😵', '🙀',
    '😱', '❗', '😠', '😡', '😤', '👎', '🔪', '🌕', '🚀', '💎', '👀', '💭', '📉',
    '😨', '😩', '😰', '💸'
]

def clean_tweet(text):
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove hashtags and mentions
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove special characters except for emoticons
    text = re.sub(r'[^\w\s.!?{}]+'.format(''.join(emoticons_to_keep)), '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

# Apply the modified cleaning function to the 'text' column in your dataset
dataset['text'] = dataset['text'].apply(clean_tweet)

# Display the 'text' column in the entire dataset
print(dataset['text'])

0      BTC ON GLP RESISTANCE FOR NOW PLAY SAFE IF U R...
1            HAY bullflag breakout👀 Lets fill that wick🚀
2      Did you guys see how is doing a pitch with a d...
3      GN Fam going early to bed been up since or AM ...
4      You think this week has been fun?!? 😂😂😂😂 Wait ...
                             ...                        
582                Which project has strong community? 🚀
584    New Zealand Rapper Sesh and DogeCoin Millionai...
585    The founder of the bankrupt cryptocurrency exc...
586    Unlock the Future with .mmit Domains! Join ove...
595    If you sleep now you will have a dream but if ...
Name: text, Length: 347, dtype: object


In [44]:
# Initialize SpellChecker only once to avoid re-creation for each call
spell = SpellChecker()

# Function for spell correction
def spell_correction(text):
    words = text.split()
    misspelled = spell.unknown(words)
    corrected_words = []
    for word in words:
        if word in misspelled:
            corrected_word = spell.correction(word)
            # Check if the correction is not None, otherwise use the original word
            corrected_words.append(corrected_word if corrected_word is not None else word)
        else:
            corrected_words.append(word)
    return ' '.join(corrected_words)

# Apply spell correction to the entire 'text' column
dataset['text'] = dataset['text'].apply(spell_correction)

# Display the entire dataset
print(dataset)

                                                  text  polarity       emotion
0    BTC ON GLP RESISTANCE FOR NOW PLAY SAFE IF U R...         2         happy
1            HAY bullfrog breakout Lets fill that wick         2  anticipation
2    Did you guys see how is doing a pitch with a d...         2         happy
3    GN Fam going early to bed been up since or AM ...         2         happy
4    You think this week has been fun?!? 😂😂😂😂 Wait ...         2  anticipation
..                                                 ...       ...           ...
582               Which project has strong community i         1         happy
584  New Zealand Rapper Sesh and DogeCoin Millionai...         1         happy
585  The founder of the bankrupt cryptocurrency exc...         1         happy
586  Unlock the Future with emmit Domains! Join ove...         1         happy
595  If you sleep now you will have a dream but if ...         1         happy

[347 rows x 3 columns]


In [45]:
#Define the emoticon dictionary outside the function for a wider scope
emoticon_dict = {
    "🌈": "Rainbow",
    "🌙": "Crescent Moon",
    "🌚": "New Moon Face",
    "🌞": "Sun with Face",
    "🌟": "Glowing Star",
    "🌷": "Tulip",
    "🌸": "Cherry Blossom",
    "🌹": "Rose",
    "🌺": "Hibiscus",
    "🍀": "Four Leaf Clover",
    "🍕": "Pizza",
    "🍻": "Clinking Beer Mugs",
    "🎀": "Ribbon",
    "🎈": "Balloon",
    "🎉": "Party Popper",
    "🎤": "Microphone",
    "🎥": "Movie Camera",
    "🎧": "Headphone",
    "🎵": "Musical Note",
    "🎶": "Musical Notes",
    "👀": "Eyes",
    "👅": "Tongue",
    "👇": "Backhand Index Pointing Down",
    "👈": "Backhand Index Pointing Left",
    "👉": "Backhand Index Pointing Right",
    "👋": "Waving Hand",
    "👌": "OK Hand",
    "👍": "Thumbs Up",
    "👏": "Clapping Hands",
    "👑": "Crown",
    "💀": "Skull",
    "💁": "Person Tipping Hand",
    "💃": "Woman Dancing",
    "💋": "Kiss Mark",
    "💎": "Gem Stone",
    "💐": "Bouquet",
    "💓": "Beating Heart",
    "💕": "Two Hearts",
    "💖": "Sparkling Heart",
    "💗": "Growing Heart",
    "💘": "Heart with Arrow",
    "💙": "Blue Heart",
    "💚": "Green Heart",
    "💛": "Yellow Heart",
    "💜": "Purple Heart",
    "💞": "Revolving Hearts",
    "💤": "Zzz",
    "💥": "Collision",
    "💦": "Sweat Droplets",
    "💪": "Flexed Biceps",
    "💫": "Dizzy",
    "💯": "Hundred Points",
    "💰": "Money Bag",
    "📷": "Camera",
    "🔥": "Fire",
    "😀": "Grinning Face",
    "😁": "Beaming Face with Smiling Eyes",
    "😂": "Face with Tears of Joy",
    "😃": "Grinning Face with Big Eyes",
    "😄": "Grinning Face with Smiling Eyes",
    "😅": "Grinning Face with Sweat",
    "😆": "Grinning Squinting Face",
    "😇": "Smiling Face with Halo",
    "😈": "Smiling Face with Horns",
    "😉": "Winking Face",
    "😊": "Smiling Face with Smiling Eyes",
    "😋": "Face Savoring Food",
    "😌": "Relieved Face",
    "😍": "Smiling Face with Heart-Eyes",
    "😎": "Smiling Face with Sunglasses",
    "😏": "Smirking Face",
    "😺": "Smiling Cat with Smiling Eyes",
    "😻": "Smiling Cat with Heart-Eyes",
    "😽": "Kissing Cat with Closed Eyes",
    "🙀": "Weary Cat",
    "🙏": "Folded Hands",
    "☀": "Sun",
    "☺": "Smiling Face",
    "♥": "Heart Suit",
    "✅": "Check Mark Button",
    "✈": "Airplane",
    "✊": "Raised Fist",
    "✋": "Raised Hand",
    "✌": "Victory Hand",
    "✔": "Check Mark",
    "✨": "Sparkles",
    "❄": "Snowflake",
    "❤": "Red Heart",
    "⭐": "Star",
    "😢": "Crying Face",
    "😭": "Loudly Crying Face",
    "😞": "Disappointed Face",
    "😟": "Worried Face",
    "😠": "Angry Face",
    "😡": "Pouting Face",
    "😔": "Pensive Face",
    "😕": "Confused Face",
    "😖": "Confounded Face",
    "😨": "Fearful Face",
    "😩": "Weary Face",
    "😪": "Sleepy Face",
    "😫": "Tired Face",
    "😰": "Anxious Face with Sweat",
    "😱": "Face Screaming in Fear",
    "😳": "Flushed Face",
    "😶": "Face Without Mouth",
    "😷": "Face with Medical Mask",
    "👊": "Oncoming Fist",
    "👎": "Thumbs Down",
    "❌": "Cross Mark",
    "😲": "Astonished Face",
    "😯": "Hushed Face",
    "😮": "Face with Open Mouth",
    "😵": "Dizzy Face",
    "🙊": "Speak-No-Evil Monkey",
    "🙉": "Hear-No-Evil Monkey",
    "🙈": "See-No-Evil Monkey",
    "💭": "Thought Balloon",
    "❗": "Exclamation Mark",
    "⚡": "High Voltage",
    "🎊": "Confetti Ball",
    "🙁": "Slightly frowning face",
    "💔": "Broken Heart",
    "😤": "Face with Steam from Nose",
    "🔪": "Hocho",
    "🌕": "Full Moon",
    "🚀": "Rocket",
    "📉": "Down Trend",
    "🤣": "Rolling on the Floor Laughing",
    "💸": "Money with Wings"
}

# Emoticon to word conversion function
def convert_emoticons_to_words(text):
    changed_emoticons = 0  # Variable to count the number of changed emoticons
    for emoticon, word in emoticon_dict.items():
        while emoticon in text:
            text = text.replace(emoticon, word + " ", 1)
            changed_emoticons += 1
    return text, changed_emoticons

# Apply the function and count emoticons for each row
def apply_conversion(text):
    converted_text, count = convert_emoticons_to_words(text)
    return pd.Series([converted_text, count], index=['converted_text', 'emoticons_count'])

conversion_results = dataset['text'].apply(apply_conversion)
dataset['converted_text'] = conversion_results['converted_text']
dataset['emoticons_count'] = conversion_results['emoticons_count']
print("Emoticons converted to words in 'converted_text' column.")
print(dataset[['converted_text', 'emoticons_count']].head())

Emoticons converted to words in 'converted_text' column.
                                      converted_text  emoticons_count
0  BTC ON GLP RESISTANCE FOR NOW PLAY SAFE IF U R...                0
1          HAY bullfrog breakout Lets fill that wick                0
2  Did you guys see how is doing a pitch with a d...                0
3  GN Fam going early to bed been up since or AM ...                0
4  You think this week has been fun?!? Face with ...               13


In [46]:
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

In [47]:
# Stopwords removal applied separately after the option has been chosen and processed
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

# Apply the stopwords cleaning after the loop, once the 'text' column has been updated accordingly
dataset['text'] = dataset['converted_text'].apply(cleaning_stopwords)
print("Stopwords removed from 'text' column.")
print(dataset['text'].head())

Stopwords removed from 'text' column.
0    BTC ON GLP RESISTANCE FOR NOW PLAY SAFE IF U R...
1                 HAY bullfrog breakout Lets fill wick
2    Did guys see pitch deck reaching community Tha...
3    GN Fam going early bed since AM morning nonsto...
4    You think week fun?!? Face Tears Joy Face Tear...
Name: text, dtype: object


In [48]:
# Function to clean repeating words
def cleaning_repeating_words(text):
    # This regex pattern targets whole words that are repeated
    return re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

# Assuming 'dataset' is a pandas DataFrame and 'text' is a column in it
# Apply the cleaning function for repeating words to each row in the 'text' column
dataset['text'] = dataset['text'].apply(cleaning_repeating_words)
print("Repeating words cleaned from 'text' column.")
print(dataset['text'].head())

Repeating words cleaned from 'text' column.
0    BTC ON GLP RESISTANCE FOR NOW PLAY SAFE IF U R...
1                 HAY bullfrog breakout Lets fill wick
2    Did guys see pitch deck reaching community Tha...
3    GN Fam going early bed since AM morning nonsto...
4    You think week fun?!? Face Tears Joy Face Tear...
Name: text, dtype: object


In [49]:
dataset['text']=dataset['text'].str.lower()
dataset['text'].head()

0    btc on glp resistance for now play safe if u r...
1                 hay bullfrog breakout lets fill wick
2    did guys see pitch deck reaching community tha...
3    gn fam going early bed since am morning nonsto...
4    you think week fun?!? face tears joy face tear...
Name: text, dtype: object

In [50]:
from nltk.tokenize import RegexpTokenizer

# The pattern matches word characters (\w) and punctuation marks ([^\w\s])
tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')

# Applying the modified tokenizer to the dataset
dataset['text'] = dataset['text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
dataset['text'] = dataset['text'].apply(tokenizer.tokenize)
dataset['text'].head()

0    [btc, on, glp, resistance, for, now, play, saf...
1          [hay, bullfrog, breakout, lets, fill, wick]
2    [did, guys, see, pitch, deck, reaching, commun...
3    [gn, fam, going, early, bed, since, am, mornin...
4    [you, think, week, fun, ?, !, ?, face, tears, ...
Name: text, dtype: object

In [51]:
import nltk
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
dataset['text']= dataset['text'].apply(lambda x: stemming_on_text(x))
dataset['text'].head()

0    [btc, on, glp, resistance, for, now, play, saf...
1          [hay, bullfrog, breakout, lets, fill, wick]
2    [did, guys, see, pitch, deck, reaching, commun...
3    [gn, fam, going, early, bed, since, am, mornin...
4    [you, think, week, fun, ?, !, ?, face, tears, ...
Name: text, dtype: object

In [52]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
dataset['text'] = dataset['text'].apply(lambda x: lemmatizer_on_text(x))
dataset['text'].head()

0    [btc, on, glp, resistance, for, now, play, saf...
1          [hay, bullfrog, breakout, lets, fill, wick]
2    [did, guys, see, pitch, deck, reaching, commun...
3    [gn, fam, going, early, bed, since, am, mornin...
4    [you, think, week, fun, ?, !, ?, face, tears, ...
Name: text, dtype: object

In [59]:
import numpy as np
from keras.models import Sequential, Model
from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from keras.regularizers import l2
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import joblib

# Assume dataset is loaded and has columns 'text', 'polarity', and 'emotion'

# Initialize the tokenizer with your dataset
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset['text'])

# Preparing the dataset for training
sequences = tokenizer.texts_to_sequences(dataset['text'])
X = pad_sequences(sequences, maxlen=100)  # Increase maxlen if needed

# Convert labels to one-hot encoding
encoder_polarity = LabelEncoder()
y_polarity = to_categorical(encoder_polarity.fit_transform(dataset['polarity']))

encoder_emotion = LabelEncoder()
y_emotion = to_categorical(encoder_emotion.fit_transform(dataset['emotion']))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_polarity, test_size=0.2, random_state=42)

# Create LSTM model
def create_lstm_model(input_length, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=input_length))
    model.add(Bidirectional(LSTM(64, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.01)))
    
    optimizer = Adam(learning_rate=0.0001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

lstm_model = create_lstm_model(100, y_train.shape[1])

# Early Stopping and Model Checkpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint('best_lstm_model.h5', save_best_only=True, monitor='val_loss')

# Train the LSTM model
lstm_model.fit(
    X_train, y_train, 
    epochs=20, 
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Load the best model
lstm_model.load_weights('best_lstm_model.h5')

# Extract features for SVM training
intermediate_layer_model = Model(inputs=lstm_model.input, outputs=lstm_model.layers[-3].output)
X_train_features = intermediate_layer_model.predict(X_train)

# Normalize features
scaler = StandardScaler().fit(X_train_features)
X_train_features = scaler.transform(X_train_features)

# Grid Search for SVM
param_grid = {
    'C': [1, 10, 100],
    'gamma': ['scale'],
    'kernel': ['linear']
}
grid_search = GridSearchCV(SVC(probability=True), param_grid, refit=True, verbose=3)
grid_search.fit(X_train_features, np.argmax(y_train, axis=1))

# Best SVM estimator
svm_classifier = grid_search.best_estimator_

# Evaluate SVM on test set
X_test_features = scaler.transform(intermediate_layer_model.predict(X_test))
y_pred = svm_classifier.predict(X_test_features)
print("Classification Report:")
print(classification_report(np.argmax(y_test, axis=1), y_pred))

# Save SVM model
joblib.dump(svm_classifier, 'svm_model.joblib')


Epoch 1/20
Epoch 2/20


  saving_api.save_model(


Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.804 total time=   0.0s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=0.875 total time=   0.0s
[CV 3/5] END ...C=1, gamma=scale, kernel=linear;, score=0.909 total time=   0.0s
[CV 4/5] END ...C=1, gamma=scale, kernel=linear;, score=0.745 total time=   0.0s
[CV 5/5] END ...C=1, gamma=scale, kernel=linear;, score=0.709 total time=   0.0s
[CV 1/5] END ..C=10, gamma=scale, kernel=linear;, score=0.804 total time=   0.0s
[CV 2/5] END ..C=10, gamma=scale, kernel=linear;, score=0.857 total time=   0.0s
[CV 3/5] END ..C=10, gamma=scale, kernel=linear;, score=0.927 total time=   0.0s
[CV 4/5] END ..C=10, gamma=scale, kernel=linear;, score=0.745 total time=   0.0s
[C

['svm_model.joblib']

In [63]:
from sklearn.metrics import classification_report

# ...

print("Detailed classification report:")
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=encoder_polarity.classes_))


Detailed classification report:


ValueError: Number of classes, 2, does not match size of target_names, 3. Try specifying the labels parameter

In [60]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Make predictions on the test set for polarity
y_pred_polarity = lstm_model_polarity.predict(X_test_polarity)
# Convert predictions from one-hot encoded to label encoded for evaluation
y_pred_polarity = np.argmax(y_pred_polarity, axis=1)
# Convert ground truth from one-hot encoded to label encoded for evaluation
y_true_polarity = np.argmax(y_test_polarity, axis=1)

# Calculate precision, recall, and F-measure for polarity
precision_polarity = precision_score(y_true_polarity, y_pred_polarity, average='weighted')
recall_polarity = recall_score(y_true_polarity, y_pred_polarity, average='weighted')
f1_score_polarity = f1_score(y_true_polarity, y_pred_polarity, average='weighted')

print(f'Polarity Precision: {precision_polarity:.4f}')
print(f'Polarity Recall: {recall_polarity:.4f}')
print(f'Polarity F1 Score: {f1_score_polarity:.4f}')

Polarity Precision: 0.3951
Polarity Recall: 0.6286
Polarity F1 Score: 0.4852


  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
# Make predictions on the test set for emotion
y_pred_emotion = lstm_model_emotion.predict(X_test_emotion)
# Convert predictions from one-hot encoded to label encoded for evaluation
y_pred_emotion = np.argmax(y_pred_emotion, axis=1)
# Convert ground truth from one-hot encoded to label encoded for evaluation
y_true_emotion = np.argmax(y_test_emotion, axis=1)

# Calculate precision, recall, and F-measure for emotion
precision_emotion = precision_score(y_true_emotion, y_pred_emotion, average='weighted')
recall_emotion = recall_score(y_true_emotion, y_pred_emotion, average='weighted')
f1_score_emotion = f1_score(y_true_emotion, y_pred_emotion, average='weighted')

print(f'Emotion Precision: {precision_emotion:.4f}')
print(f'Emotion Recall: {recall_emotion:.4f}')
print(f'Emotion F1 Score: {f1_score_emotion:.4f}')

Emotion Precision: 0.2359
Emotion Recall: 0.4857
Emotion F1 Score: 0.3176


  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
# Assuming `polarity_labels` is your list of original labels for the training data
# Example list of polarity labels used in your training dataset
polarity_labels = ['positive', 'negative', 'neutral']  # This should be replaced by the actual labels you have

# Initialize the LabelEncoder
encoder_polarity = LabelEncoder()

# Fit the LabelEncoder with your actual labels
encoder_polarity.fit(polarity_labels)

# Assuming encoder is a pre-defined LabelEncoder object for decoding the emotion labels

# This function should extract LSTM features
def extract_features(model, sequence):
    intermediate_layer_model = Model(inputs=model.input, outputs=model.layers[-2].output)
    return intermediate_layer_model.predict(sequence)

# Function to classify the intensity
def classify_intensity(emoticons_count, text):
    question_marks = text.count('?')
    periods = text.count('.')
    exclamation_marks = text.count('!')

    if exclamation_marks > 1 or question_marks > 1 or emoticons_count > 1:
        return 'High'
    elif periods == 1 or question_marks == 1 or emoticons_count == 1 or exclamation_marks ==1 :
        return 'Medium'
    elif question_marks == 0 and emoticons_count == 0:
        return 'Low'
    else:
        return 'Undetermined'

def tokenize_text(text):
    tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
    return ' '.join(tokenizer.tokenize(text))

#Function to perform real-time prediction and intensity classification
def real_time_prediction(text, tokenizer, lstm_model_emotion, lstm_model_polarity, svm_classifier_emotion, svm_classifier_polarity, encoder_emotion, encoder_polarity):
    # Preprocessing steps (assuming these functions are defined elsewhere in your code)


    cleaned_text = cleaning_numbers(text)
    cleaned_tweet = clean_tweet(cleaned_text)
    corrected_text = spell_correction(cleaned_tweet)
    emoticon_converted_text, emoticons_count = convert_emoticons_to_words(corrected_text)  # Ensure this function returns emoticons_count
    cleaned_stopwords = cleaning_stopwords(emoticon_converted_text)
    cleaned_repeating_words = cleaning_repeating_words(cleaned_stopwords)

    # Now tokenize the text after cleaning repeating words
    tokenized_text = tokenize_text(cleaned_repeating_words)

    # Continue with any additional preprocessing steps that work on the tokenized text
    stemmed_text = stemming_on_text(tokenized_text)
    lemmatized_text = lemmatizer_on_text(stemmed_text)

    assert isinstance(lemmatized_text, str), "Processed text must be a string"

    # Convert the processed text to a sequence
    sequence = tokenizer.texts_to_sequences([lemmatized_text])
    padded_sequence = pad_sequences(sequence, maxlen=50)


    # Predict emotion and polarity using the LSTM model
    lstm_prediction_emotion = lstm_model_emotion.predict(padded_sequence)
    lstm_features_emotion = extract_features(lstm_model_emotion, padded_sequence)
    svm_prediction_emotion = svm_classifier_emotion.predict_proba(lstm_features_emotion)

    lstm_prediction_polarity = lstm_model_polarity.predict(padded_sequence)
    lstm_features_polarity = extract_features(lstm_model_polarity, padded_sequence)
    svm_prediction_polarity = svm_classifier_polarity.predict_proba(lstm_features_polarity)

    # Decode the predicted labels
    emotion_label = encoder_emotion.inverse_transform(np.argmax(lstm_prediction_emotion, axis=1))
    polarity_label = encoder_polarity.inverse_transform(np.argmax(svm_prediction_polarity, axis=1))

    # Get probabilities for the predicted labels
    emotion_probability = np.max(lstm_prediction_emotion, axis=1)
    polarity_probability = np.max(svm_prediction_polarity, axis=1)

    # Classify the intensity
    intensity = classify_intensity(emoticons_count, text)  # Ensure `emoticons_count` is defined

    return polarity_label, emotion_label, polarity_probability, emotion_probability, intensity

#This is the real time tweets
tweet = "I'm angry"    
# Call the real-time prediction function
polarity_label, emotion_label, polarity_probability, emotion_probability, intensity = real_time_prediction(tweet, tokenizer, lstm_model_emotion, lstm_model_polarity, svm_classifier_emotion, svm_classifier_polarity, encoder_emotion, encoder_polarity)

# Print the results
print(f"Tweet: {tweet}")
print(f"Polarity Label: {polarity_label[0]}")
print(f"Emotion Label: {emotion_label[0]}")
print(f"Intensity Level: {intensity}")

Tweet: I'm angry
Polarity Label: neutral
Emotion Label: happy
Intensity Level: Low
