In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Input, Concatenate
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import re
import random
import pickle

In [2]:
# Read both input tables from the working directory: the labelled tweets and
# the emoji sentiment table. Point these relative paths at your own copies.
tweet_data = pd.read_csv(filepath_or_buffer=r'tweets.csv')
emoji_scores_data = pd.read_csv(filepath_or_buffer=r'Emoji-Sentiment-Data-v1.0.csv')

In [3]:
# Class selection: every row labelled exactly 'positive' or 'negative' is
# kept, while the 'neutral' class is down-sampled to 2000 rows.
# random_state=1 makes the draw repeatable; change it for a different subset.
positive_rows = tweet_data.loc[tweet_data['Response'] == 'positive']
negative_rows = tweet_data.loc[tweet_data['Response'] == 'negative']
neutral_rows = (
    tweet_data.loc[tweet_data['Response'] == 'neutral']
    .sample(n=2000, random_state=1)
)

# Stack the three subsets (the original row index is kept) and rebind tweet_data.
tweet_data = pd.concat([positive_rows, negative_rows, neutral_rows])


In [4]:
# Inspect the combined frame; a bare last expression renders the DataFrame.
tweet_data

Unnamed: 0.1,Unnamed: 0,Tweet Id,Username,Content,No of likes,No of Retweets,No of Replies,No of quoteCount,Date,Time,Response
1,1,1.340000e+18,Bulama8976,HumilityEmpathyCourageVisionResilience and Acc...,13.0,2,1,2.0,30/12/2020,23:51:09,positive
5,5,1.340000e+18,malaminuu,The Statecritical policy priority is to build ...,5.0,5,0,0.0,30/12/2020,23:43:47,positive
6,6,1.340000e+18,busuyikk,AtikuSaraki and Kwankwanso may have been forgi...,0.0,0,1,0.0,30/12/2020,23:40:53,positive
10,10,1.340000e+18,aesha_m_dawood,Thank you 🙏🏾😊,0.0,0,0,0.0,30/12/2020,23:35:47,positive
16,16,1.340000e+18,AbbanHajiya7,Atiku Support Organization Aso Aso to asorock,4.0,0,0,0.0,30/12/2020,23:18:55,positive
...,...,...,...,...,...,...,...,...,...,...,...
4441,4445,1.340000e+18,MOyewola,rydayI am a Muslim but I donbelieve in Hell fi...,0.0,0,0,0.0,30/12/2020,19:25:28,neutral
8323,8327,1.330000e+18,nabrga,Bros u can go ahead n block me na ur phone n d...,2.0,0,0,0.0,22/11/2020,21:16:43,neutral
11372,11376,1.340000e+18,BonaNaija,You must contest in 2023 presidential election...,2.0,1,0,0.0,30/12/2020,9:56:47,neutral
9340,9344,1.340000e+18,Chude,I said it when Funke did Your Excellencythat t...,3.0,0,0,0.0,30/12/2020,18:28:52,neutral


In [5]:
# Normalise label casing so later comparisons on 'Response' see lowercase text.
tweet_data = tweet_data.assign(Response=tweet_data['Response'].str.lower())

In [6]:
# Discard rows whose label is 'neutral ' (note the trailing space).
# NOTE(review): the earlier selection cell keeps only exact 'positive' /
# 'negative' / 'neutral' matches, so this filter looks like a no-op here — confirm.
tweet_data = tweet_data[tweet_data['Response'].ne('neutral ')]

In [7]:
# Show how many rows each label has after the selection and filtering above.
tweet_data['Response'].value_counts()

Response
negative    2050
neutral     2000
positive    1218
Name: count, dtype: int64

In [8]:
# Discard tweets that are missing either the text or its label, then pull out
# the text column (X) and the label column (y) for modelling.
tweet_data = tweet_data.dropna(subset=['Content', 'Response'])
X, y = tweet_data['Content'], tweet_data['Response']

In [9]:
# Map the string labels to integer class ids. The fitted encoder stays
# available as label_encoder; num_classes is the number of distinct labels.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
num_classes = label_encoder.classes_.shape[0]

In [10]:
# Expand the integer class ids into dense one-hot rows. The encoder expects a
# 2-D input, so the id vector is first turned into a single column.
onehot_encoder = OneHotEncoder(sparse_output=False)
y_onehot = onehot_encoder.fit_transform(y_encoded[:, np.newaxis])

In [11]:
# Cast every entry of the text column to str so the tokenizer below only ever
# receives strings, whatever dtype the CSV reader inferred.
X = X.astype(str)  # Convert elements to strings

In [12]:
# Tokenisation and padding: fit the vocabulary on all tweets, convert each
# tweet to a list of word ids, then pad/truncate every list to a fixed length.
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# One extra slot because the ids in word_index start at 1 (id 0 is unused by it).
vocab_size = 1 + len(tokenizer.word_index)

# Note: X is rebound here from raw text to the padded id matrix.
X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=max_sequence_length)

In [13]:
# Load the pre-trained GloVe Twitter vectors into a {token: vector} lookup.
# Update the path to wherever your GloVe file lives. Forward slashes are used
# so the same relative path resolves on Windows, Linux and macOS.
embeddings_index = {}
with open('glove.twitter.27B/glove.twitter.27B.100d.txt', encoding='utf8') as f:
    for line in f:
        # Split on the literal space separator only. A bare split() also
        # splits on other Unicode whitespace (e.g. U+0085), which would
        # swallow a token made of such a character and shift the first
        # number into the word slot.
        values = line.rstrip().split(' ')
        if len(values) < 2:
            continue  # blank or malformed line: no vector to parse
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [14]:
# Build the (vocab_size x embedding_dim) weight matrix for the Embedding layer.
# Row i holds the pre-trained vector of the word with tokenizer id i; words
# absent from embeddings_index keep an all-zero row.
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, token_id in tokenizer.word_index.items():
    if token in embeddings_index:
        embedding_matrix[token_id] = embeddings_index[token]

In [15]:
# Turn the emoji table into a lookup: emoji -> {'positive', 'neutral', 'negative'}.
# If an emoji appears on several rows, the last row wins.
# NOTE(review): confirm whether the 'Positive' / 'Neutral' / 'Negative' columns
# hold normalised scores or raw occurrence counts — they are stored unchanged.
emoji_scores = {
    record['Emoji']: {
        'positive': record['Positive'],
        'neutral': record['Neutral'],
        'negative': record['Negative'],
    }
    for _, record in emoji_scores_data.iterrows()
}

In [16]:
def calculate_emoji_scores_for_tweets(tweet_data, emoji_scores):
    """Return one average emoji 'positive' score per tweet.

    Args:
        tweet_data: Iterable of tweet strings.
        emoji_scores: Mapping from a single character to a dict that has a
            'positive' entry.

    Returns:
        A list aligned with ``tweet_data``. Each entry is the mean 'positive'
        value over the tweet's characters that are keys of ``emoji_scores``
        (a repeated character counts each time), or 0.0 when none match.
    """
    # Candidate characters: anything that is not a word character,
    # whitespace or a comma. Only those found in emoji_scores are scored.
    candidate_pattern = r'[^\w\s,]'

    def average_positive(text):
        hits = [
            emoji_scores[symbol]['positive']
            for symbol in re.findall(candidate_pattern, text)
            if symbol in emoji_scores
        ]
        if not hits:
            return 0.0
        return sum(hits) / len(hits)

    return [average_positive(text) for text in tweet_data]

In [17]:

# Two-input network: padded token ids for the text branch plus one scalar
# emoji score per tweet.
text_input = Input(shape=(max_sequence_length,))
emoji_input = Input(shape=(1,))

# Text branch: frozen embeddings initialised from embedding_matrix, followed
# by a bidirectional LSTM with 64 units per direction.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_sequence_length,
    weights=[embedding_matrix],
    trainable=False,
)(text_input)
lstm_layer = Bidirectional(
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2)
)(embedding_layer)

# Append the emoji score to the LSTM features, then classify with an
# L2-regularised softmax layer that has one output per class.
merged_layer = Concatenate()([lstm_layer, emoji_input])
output_layer = Dense(
    units=num_classes,
    activation='softmax',
    kernel_regularizer='l2',
)(merged_layer)
model = Model(inputs=[text_input, emoji_input], outputs=output_layer)



In [18]:
# Split into train, validation, and test sets while preserving class
# distribution. Without `stratify` the split is purely random, so the class
# index of each one-hot row is passed as the stratification label.
# First split: 90% train, 10% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_onehot, test_size=0.1, random_state=42,
    stratify=np.argmax(y_onehot, axis=1),
)
# Second split: the held-out part is divided 90% validation / 10% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.1, random_state=42,
    stratify=np.argmax(y_temp, axis=1),
)

In [19]:
# Extract tweet texts from X_train sequences
# The text is rebuilt from the padded id sequences through the tokenizer, so
# it is the tokenizer's view of each tweet rather than the raw 'Content' string.
# NOTE(review): presumably emojis survive tokenisation as (parts of) tokens —
# confirm, since the emoji scoring below relies on them being present.
tweet_texts_train = tokenizer.sequences_to_texts(X_train)

In [20]:
# Rebuild the validation texts from their id sequences, then compute the
# per-tweet average emoji 'positive' score for both splits.
tweet_texts_val = tokenizer.sequences_to_texts(X_val)
emoji_positive_scores_train = calculate_emoji_scores_for_tweets(
    tweet_texts_train, emoji_scores
)
emoji_positive_scores_val = calculate_emoji_scores_for_tweets(
    tweet_texts_val, emoji_scores
)

In [21]:
# Per-tweet emoji scores for the training and validation texts.
# This uses the same inputs as the previous cell, so it reproduces the same
# two lists.
emoji_positive_scores_train, emoji_positive_scores_val = (
    calculate_emoji_scores_for_tweets(texts, emoji_scores)
    for texts in (tweet_texts_train, tweet_texts_val)
)

In [22]:
# Early stopping: watch val_loss, stop after 3 epochs without improvement and
# restore the weights from the best epoch.
callbacks = [EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)]

In [23]:
# Configure training: Adam at learning rate 0.001, categorical cross-entropy
# against the one-hot targets, accuracy reported as the metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [24]:
# Fit for at most 10 epochs in mini-batches of 32. Each sample feeds the
# model two inputs (padded token ids, emoji score); the validation split is
# evaluated every epoch and drives the callbacks.
history = model.fit(
    x=[np.array(X_train), np.array(emoji_positive_scores_train)],
    y=np.array(y_train),
    batch_size=32,
    epochs=10,
    validation_data=(
        [np.array(X_val), np.array(emoji_positive_scores_val)],
        np.array(y_val),
    ),
    callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
# Persist the fitted tokenizer with pickle so the same word-to-id mapping can
# be loaded again later.
with open('tokenizer.pkl', 'wb') as fh:
    pickle.dump(obj=tokenizer, file=fh)

In [31]:
# Write the trained model to disk in the '.keras' format (not .h5).
model.save(filepath='bal_data_model.keras')

In [32]:
# Persist the emoji -> score lookup with pickle, next to the tokenizer and model.
with open('emoji_scores.pkl', 'wb') as fh:
    pickle.dump(obj=emoji_scores, file=fh)