In [25]:
import pandas as pd

## Data Cleaning
### training data

In [26]:
df = pd.read_csv("/content/twitter_training.csv")

In [27]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [28]:
df.columns = ['A', 'B','status', 'review']

In [29]:
df.head()

Unnamed: 0,A,B,status,review
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [30]:
df = df.drop(columns=df[['A','B']], axis=1)

In [31]:
df.head()

Unnamed: 0,status,review
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [32]:
df['status'] = df['status'].map({
    'Negative': 0,
    'Positive': 1,
    'Neutral': 2,
    'Irrelevant': 3
})

In [33]:
df.head()

Unnamed: 0,status,review
0,1,I am coming to the borders and I will kill you...
1,1,im getting on borderlands and i will kill you ...
2,1,im coming on borderlands and i will murder you...
3,1,im getting on borderlands 2 and i will murder ...
4,1,im getting into borderlands and i can murder y...


In [34]:
df['status'].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
0,22542
1,20831
2,18318
3,12990


In [36]:
df.isnull().sum()

Unnamed: 0,0
status,0
review,686


In [37]:
df = df.dropna()

In [38]:
df.isnull().sum()

Unnamed: 0,0
status,0
review,0


### For Validation data


In [35]:
val = pd.read_csv("/content/twitter_validation.csv")

In [39]:
val.columns = ['A', 'B','status', 'review']
val = val.drop(columns=val[['A','B']], axis=1)
val = val.dropna()
val['status'] = val['status'].map({
    'Negative': 0,
    'Positive': 1,
    'Neutral': 2,
    'Irrelevant': 3
})


In [40]:
val.head()

Unnamed: 0,status,review
0,2,BBC News - Amazon boss Jeff Bezos rejects clai...
1,0,@Microsoft Why do I pay for WORD when it funct...
2,0,"CSGO matchmaking is so full of closet hacking,..."
3,2,Now the President is slapping Americans in the...
4,0,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [41]:
val.isnull().sum()

Unnamed: 0,0
status,0
review,0


In [42]:
x_train = df['review']
y_train = df['status']
x_test = val['review']
y_test = val['status']

In [43]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import re


In [45]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl.metadata (5.4 kB)
Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/431.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m430.1/431.4 kB[0m [31m14.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.12.1


In [46]:
import emoji

# Clean the text and remove emojis
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', emoji.replace_emoji(text, replace=''))
    text = text.lower()
    return text


In [47]:
# Apply the cleaning function
x_train = x_train.apply(clean_text)
x_test = x_test.apply(clean_text)


In [48]:
# Convert labels to categorical (for multi-class classification)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=4)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=4)

In [49]:
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(x_train)

In [50]:
import pickle

In [51]:
# Save the tokenizer to a file
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [52]:

# Convert the text data to sequences
X_train_seq = tokenizer.texts_to_sequences(x_train)
X_test_seq = tokenizer.texts_to_sequences(x_test)

# Pad the sequences to ensure uniform input size
max_length = max(len(x) for x in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

## Model Architecture

In [53]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [54]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length))
model.add(SimpleRNN(128, return_sequences=False))
model.add(Dense(4, activation='softmax'))  # 4 output units for the 4 classes

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




#####  Handle Imbalanced Data (Optional)

In [55]:
from sklearn.utils.class_weight import compute_class_weight

In [56]:

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(df['status']), y=df['status'])
class_weights = dict(enumerate(class_weights))


In [59]:
with tf.device('/GPU:0'):
    model.fit(X_train_pad, np.array(y_train), epochs=5, batch_size=2, validation_split=0.2, class_weight=class_weights)


Epoch 1/5
[1m29598/29598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m601s[0m 20ms/step - accuracy: 0.2459 - loss: 1.4327 - val_accuracy: 0.1391 - val_loss: 1.4222
Epoch 2/5
[1m29598/29598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 22ms/step - accuracy: 0.2497 - loss: 1.4311 - val_accuracy: 0.2356 - val_loss: 1.4117
Epoch 3/5
[1m29598/29598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m681s[0m 21ms/step - accuracy: 0.2430 - loss: 1.4340 - val_accuracy: 0.3293 - val_loss: 1.3613
Epoch 4/5
[1m29598/29598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m685s[0m 22ms/step - accuracy: 0.2522 - loss: 1.4274 - val_accuracy: 0.2256 - val_loss: 1.4939
Epoch 5/5
[1m29598/29598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m683s[0m 22ms/step - accuracy: 0.2465 - loss: 1.4326 - val_accuracy: 0.2470 - val_loss: 1.4272


In [60]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, np.array(y_test))
print(f'Test Loss: {loss:.3f}, Test Accuracy: {accuracy:.3f}')

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - accuracy: 0.2934 - loss: 1.4108
Test Loss: 1.420, Test Accuracy: 0.281


In [61]:
# Save the model
model.save('sentiment_rnn_model.h5')



## Make prediction

In [62]:
def predict_sentiment(model, tokenizer, sentence, max_length):
    sentence = clean_text(sentence)
    seq = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    prediction = model.predict(padded)
    predicted_class = np.argmax(prediction)

    label_map = {0: 'Negative', 1: 'Positive', 2: 'Neutral', 3: 'Irrelevant'}
    return label_map[predicted_class]

sample_sentence = "I'm getting on borderlands and I will kill you 😊"
print(predict_sentiment(model, tokenizer, sample_sentence, max_length))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 319ms/step
Positive
