In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

In [2]:
# Load the labelled training tweets from the CSV in the working directory.
train_csv_path = 'tweets_train.csv'
train_df = pd.read_csv(train_csv_path)

In [3]:
# Load the test tweets from the CSV in the working directory.
test_csv_path = 'tweets_test.csv'
test_df = pd.read_csv(test_csv_path)

In [9]:
# Display (rows, columns) of the training frame as a quick sanity check.
train_df.shape

(17000, 14)

In [7]:
# Preview the first two rows to see which columns are available.
train_df.head(2)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,hashtags,source,is_retweet,clean_text,Sentiment
0,Romy 👑,Bolton - England,"Tables turn, bridges burn, you live and learn.",2009-06-15 09:00:39,525,896,3854,False,2020-09-19 15:19:32,['TheSocialDilemma'],Twitter for Android,False,TheSocialDilemma is an eye opener isn t it ple...,Neutral
1,TLynn Peterson,"Black Canyon City, Arizona",Acquired disability ♿ after an accident. Livin...,2013-05-29 00:17:46,5045,5374,48152,False,2020-09-13 00:31:46,['TheSocialDilemma'],Twitter for Android,False,TheSocialDilemma If we don t agree on what is ...,Positive


In [11]:
# Count missing values per column to decide which columns need cleaning.
train_df.isnull().sum()

user_name              1
user_location       3607
user_description    1162
user_created           0
user_followers         0
user_friends           0
user_favourites        0
user_verified          0
date                   0
hashtags            3649
source                 0
is_retweet             0
clean_text             8
Sentiment              0
dtype: int64

#### Handle NaN values in the 'clean_text' column

In [12]:
# Replace missing tweet text with an empty string so every row is a str
# before tokenization. The result is assigned back to the column instead of
# calling fillna(inplace=True) on the selected Series: that chained in-place
# form operates on an intermediate object, raises a FutureWarning in
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
train_df['clean_text'] = train_df['clean_text'].fillna('')

In [22]:
# Replace missing tweet text in the test set with an empty string.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# operates on an intermediate object, raises a FutureWarning in pandas 2.x
# and leaves the DataFrame unchanged under Copy-on-Write.
test_df['clean_text'] = test_df['clean_text'].fillna('')

#### Data Preprocessing

In [13]:
# Build the vocabulary from the training tweets only, then encode each tweet
# as a sequence of integer word ids.
tokenizer = Tokenizer()
train_texts = train_df['clean_text']
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)

# Bring every encoded tweet to a fixed length of 50 ids.
X = pad_sequences(train_sequences, maxlen=50)

#### Convert sentiment labels to numerical values

In [15]:
# Encode the sentiment strings as integer category codes. The categories are
# inferred from the values present in the column, and the column is
# overwritten with the resulting codes.
sentiment_categorical = pd.Categorical(train_df['Sentiment'])
train_df['Sentiment'] = sentiment_categorical.codes

In [16]:
# Target vector: the (now integer-encoded) Sentiment column.
y = train_df['Sentiment']

#### Split the data into training and validation sets

In [17]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Build the RNN Model

In [18]:
# Seed Python, NumPy and the Keras backend before any layer is created so
# that weight initialisation is repeatable from run to run. The value matches
# the random_state used for the train/validation split. (Some GPU kernels may
# still introduce small run-to-run differences.)
set_random_seed(42)

# +1 because the Tokenizer's word indices start at 1; id 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50

model = Sequential([
    # Map each word id to a trainable embedding_dim-sized vector;
    # input_length matches the maxlen used when padding the sequences.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=50),
    # Single recurrent layer that summarises the whole tweet into 64 features.
    SimpleRNN(units=64, activation='relu'),
    # One softmax output per sentiment class.
    Dense(units=3, activation='softmax')
])

#### Compile the Model

In [19]:
# The targets are integer class ids, hence the sparse variant of categorical
# cross-entropy; accuracy is tracked alongside the loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

#### Train the Model

In [20]:
# Train for 10 epochs in mini-batches of 32, scoring the held-out validation
# split after every epoch. The returned object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Evaluate the Model

In [21]:
# evaluate() returns the loss followed by the compiled metrics; with the
# single 'accuracy' metric configured that is [loss, accuracy].
val_loss, accuracy = model.evaluate(X_val, y_val)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.8947058916091919


#### Make Predictions on Test Data

In [23]:
# Encode the test tweets with the tokenizer fitted on the training data and
# pad them to the same length (50) the model was trained on.
test_sequences = tokenizer.texts_to_sequences(test_df['clean_text'])
X_test = pad_sequences(test_sequences, maxlen=50)

# One row of class probabilities per test tweet.
predictions_test = model.predict(X_test)



In [24]:
# Inspect the raw model output: one row per test tweet, one column per class.
predictions_test

array([[4.0645502e-04, 9.9920207e-01, 3.9148142e-04],
       [1.5633501e-04, 9.9975628e-01, 8.7328612e-05],
       [1.1103560e-01, 1.3936437e-02, 8.7502795e-01],
       ...,
       [1.5195258e-02, 9.4491333e-01, 3.9891444e-02],
       [1.3463365e-06, 3.5174850e-07, 9.9999833e-01],
       [3.1678092e-02, 1.5011103e-01, 8.1821090e-01]], dtype=float32)

#### Convert predicted probabilities to class indices

In [25]:
# For every test tweet, take the index of the class with the highest
# predicted probability. A single vectorized argmax along the class axis
# replaces the per-row Python loop; .tolist() keeps the result a plain list.
predicted_labels = np.argmax(predictions_test, axis=1).tolist()

#### Convert numerical labels back to original sentiment labels

In [26]:
# Label names listed in code order (0, 1, 2).
# NOTE(review): this ordering must match the codes assigned when the training
# labels were encoded — confirm if the label set ever changes.
sentiment_names = ['Negative', 'Neutral', 'Positive']
predicted_sentiments = pd.Categorical.from_codes(
    predicted_labels,
    categories=sentiment_names,
)

#### Add predicted sentiments to the test_df DataFrame

In [28]:
# Attach the predicted label to each test row as a new column.
test_df['Predicted_Sentiment'] = predicted_sentiments

In [30]:
# Show each test tweet next to the sentiment predicted for it.
result_columns = ['clean_text', 'Predicted_Sentiment']
print(test_df[result_columns])

                                             clean_text Predicted_Sentiment
0              Watch theSocialDilemma then joinMastodon             Neutral
1     With the 2019 2020 NBA season officially over ...             Neutral
2     if you want to really know about people go tal...            Positive
3          Have you watched TheSocialDilemma on Netflix             Neutral
4                  Highly recommend TheSocialDilemma on            Positive
...                                                 ...                 ...
3063            Everybody should watch TheSocialDilemma             Neutral
3064            Enjoyed TheSocialDilemma Then read this            Negative
3065  Couldn t stop thinking about docu TheSocialDil...             Neutral
3066    This was the best one by on TheSocialDilemma on            Positive
3067                    TheSocialDilemma on Netflix wow            Positive

[3068 rows x 2 columns]
