In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding


In [2]:
# Load the tweet dataset; the CSV is decoded as latin-1.
data = pd.read_csv('test.csv', encoding='latin-1')

# Show which columns are available before any cleaning is applied.
column_names = list(data.columns)
print("Columns in the dataset:")
print(column_names)


Columns in the dataset:
['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']


In [3]:
# Keep only rows that have both a tweet and a label. This has to happen BEFORE
# the label mapping below: a missing sentiment (NaN) is not equal to
# 'negative', so mapping first would silently turn it into class 1 and the row
# would then survive a later dropna(). Restricting the check to the two columns
# the model uses also avoids discarding rows over missing metadata.
data = data.dropna(subset=['text', 'sentiment']).copy()

# Normalise the text: lower-case it, then remove every character that is not a
# lowercase letter, a digit or whitespace.
data['text'] = (
    data['text']
    .astype(str)
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
)

# Binary target: 0 for 'negative', 1 for every other label.
# NOTE(review): any label other than 'negative' is folded into class 1 --
# confirm that this is the intended definition of the positive class.
data['sentiment'] = (data['sentiment'] != 'negative').astype(int)

In [5]:
# Class balance check: number of rows per binary sentiment label.
label_counts = data['sentiment'].value_counts()
label_counts

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,2533
0,1001


In [6]:
maxf = 5000  # passed to the tokenizer as num_words (vocabulary cap)
maxl = 200   # passed to pad_sequences as maxlen (length of every sequence)

# Learn the word index from the cleaned tweets.
tokenizer = Tokenizer(num_words=maxf)
tokenizer.fit_on_texts(data['text'])

# Features: integer token sequences padded/truncated to maxl entries.
token_ids = tokenizer.texts_to_sequences(data['text'])
x = pad_sequences(token_ids, maxlen=maxl)

# Targets: the binary sentiment column as a NumPy array.
y = data['sentiment'].to_numpy()

In [8]:
# Step 1: hold out 20% of the data as the test set (stratified on the label).
x_trainval, x_test, y_trainval, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: take 10% of the remaining rows as the validation set, again
# stratified so each split keeps the same label proportions.
x_train, x_val, y_train, y_val = train_test_split(
    x_trainval, y_trainval,
    test_size=0.1,
    random_state=42,
    stratify=y_trainval,
)

In [9]:
# Binary sentiment classifier:
#   token ids -> 16-dimensional embeddings -> SimpleRNN(64) -> sigmoid probability.
#
# The Embedding layer deliberately takes no `input_length`. It was previously
# set to maxf, which is the vocabulary size, whereas the sequences fed to the
# model are padded to maxl tokens -- so the declared input shape did not match
# the data. The argument is also deprecated in Keras 3; the sequence length is
# inferred from the first batch instead.
model = Sequential([
    Embedding(input_dim=maxf, output_dim=16),
    SimpleRNN(64, activation='tanh', return_sequences=False),  # keep only the final state
    Dense(1, activation='sigmoid'),
])

# Single sigmoid output + 0/1 targets -> binary cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)



In [11]:
# Training settings collected in one place so they are easy to tweak.
fit_options = dict(
    epochs=5,
    batch_size=32,
    validation_data=(x_val, y_val),
    verbose=1,
)
history = model.fit(x_train, y_train, **fit_options)

# Evaluate on the held-out test split; index 1 of the result is the accuracy
# metric the model was compiled with (index 0 is the loss).
score = model.evaluate(x_test, y_test, verbose=0)
print(f"Test accuracy: {score[1]:.2f}")


Epoch 1/5
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.9624 - loss: 0.1322 - val_accuracy: 0.3781 - val_loss: 0.9481
Epoch 2/5
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.8309 - loss: 0.3576 - val_accuracy: 0.7491 - val_loss: 0.5316
Epoch 3/5
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.9569 - loss: 0.1861 - val_accuracy: 0.7314 - val_loss: 0.5572
Epoch 4/5
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.9834 - loss: 0.1025 - val_accuracy: 0.7314 - val_loss: 0.6034
Epoch 5/5
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9875 - loss: 0.0592 - val_accuracy: 0.7314 - val_loss: 0.6648
Test accuracy: 0.73


In [15]:
def predict_sentiment(text):
    """Score a raw string with the trained model and describe the result.

    The text is lower-cased and stripped of every character that is not a
    lowercase letter, digit or whitespace, then tokenised with the fitted
    ``tokenizer`` and padded to ``maxl`` before being passed to ``model``.

    Args:
        text: The sentence to classify.

    Returns:
        A string of the form ``"Positive (Probability: 0.82)"``; the label is
        "Positive" when the model output is at least 0.5, else "Negative".
    """
    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())

    # Wrap in a list: the tokenizer and the model both work on batches.
    model_input = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=maxl)

    # First (only) sample, first (only) output unit.
    prediction = model.predict(model_input)[0][0]
    return f"{'Positive' if prediction >= 0.5 else 'Negative'} (Probability: {prediction:.2f})"

# Sanity check on a hand-written example; add more sentences to the list to
# try several inputs at once.
sample_texts = ["The food was great."]
for text in sample_texts:
    print(f"text: {text}")
    print(f"Sentiment: {predict_sentiment(text)}")


text: The food was great.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
Sentiment: Positive (Probability: 0.82)
