In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding,Dropout
from tensorflow.keras.optimizers import Adam
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import dask.dataframe as dd
import kagglehub
import os

In [3]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [4]:
#!pip install dask[complete]  # Install Dask along with its required dependencies


In [None]:

# Download latest version
dataset_path = kagglehub.dataset_download("kazanova/sentiment140")
file_path = "D:/coding/Notebooks/Projects/data/training.1600000.processed.noemoticon.csv" #Change this to your path
# Find the CSV file within the downloaded directory
if os.path.isfile(file_path):
    print(f"{file_path} exists!")
    csv_file_path = file_path
else:
    print(f"{file_path} does not exist.")
    for filename in os.listdir(dataset_path):
        if filename.endswith(".csv"):
            csv_file_path = os.path.join(dataset_path, filename)
            break  # Stop after finding the first CSV file
print(f"CSV file found: {csv_file_path}")
# Now use the csv_file_path to read the data
data = pd.read_csv(csv_file_path, encoding='latin-1', header=None, names=["target", "id", "date", "meta", "user", "text"])
print("Columns in the dataset:")
print(data.columns.tolist())

D:/coding/Notebooks/Projects/data/training.1600000.processed.noemoticon.csv exists!
CSV file found: D:/coding/Notebooks/Projects/data/training.1600000.processed.noemoticon.csv
Columns in the dataset:
['target', 'id', 'date', 'meta', 'user', 'text']


In [19]:
data['sentiment'] = data['target']
data.drop(columns =["id", "date", "meta", "user"], inplace=True)

In [20]:
data = data.dropna()

In [None]:

data = dd.from_pandas(data, npartitions=16)
# Compile regex patterns once for speed
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

html_pattern = re.compile(r'<.*?>')  # HTML tags
special_pattern = re.compile(r'[^a-z\s]')  # Keep only a-z and spaces

lemmatizer = WordNetLemmatizer()

# --- Main cleaning function ---
def clean_text(text):
    text = text.lower()
    text = emoji_pattern.sub(r'', text)
    text = html_pattern.sub(r'', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = special_pattern.sub(r'', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'http[s]?://\S+', '', text)

    #tokens = text.split()
    #tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Optional lemmatization
    #return ' '.join(tokens)
    return text

# --- For batch cleaning ---
data['text'] = data['text'].map(clean_text, meta=('x', 'object'))
cleaned_data = data.compute()

# Now you can work with the cleaned DataFrame (as a Pandas DataFrame)
print(cleaned_data.head())

In [None]:
cleaned_data.to_csv('data/data.csv', index=False)

In [28]:
data = pd.read_csv('data/data.csv')

In [29]:
data['sentiment'] = data['target'].apply(lambda x: 0 if x == 0 else (1 if x == 2 else 2))

In [30]:
data.tail()

Unnamed: 0,target,text,sentiment
1599995,4,just woke up having no school is the best feel...,2
1599996,4,thewdbcom very cool to hear old walt interview...,2
1599997,4,are you ready for your mojo makeover ask me fo...,2
1599998,4,happy th birthday to my boo of alll time tupac...,2
1599999,4,happy charitytuesday thenspcc sparkscharity sp...,2


In [31]:
max_features = 5000
max_length = 200

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data["text"])
X = pad_sequences(tokenizer.texts_to_sequences(data["text"]), maxlen=max_length)
y = data['sentiment'].values


In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)


In [45]:
num_classes = 3  # Update this based on your use case

model = Sequential([
    Embedding(input_dim=max_features, output_dim=16),
    SimpleRNN(64, activation='tanh', return_sequences=True),  # Change return_sequences to True
    SimpleRNN(128, activation='tanh', return_sequences=True),
    SimpleRNN(64, activation='tanh', return_sequences=False),  # The last RNN layer does not return sequences
    Dense(num_classes, activation='softmax')  # Use softmax for multi-class classification
])

model.compile(
    loss='sparse_categorical_crossentropy', # Use categorical_crossentropy for multi-class classification
    optimizer=Adam(learning_rate = 0.001),
    metrics=['accuracy']
)



In [49]:
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=500,
    validation_data=(X_val, y_val),
    verbose=1
)

score = model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {score[1]:.2f}")



Epoch 1/50
[1m2304/2304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1809s[0m 785ms/step - accuracy: 0.8469 - loss: 0.3464 - val_accuracy: 0.8042 - val_loss: 0.4441
Epoch 2/50
[1m2304/2304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m635s[0m 275ms/step - accuracy: 0.8462 - loss: 0.3470 - val_accuracy: 0.8012 - val_loss: 0.4419
Epoch 3/50
[1m2304/2304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m633s[0m 275ms/step - accuracy: 0.8482 - loss: 0.3445 - val_accuracy: 0.8038 - val_loss: 0.4434
Epoch 4/50
[1m2304/2304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m629s[0m 273ms/step - accuracy: 0.8482 - loss: 0.3435 - val_accuracy: 0.7991 - val_loss: 0.4568
Epoch 5/50
[1m2304/2304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1046s[0m 454ms/step - accuracy: 0.8472 - loss: 0.3458 - val_accuracy: 0.8037 - val_loss: 0.4454
Epoch 6/50
[1m2304/2304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1340s[0m 582ms/step - accuracy: 0.8478 - loss: 0.3442 - val_accuracy: 0.8016 - val_lo

In [50]:
# Save the whole model (architecture + weights + optimizer state)
model.save('models/model_SentRnn.keras')

# To load it back:
model = load_model('models/model_SentRnn.keras')


In [52]:
# Define class labels according to your training
class_labels = ['Negative', 'Neutral', 'Positive']

def predict_sentiment(review_text):
    text = clean_text(review_text)  # Clean the input text

    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_length)

    prediction = model.predict(padded)[0]  # Now returns an array of probabilities
    predicted_class = prediction.argmax()  # Get the index of the highest probability
    confidence = prediction[predicted_class]  # Get the confidence of the predicted class

    return f"{class_labels[predicted_class]} (Probability: {confidence:.2f})"

# Example usage
sample_review = "The food was great."
print(f"Review: {sample_review}")
print(f"Sentiment: {predict_sentiment(sample_review)}")



Review: The food was great.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Sentiment: Positive (Probability: 0.97)
