In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adam

In [2]:
# Load the dataset (adjust the path if necessary)
df = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv", 
                 encoding="latin-1", header=None)  

In [3]:
# Display the first few rows
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Assign column names
df.columns = ["sentiment", "id", "date", "query", "user", "text"]

In [5]:
# Drop unnecessary columns
df = df[["sentiment", "text"]]

In [6]:
# Map sentiment labels to 0 (negative) and 1 (positive)
df["sentiment"] = df["sentiment"].map({0: 0, 4: 1})

In [7]:
# Check the dataset again
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Define stopwords
stop_words = set(stopwords.words('english'))

In [9]:
# Function to clean tweets
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#','', text)  # Remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    words = word_tokenize(text)  # Tokenize
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

In [10]:
# Apply cleaning function to the text column
df["cleaned_text"] = df["text"].apply(clean_text)

In [11]:
# Check cleaned tweets
df.head()

Unnamed: 0,sentiment,text,cleaned_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",thats bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cant update facebook texting might cry r...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cant see


In [12]:
# Hyperparameters
MAX_VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 50

In [13]:
# Tokenize the text
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")  # Out-of-vocabulary token
tokenizer.fit_on_texts(df["cleaned_text"])

In [14]:
# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(df["cleaned_text"])

In [15]:
# Apply padding to make sequences the same length
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

In [16]:
# Check output
print(f"Sample original text: {df['cleaned_text'].iloc[0]}")
print(f"Tokenized sequence: {sequences[0]}")
print(f"Padded sequence: {padded_sequences[0]}")

Sample original text: thats bummer shoulda got david carr third day
Tokenized sequence: [44, 1088, 3297, 8, 753, 9568, 1722, 4]
Padded sequence: [  44 1088 3297    8  753 9568 1722    4    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [17]:
# Convert labels to numpy array
labels = np.array(df["sentiment"])

In [18]:
# Split into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [19]:
# Check shapes
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")

Training set: (1280000, 50), (1280000,)
Validation set: (320000, 50), (320000,)


In [20]:
# Hyperparameters
EMBEDDING_DIM = 100  # Word vector size
LSTM_UNITS = 128  # LSTM cell size
DROPOUT_RATE = 0.2  # Dropout for regularization

In [21]:
# Tokenizer already fitted earlier, so get vocab size
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token
max_len = max(len(seq) for seq in X_train)  # Get max sequence length from training data

model = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=0.0005)  # Keep reduced LR for stability
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])



In [22]:
# Model summary
model.summary()

In [23]:
history = model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/15
[1m40000/40000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m834s[0m 21ms/step - accuracy: 0.5011 - loss: 0.6933 - val_accuracy: 0.4984 - val_loss: 0.6932
Epoch 2/15
[1m40000/40000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m826s[0m 21ms/step - accuracy: 0.4996 - loss: 0.6932 - val_accuracy: 0.4984 - val_loss: 0.6932
Epoch 3/15
[1m40000/40000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m826s[0m 21ms/step - accuracy: 0.4996 - loss: 0.6932 - val_accuracy: 0.5016 - val_loss: 0.6932
Epoch 4/15
[1m40000/40000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m826s[0m 21ms/step - accuracy: 0.5001 - loss: 0.6932 - val_accuracy: 0.4984 - val_loss: 0.6931
Epoch 5/15
[1m40000/40000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m827s[0m 21ms/step - accuracy: 0.4992 - loss: 0.6932 - val_accuracy: 0.5016 - val_loss: 0.6931
Epoch 6/15
[1m40000/40000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m828s[0m 21ms/step - accuracy: 0.4998 - loss: 0.6932 - val_accuracy: 0.5016 - val

In [24]:
# Predict on the validation set
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary (0 or 1)

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2ms/step


In [25]:
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.5016


In [26]:
# Detailed classification report
print("Classification Report:\n", classification_report(y_val, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00    159494
           1       0.50      1.00      0.67    160506

    accuracy                           0.50    320000
   macro avg       0.25      0.50      0.33    320000
weighted avg       0.25      0.50      0.34    320000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
