In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
# Load the dataset (adjust the path if necessary)
df = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv", 
                 encoding="latin-1", header=None)  

In [3]:
# Display the first few rows
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Assign column names
df.columns = ["sentiment", "id", "date", "query", "user", "text"]

In [5]:
# Drop unnecessary columns
df = df[["sentiment", "text"]]

In [6]:
# Map sentiment labels to 0 (negative) and 1 (positive)
df["sentiment"] = df["sentiment"].map({0: 0, 4: 1})

In [7]:
# Check the dataset again
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Define stopwords
stop_words = set(stopwords.words('english'))

In [9]:
# Function to clean tweets
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#','', text)  # Remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    words = word_tokenize(text)  # Tokenize
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

In [10]:
# Apply cleaning function to the text column
df["cleaned_text"] = df["text"].apply(clean_text)

In [11]:
# Check cleaned tweets
df.head()

Unnamed: 0,sentiment,text,cleaned_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",thats bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cant update facebook texting might cry r...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cant see


In [12]:
# Hyperparameters
MAX_VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 50

In [13]:
# Tokenize the text
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")  # Out-of-vocabulary token
tokenizer.fit_on_texts(df["cleaned_text"])

In [14]:
# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(df["cleaned_text"])

In [15]:
# Apply padding to make sequences the same length
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

In [16]:
# Check output
print(f"Sample original text: {df['cleaned_text'].iloc[0]}")
print(f"Tokenized sequence: {sequences[0]}")
print(f"Padded sequence: {padded_sequences[0]}")

Sample original text: thats bummer shoulda got david carr third day
Tokenized sequence: [44, 1088, 3297, 8, 753, 9568, 1722, 4]
Padded sequence: [  44 1088 3297    8  753 9568 1722    4    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [17]:
# Convert labels to numpy array
labels = np.array(df["sentiment"])

In [18]:
# Split into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [19]:
# Check shapes
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")

Training set: (1280000, 50), (1280000,)
Validation set: (320000, 50), (320000,)


In [20]:
# Hyperparameters
EMBEDDING_DIM = 100  # Word vector size
LSTM_UNITS = 128  # LSTM cell size
DROPOUT_RATE = 0.2  # Dropout for regularization

In [21]:
# Define the model
model = Sequential([
    Embedding(input_dim=20000, output_dim=EMBEDDING_DIM, input_length=50),  # Word embeddings
    LSTM(LSTM_UNITS, return_sequences=False),  # LSTM layer
    Dropout(DROPOUT_RATE),  # Dropout for preventing overfitting
    Dense(1, activation="sigmoid")  # Output layer (sigmoid for binary classification)
])

# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])



In [22]:
# Model summary
model.summary()

In [23]:
# Hyperparameters
BATCH_SIZE = 64
EPOCHS = 10

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1
)

Epoch 1/10
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 6ms/step - accuracy: 0.6749 - loss: 0.5562 - val_accuracy: 0.7942 - val_loss: 0.4397
Epoch 2/10
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 6ms/step - accuracy: 0.8029 - loss: 0.4241 - val_accuracy: 0.7970 - val_loss: 0.4345
Epoch 3/10
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 6ms/step - accuracy: 0.8152 - loss: 0.4016 - val_accuracy: 0.7977 - val_loss: 0.4341
Epoch 4/10
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 6ms/step - accuracy: 0.8276 - loss: 0.3801 - val_accuracy: 0.7963 - val_loss: 0.4411
Epoch 5/10
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 6ms/step - accuracy: 0.8393 - loss: 0.3586 - val_accuracy: 0.7945 - val_loss: 0.4496
Epoch 6/10
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 6ms/step - accuracy: 0.8520 - loss: 0.3347 - val_accuracy: 0.7897 - val_loss: