Imports

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from keras.layers import LayerNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

Load the dataset

In [39]:
# The CSV is read with header=None, so the column names are supplied explicitly.
df = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='ISO-8859-1',
)

df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [40]:
# Count rows per raw label value to check the class balance before remapping.
df['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [41]:
# Keep only the tweet text and its label. `.copy()` makes `df` an independent
# frame, so the column assignment below does not write into a slice of the
# previously loaded frame (which raises pandas' SettingWithCopyWarning).
df = df[['text', 'target']].copy()
# The raw labels are 0 and 4; remap 4 -> 1 to obtain a binary 0/1 target.
df['target'] = df['target'].replace(4, 1)
df['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

Group the records by class (all 800,000 records of each class are kept; no subsampling is applied)

In [42]:
# Select each class separately, then stack negatives ahead of positives.
negative_samples = df.loc[df['target'].eq(0)]
positive_samples = df.loc[df['target'].eq(1)]

df = pd.concat([negative_samples, positive_samples])

df['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [43]:
# Summarise column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1600000 non-null  object
 1   target  1600000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 36.6+ MB


In [44]:
# Count missing values per column.
df.isnull().sum()

text      0
target    0
dtype: int64

In [45]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus; clean_text() reads the English list from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
# Patterns are compiled once instead of being looked up on every call.
_MENTION_RE = re.compile(r'@\w+')
# `http\S+` already covers https URLs, so no separate https alternative is needed.
_URL_RE = re.compile(r'http\S+|www\S+')
_PUNCT_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Filled lazily on the first call that relies on the default stop-word list.
_DEFAULT_STOP_WORDS = None


def clean_text(text, stop_words=None):
    """Normalise a tweet for the models in this notebook.

    Steps, in order: lowercase, strip @mentions, strip links, strip every
    character that is neither a word character nor whitespace, collapse
    whitespace, and drop stop words.

    Args:
        text: Raw text to clean.
        stop_words: Optional container of lowercase words to drop. When
            omitted, the NLTK English stop-word list is used. It is loaded
            once and cached, rather than being rebuilt from the corpus on
            every call, which matters when this is applied to every row.

    Returns:
        The cleaned text, with tokens separated by single spaces.
    """
    global _DEFAULT_STOP_WORDS
    if stop_words is None:
        if _DEFAULT_STOP_WORDS is None:
            _DEFAULT_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _DEFAULT_STOP_WORDS

    text = text.lower()
    text = _MENTION_RE.sub('', text)  # Remove @ symbol and the usernames
    text = _URL_RE.sub('', text)  # Remove links
    # Punctuation is deleted, not replaced by a space, so "can't" becomes "cant".
    text = _PUNCT_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()  # Remove extra spaces
    return " ".join(word for word in text.split() if word not in stop_words)

In [47]:
# Run clean_text over every tweet, overwriting the text column, then preview it.
df['text'] = df['text'].map(clean_text)

df['text'].head()

0        thats bummer shoulda got david carr third day
1    upset cant update facebook texting might cry r...
2    dived many times ball managed save 50 rest go ...
3                     whole body feels itchy like fire
4                             behaving im mad cant see
Name: text, dtype: object

Split data into train and test

In [48]:
# 70% of the rows go to training; the remaining 30% is halved into
# validation and test sets (15% each). Both splits use the same seed.
X = df['text']
y = df['target']
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

TF-IDF vectorization for text feature extraction

In [51]:
# The vectorizer is fitted on the training split only and then reused for the
# validation and test splits. Each result is densified with .toarray().
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(split).toarray() for split in (X_val, X_test)
)

Train the model

In [13]:
# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line keeps the estimator displayed as cell output.
gnb = GaussianNB().fit(X_train_tfidf, y_train)
gnb

Test the model

In [14]:
# Score the Naive Bayes model on the held-out test features.
y_pred = gnb.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.7029


In [15]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
gnb_report = classification_report(y_test, y_pred)
print("Classification Report (GNB):")
print(gnb_report)

Classification Report (GNB):
              precision    recall  f1-score   support

           0       0.74      0.63      0.68    119907
           1       0.68      0.78      0.72    120093

    accuracy                           0.70    240000
   macro avg       0.71      0.70      0.70    240000
weighted avg       0.71      0.70      0.70    240000



Test the model with external data

In [16]:
# Raw text to classify.
review = ["Loved This Movie !!!!"]
# Run the same cleaning the training tweets went through before vectorizing;
# otherwise the model is fed text preprocessed differently from its training data.
review_clean = [clean_text(text) for text in review]
review_tfidf = vectorizer.transform(review_clean).toarray()
review_pred = gnb.predict(review_tfidf)

# Label 1 is the positive class, 0 the negative class.
if review_pred[0] == 1:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")


Sentiment: Positive


In [17]:
# Raw text to classify.
review = ["The service was terrible, and I’ll never use it again."]
# Run the same cleaning the training tweets went through before vectorizing;
# otherwise the model is fed text preprocessed differently from its training data.
review_clean = [clean_text(text) for text in review]
review_tfidf = vectorizer.transform(review_clean).toarray()
review_pred = gnb.predict(review_tfidf)

# Label 1 is the positive class, 0 the negative class.
if review_pred[0] == 1:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")


Sentiment: Negative


In [52]:
# Word-level tokenizer fitted on the training split only. num_words=5000 bounds
# the word indices it emits, and unknown words map to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_val, X_test)
)

In [53]:
# X_train_seq already holds tokenizer.texts_to_sequences(X_train), so reuse it
# instead of tokenizing the whole training split a second time.
text_lengths = [len(seq) for seq in X_train_seq]

# Pad/truncate everything to the longest training sequence.
max_len = max(text_lengths)
print(f"Max Length chosen: {max_len}")

Max Length chosen: 35


In [20]:
# All three splits are padded and truncated at the end to the same fixed length.
pad_kwargs = dict(maxlen=max_len, padding='post', truncating='post')
X_train_padded = pad_sequences(X_train_seq, **pad_kwargs)
X_val_padded = pad_sequences(X_val_seq, **pad_kwargs)
X_test_padded = pad_sequences(X_test_seq, **pad_kwargs)

In [21]:
# Two stacked LSTM layers over learned word embeddings, ending in a single
# sigmoid unit for binary sentiment.
lstm_model = Sequential([
    # input_dim matches the tokenizer's num_words (5000). The deprecated
    # `input_length` argument is dropped: the sequence length is inferred
    # from the padded input on the first call.
    # Padding zeros are not masked (mask_zero is left at its default).
    Embedding(input_dim=5000, output_dim=128),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(128, return_sequences=True),
    LayerNormalization(),
    Dropout(0.4),
    LSTM(64),
    LayerNormalization(),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])



In [22]:
# Callbacks: both watch validation accuracy.
# Stop after 3 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    restore_best_weights=True,
    patience=3,
)

# Write the model to disk only when validation accuracy improves.
model_checkpoint = ModelCheckpoint(
    filepath='best_lstm_model.keras',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [23]:
# Binary cross-entropy pairs with the single sigmoid output unit.
lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [24]:
# Train for up to 20 epochs; the callbacks end training early and save the
# best checkpoint based on validation accuracy.
history = lstm_model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=64,
    epochs=20,
    validation_data=(X_val_padded, y_val),
    callbacks=[early_stopping, model_checkpoint],
)

Epoch 1/20
[1m17498/17500[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.7454 - loss: 0.5090
Epoch 1: val_accuracy improved from -inf to 0.78473, saving model to best_lstm_model.keras
[1m17500/17500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 9ms/step - accuracy: 0.7454 - loss: 0.5090 - val_accuracy: 0.7847 - val_loss: 0.4533
Epoch 2/20
[1m17494/17500[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.7880 - loss: 0.4475
Epoch 2: val_accuracy improved from 0.78473 to 0.78957, saving model to best_lstm_model.keras
[1m17500/17500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 9ms/step - accuracy: 0.7880 - loss: 0.4475 - val_accuracy: 0.7896 - val_loss: 0.4449
Epoch 3/20
[1m17497/17500[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.7961 - loss: 0.4335
Epoch 3: val_accuracy improved from 0.78957 to 0.79041, saving model to best_lstm_model.keras
[1m17500/17500[0m [32m━━━━━━━━━━

In [25]:
# Evaluate LSTM: threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
lstm_probs = lstm_model.predict(X_test_padded)
y_pred_lstm = (lstm_probs > 0.5).astype('int32')
accuracy_lstm = accuracy_score(y_true=y_test, y_pred=y_pred_lstm)
print(f"LSTM Test Accuracy: {accuracy_lstm:.4f}")

[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step
LSTM Test Accuracy: 0.7899


In [26]:
# Per-class precision / recall / F1 for the LSTM predictions.
lstm_report = classification_report(y_test, y_pred_lstm)
print("Classification Report (LSTM):")
print(lstm_report)

Classification Report (LSTM):
              precision    recall  f1-score   support

           0       0.78      0.81      0.79    119907
           1       0.80      0.77      0.79    120093

    accuracy                           0.79    240000
   macro avg       0.79      0.79      0.79    240000
weighted avg       0.79      0.79      0.79    240000



In [27]:
# Input review
review = ["Loved This Movie !!!!"]

# Preprocess the review the same way as the training tweets (clean_text)
# before tokenizing, so the model sees the kind of input it was trained on.
review_clean = [clean_text(text) for text in review]
review_seq = tokenizer.texts_to_sequences(review_clean)  # Tokenize the review
review_padded = pad_sequences(review_seq, maxlen=max_len, padding='post', truncating='post')  # Pad the sequence

# Predict sentiment using the LSTM model. The output has one row per review
# and one column (the sigmoid probability of the positive class).
review_pred = lstm_model.predict(review_padded)

# Interpret the result: index the scalar probability explicitly instead of
# comparing a one-element array.
if review_pred[0, 0] > 0.5:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step
Sentiment: Positive


In [28]:
# Input review
review = ["I absolutely love this place; it’s the best experience I’ve ever had!" ]

# Preprocess the review the same way as the training tweets (clean_text)
# before tokenizing, so the model sees the kind of input it was trained on.
review_clean = [clean_text(text) for text in review]
review_seq = tokenizer.texts_to_sequences(review_clean)  # Tokenize the review
review_padded = pad_sequences(review_seq, maxlen=max_len, padding='post', truncating='post')  # Pad the sequence

# Predict sentiment using the LSTM model. The output has one row per review
# and one column (the sigmoid probability of the positive class).
review_pred = lstm_model.predict(review_padded)

# Interpret the result: index the scalar probability explicitly instead of
# comparing a one-element array.
if review_pred[0, 0] > 0.5:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Sentiment: Positive


In [54]:
# Calculate text lengths from the word-level sequences built earlier
# (X_train_seq) rather than re-tokenizing the whole training split.
# NOTE(review): these are Keras word-token counts; a sub-word tokenizer may
# produce more tokens per text, so this length may truncate more than 5% there.
text_lengths = [len(seq) for seq in X_train_seq]

# Choose a max length based on a percentile
max_len = int(np.percentile(text_lengths, 95))  # Covers 95% of the data
print(f"Max Length chosen: {max_len}")

Max Length chosen: 15


In [57]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch

# Initialize tokenizer
# NOTE: this rebinds `tokenizer`, which until now held the Keras Tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

max_len = 15

# Encode straight to PyTorch tensors: the model below is a PyTorch model, so
# building TensorFlow tensors first only adds a framework round-trip. Torch CPU
# tensors also expose .numpy(), so code that calls it on the encodings still works.
# padding='max_length' makes every split exactly max_len wide, independent of
# the longest text it happens to contain.
encode_kwargs = dict(
    truncation=True,
    padding='max_length',
    max_length=max_len,
    return_tensors="pt",
)

# Tokenize the training, validation and test data
train_encodings = tokenizer(list(X_train), **encode_kwargs)
val_encodings = tokenizer(list(X_val), **encode_kwargs)
test_encodings = tokenizer(list(X_test), **encode_kwargs)

In [58]:
# Drop the shuffled row labels left over from train_test_split so that
# positions and index agree.
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Convert the labels to tensor format. Going through .to_numpy() hands torch a
# contiguous array instead of making it walk the Series element by element;
# dtype is pinned to int64, which the classification loss expects for targets.
train_labels = torch.tensor(y_train.to_numpy(), dtype=torch.long)
val_labels = torch.tensor(y_val.to_numpy(), dtype=torch.long)
test_labels = torch.tensor(y_test.to_numpy(), dtype=torch.long)

In [59]:
# Rebuild each encoding field as a PyTorch tensor from its NumPy representation.
def _field_as_torch(encodings, field):
    """Return `encodings[field]` copied into a new torch tensor."""
    return torch.tensor(encodings[field].numpy())


train_input_ids = _field_as_torch(train_encodings, 'input_ids')
train_attention_mask = _field_as_torch(train_encodings, 'attention_mask')
val_input_ids = _field_as_torch(val_encodings, 'input_ids')
val_attention_mask = _field_as_torch(val_encodings, 'attention_mask')
test_input_ids = _field_as_torch(test_encodings, 'input_ids')
test_attention_mask = _field_as_torch(test_encodings, 'attention_mask')

# Bundle (input_ids, attention_mask, labels) per split so a DataLoader can
# batch them together.
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)


In [60]:
# Batch every split at 128 examples; only the training loader shuffles.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=128)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=128)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=128)

In [61]:
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

# Initialize the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Move the model to the first GPU (cuda:0) when one is available, else stay on CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize the optimizer.
# `transformers.AdamW` is deprecated and absent from recent transformers
# releases; use the PyTorch implementation instead. eps and weight_decay are
# set explicitly to the values the transformers optimizer used by default,
# because torch's own defaults differ (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Linear decay of the learning rate to zero over the whole run, no warm-up.
# The factor 2 is the number of training epochs and must match the epoch count
# used by the training loop.
num_train_steps = len(train_dataloader) * 2
num_warmup_steps = 0
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
from torch.nn import functional as F
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Training parameters
num_epochs = 2

# Each epoch is one full training pass followed by one validation pass.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # ---- Training pass (train mode) ----
    model.train()
    total_train_loss = 0

    train_progress = tqdm(train_dataloader, desc="Training", leave=False)
    for batch in train_progress:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Passing labels makes the model return the loss with its outputs.
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Backpropagate, update the weights, advance the LR schedule, then
        # clear the gradients ready for the next batch.
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Show the latest batch loss on the progress bar.
        train_progress.set_postfix(loss=batch_loss)

    # Mean loss over all batches of this epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss:.4f}")

    # ---- Validation pass (eval mode, no gradient tracking) ----
    model.eval()
    all_preds, all_labels = [], []

    val_progress = tqdm(val_dataloader, desc="Validation", leave=False)
    with torch.no_grad():
        for batch in val_progress:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Predicted class = index of the largest logit.
            preds = logits.argmax(dim=1).cpu().numpy()
            labels = labels.cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    # Accuracy over the whole validation set.
    val_accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {val_accuracy:.4f}\n")

Epoch 1/2


                                                                         

Training loss: 0.4460


                                                               

Validation Accuracy: 0.8042

Epoch 2/2


                                                                         

Training loss: 0.4067


                                                               

Validation Accuracy: 0.8083



In [63]:
# Testing the model: eval mode and no gradient tracking.
model.eval()
# NOTE: `test_labels` is rebound here to a plain list of the labels seen
# while iterating the test loader.
test_preds, test_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit.
        preds = logits.argmax(dim=1).cpu().numpy()
        test_preds.extend(preds)
        test_labels.extend(labels.cpu().numpy())

# Test accuracy
test_accuracy = accuracy_score(y_true=test_labels, y_pred=test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.8082


In [64]:
# Per-class precision / recall / F1 for the BERT predictions.
bert_report = classification_report(test_labels, test_preds)
print("Classification Report (Bert):")
print(bert_report)

Classification Report (Bert):
              precision    recall  f1-score   support

           0       0.80      0.81      0.81    119907
           1       0.81      0.80      0.81    120093

    accuracy                           0.81    240000
   macro avg       0.81      0.81      0.81    240000
weighted avg       0.81      0.81      0.81    240000



In [65]:
# Input review
review = ["Loved This Movie !!!!"]

# Clean the review with clean_text, the same preprocessing applied to the
# fine-tuning data, then encode it with the BERT tokenizer.
review_clean = [clean_text(text) for text in review]
encoded_review = tokenizer(
    review_clean,
    truncation=True,
    padding=True,
    max_length=max_len,
    return_tensors="pt"  # Return PyTorch tensors
)

# Move the input tensors to the device the model lives on
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Set the model to evaluation mode
model.eval()

# Perform prediction: the predicted class is the index of the largest logit
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# Interpret the result (1 = positive, 0 = negative)
if predictions[0] == 1:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")


Sentiment: Positive


In [66]:
# Input review
review = ["This product works perfectly, and I couldn't be happier!"]

# Clean the review with clean_text, the same preprocessing applied to the
# fine-tuning data, then encode it with the BERT tokenizer.
review_clean = [clean_text(text) for text in review]
encoded_review = tokenizer(
    review_clean,
    truncation=True,
    padding=True,
    max_length=max_len,
    return_tensors="pt"  # Return PyTorch tensors
)

# Move the input tensors to the device the model lives on
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Set the model to evaluation mode
model.eval()

# Perform prediction: the predicted class is the index of the largest logit
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# Interpret the result (1 = positive, 0 = negative)
if predictions[0] == 1:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")


Sentiment: Positive


In [67]:
# Input review
review = ["The product broke after one use—such a waste of money."]

# Clean the review with clean_text, the same preprocessing applied to the
# fine-tuning data, then encode it with the BERT tokenizer.
review_clean = [clean_text(text) for text in review]
encoded_review = tokenizer(
    review_clean,
    truncation=True,
    padding=True,
    max_length=max_len,
    return_tensors="pt"  # Return PyTorch tensors
)

# Move the input tensors to the device the model lives on
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Set the model to evaluation mode
model.eval()

# Perform prediction: the predicted class is the index of the largest logit
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# Interpret the result (1 = positive, 0 = negative)
if predictions[0] == 1:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")


Sentiment: Negative
