In [1]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from transformers import TFBertForSequenceClassification, BertTokenizer
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("bike_rental_reviews.csv")
df.dropna(inplace=True)

# Text Cleaning
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

df['cleaned_text'] = df['review_text'].apply(clean_text)

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['cleaned_text'])
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Logistic Regression
lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)
print("Logistic Regression Report:\n", classification_report(y_test, lr.predict(X_test)))

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train, y_train)
print("Naive Bayes Report:\n", classification_report(y_test, nb.predict(X_test)))

# Tokenize & Padding for LTSM
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
padded = pad_sequences(sequences, maxlen=100, padding='post')

label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
labels = df['sentiment'].map(label_map).values
X_train_pad, X_test_pad, y_train_pad, y_test_pad = train_test_split(padded, labels, test_size=0.2, random_state=42)

# LTSM
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=100),
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_pad, y_train_pad, validation_data=(X_test_pad, y_test_pad), epochs=3, batch_size=64)

### Due To Dependency Issues Below is the only I could get BERT to run successfully in COLAB ###
### I was having similar issues within Jupyter Notebooks as well and really spent the bulk of my time troubleshooting, apologies###

# BERT
# Encode sentiment to integer labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])  # e.g. negative: 0, neutral: 1, positive: 2

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df['review_text'], df['label'], test_size=0.2, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128, return_tensors="tf")
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128, return_tensors="tf")

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings), y_train)).shuffle(1000).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings), y_test)).batch(16)

# Load BERT model and define training components
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# Train using GradientTape
for epoch in range(2):
    print(f"\nEpoch {epoch+1}")
    accuracy.reset_state()

    for step, (batch, labels) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            logits = model(batch, training=True).logits
            loss = loss_fn(labels, logits)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        accuracy.update_state(labels, logits)

        if step % 10 == 0:
            print(f"Step {step}, Loss: {loss:.4f}, Accuracy: {accuracy.result().numpy():.4f}")

# Evaluate with classification report and confusion matrix
y_pred = []
y_true = []

for batch, labels in test_dataset:
    logits = model(batch, training=False).logits
    preds = tf.argmax(logits, axis=1)
    y_pred.extend(preds.numpy())
    y_true.extend(labels.numpy())

# Print classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=le.classes_))

# Plot confusion matrix
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Logistic Regression Report:
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00      3325
     neutral       1.00      1.00      1.00      3317
    positive       1.00      1.00      1.00      3358

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Naive Bayes Report:
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00      3325
     neutral       1.00      1.00      1.00      3317
    positive       1.00      1.00      1.00      3358

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000





Epoch 1/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 187ms/step - accuracy: 0.3339 - loss: 1.0992 - val_accuracy: 0.3358 - val_loss: 1.0989
Epoch 2/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 172ms/step - accuracy: 0.3405 - loss: 1.0987 - val_accuracy: 0.3317 - val_loss: 1.0989
Epoch 3/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 169ms/step - accuracy: 0.3337 - loss: 1.0988 - val_accuracy: 0.3325 - val_loss: 1.0987


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
Step 0, Loss: 1.0425, Accuracy: 0.5000
Step 10, Loss: 0.8519, Accuracy: 0.5227
Step 20, Loss: 0.4944, Accuracy: 0.6935
Step 30, Loss: 0.3089, Accuracy: 0.7923
Step 40, Loss: 0.2151, Accuracy: 0.8430
Step 50, Loss: 0.1328, Accuracy: 0.8738
Step 60, Loss: 0.0987, Accuracy: 0.8945
Step 70, Loss: 0.0642, Accuracy: 0.9093
Step 80, Loss: 0.0553, Accuracy: 0.9205
Step 90, Loss: 0.0439, Accuracy: 0.9293
Step 100, Loss: 0.0338, Accuracy: 0.9363
Step 110, Loss: 0.0391, Accuracy: 0.9420
Step 120, Loss: 0.0334, Accuracy: 0.9468
Step 130, Loss: 0.0249, Accuracy: 0.9509
Step 140, Loss: 0.0188, Accuracy: 0.9543
Step 150, Loss: 0.0157, Accuracy: 0.9574
Step 160, Loss: 0.0175, Accuracy: 0.9600
Step 170, Loss: 0.0185, Accuracy: 0.9624
Step 180, Loss: 0.0174, Accuracy: 0.9644
Step 190, Loss: 0.0120, Accuracy: 0.9663
Step 200, Loss: 0.0142, Accuracy: 0.9680
Step 210, Loss: 0.0113, Accuracy: 0.9695
Step 220, Loss: 0.0119, Accuracy: 0.9709
Step 230, Loss: 0.0103, Accuracy: 0.9721
Step 240, Loss: 0.

KeyboardInterrupt: 

In [None]:
from google.colab import drive
drive.mount('/content/drive')