In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from transformers import TFBertForSequenceClassification, BertTokenizer
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the reviews and discard every row that has a missing value in any column
df = pd.read_csv("bike_rental_reviews.csv").dropna()

# Text Cleaning
# Shared resources read by clean_text(): a WordNet lemmatizer and the
# English stop-word list held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw review string into a space-separated list of lemmas.

    Steps, in order: lower-case the text, delete digit runs, delete every
    character in ``string.punctuation``, split on whitespace, drop tokens
    found in the module-level ``stop_words`` set, and lemmatize the rest with
    the module-level ``lemmatizer``.

    Args:
        text: The raw review text (must support ``str.lower``).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = without_digits.translate(punctuation_table)

    lemmas = []
    for word in stripped.split():
        # Stop words are filtered on the surface form, before lemmatization
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)

# Store the normalized version of every review next to the raw text
df['cleaned_text'] = df['review_text'].map(clean_text)

In [None]:
# TF-IDF
# Split the cleaned text BEFORE fitting the vectorizer. Fitting TF-IDF on the
# whole corpus lets test-set vocabulary and document frequencies leak into the
# features the classifiers are trained on, which inflates the reported scores.
y = df['sentiment']
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)  # vocabulary + IDF learned from training reviews only
X_test = tfidf.transform(text_test)        # test reviews projected into the training feature space

# Full-corpus matrix in the training feature space. Kept only so the
# `X_tfidf` name stays defined for any code that still refers to it;
# do not fit models on it.
X_tfidf = tfidf.transform(df['cleaned_text'])

# Logistic Regression
# Fit on the TF-IDF training features, then score the held-out split
lr = LogisticRegression(max_iter=200).fit(X_train, y_train)
lr_predictions = lr.predict(X_test)
lr_report = classification_report(y_test, lr_predictions)
print("Logistic Regression Report:\n", lr_report)

# Naive Bayes
# Multinomial NB on the same TF-IDF features and split as the model above
nb = MultinomialNB().fit(X_train, y_train)
nb_predictions = nb.predict(X_test)
nb_report = classification_report(y_test, nb_predictions)
print("Naive Bayes Report:\n", nb_report)

# Tokenize & Padding for LSTM
# NOTE(review): the tokenizer vocabulary is fit on every review, including the
# rows that later land in the test split.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
# Fixed length of 100 token ids per review; shorter reviews get zeros appended
padded = pad_sequences(sequences, maxlen=100, padding='post')

label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
mapped_labels = df['sentiment'].map(label_map)
# Series.map() silently produces NaN for any value missing from label_map
# (e.g. different casing or an extra class). Fail loudly instead of feeding
# NaN labels into training.
unmapped = mapped_labels.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'sentiment'].unique().tolist()
    raise ValueError(f"Sentiment values missing from label_map: {unexpected}")
labels = mapped_labels.values
X_train_pad, X_test_pad, y_train_pad, y_test_pad = train_test_split(padded, labels, test_size=0.2, random_state=42)

In [None]:
# LSTM
# Embedding -> single LSTM layer -> dense head with a 3-way softmax output.
model = Sequential([
    # mask_zero=True marks id 0 as padding. The inputs are zero-padded at the
    # end (padding='post'), so without the mask the LSTM's final state would
    # be computed after stepping through the trailing pad tokens.
    Embedding(input_dim=10000, output_dim=64, input_length=100, mask_zero=True),
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])
# Labels are integer class ids, hence the sparse variant of the loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): validation_data is the held-out test split itself, so the
# validation metrics shown during training are test-set metrics.
model.fit(X_train_pad, y_train_pad, validation_data=(X_test_pad, y_test_pad), epochs=3, batch_size=64)

In [None]:
### Due to dependency issues, the approach below is the only way I could get BERT to run successfully in Colab. ###
### I ran into similar issues in Jupyter Notebook as well and spent the bulk of my time troubleshooting; apologies. ###

# BERT
# Turn the sentiment strings into integer class ids
# (e.g. negative: 0, neutral: 1, positive: 2)
le = LabelEncoder()
le.fit(df['sentiment'])
df['label'] = le.transform(df['sentiment'])

# Hold out 20% of the raw (uncleaned) reviews for testing
X_train, X_test, y_train, y_test = train_test_split(
    df['review_text'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

# Load the pretrained WordPiece tokenizer that matches the BERT checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode both splits with identical settings: truncate to at most 128 tokens,
# pad, and return TensorFlow tensors
encode_options = dict(truncation=True, padding=True, max_length=128, return_tensors="tf")
train_encodings = tokenizer(X_train.tolist(), **encode_options)
test_encodings = tokenizer(X_test.tolist(), **encode_options)

# Wrap (features, label) pairs in tf.data pipelines; only the training
# pipeline is shuffled, both are served in batches of 16
train_pairs = (dict(train_encodings), y_train)
train_dataset = tf.data.Dataset.from_tensor_slices(train_pairs)
train_dataset = train_dataset.shuffle(1000).batch(16)

test_pairs = (dict(test_encodings), y_test)
test_dataset = tf.data.Dataset.from_tensor_slices(test_pairs).batch(16)

# Pretrained BERT encoder with a 3-class sequence-classification head
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)

# Pieces used by the manual training loop: Adam optimizer, a loss that takes
# raw logits (from_logits=True), and a running accuracy metric
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# Manual fine-tuning loop: two passes over the training batches
for epoch in range(2):
    print(f"\nEpoch {epoch+1}")
    # Start each epoch with a fresh running accuracy
    accuracy.reset_state()

    for step, (batch, labels) in enumerate(train_dataset):
        # Record the forward pass so the loss can be differentiated
        with tf.GradientTape() as tape:
            logits = model(batch, training=True).logits
            loss = loss_fn(labels, logits)

        trainable = model.trainable_variables
        gradients = tape.gradient(loss, trainable)
        optimizer.apply_gradients(zip(gradients, trainable))
        accuracy.update_state(labels, logits)

        # Progress line only on every 10th step
        if step % 10 != 0:
            continue
        running_accuracy = accuracy.result().numpy()
        print(f"Step {step}, Loss: {loss:.4f}, Accuracy: {running_accuracy:.4f}")

# Collect predicted and true class ids over the test batches; they feed the
# classification report and confusion matrix
y_pred, y_true = [], []

for batch, labels in test_dataset:
    # Inference mode (training=False)
    logits = model(batch, training=False).logits
    # Predicted class = index of the largest logit in each row
    preds = tf.argmax(logits, axis=1)
    y_pred += list(preds.numpy())
    y_true += list(labels.numpy())


In [None]:
# Print classification report
# Pass the encoded class ids explicitly. Without `labels`, the report infers
# the classes from y_true/y_pred, and `target_names` no longer lines up if a
# class happens to be absent from both.
class_ids = np.arange(len(le.classes_))
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=le.classes_))

# Plot confusion matrix
# Fix the row/column order to the encoded class ids so the matrix always has
# one row and column per entry of le.classes_; otherwise a class absent from
# y_true/y_pred would shrink the matrix and misalign the tick labels.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()