In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [18]:
# Load the dataset (example using a CSV file)
df = pd.read_csv('IMDB Dataset.csv')  # Ensure the dataset is in CSV format
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [26]:
# Reduce dataset size for faster training (optional)
df = df.sample(10000, random_state=42)

# Convert sentiment labels to binary
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split the dataset
X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(df.shape)

(10000, 2)


In [20]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit features to 5000 for simplicity

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the test data
X_test_tfidf = tfidf.transform(X_test)

In [21]:
# Train Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_lr = lr_model.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.877
              precision    recall  f1-score   support

           0       0.89      0.86      0.87       999
           1       0.86      0.89      0.88      1001

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.88      0.88      0.88      2000



In [22]:
# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_nb = nb_model.predict(X_test_tfidf)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.8445
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       999
           1       0.86      0.83      0.84      1001

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



In [23]:
# Train SVM
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_svm = svm_model.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.86
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       999
           1       0.84      0.88      0.86      1001

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



In [31]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

# Step 1: Load the IMDB Dataset
df = pd.read_csv('IMDB Dataset.csv')

# Convert sentiment labels to binary (positive=1, negative=0)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Step 2: Split the dataset into train and test sets
X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize dataset with optimized settings
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128, return_tensors="tf")
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128, return_tensors="tf")

# Step 4: Convert to TensorFlow datasets with batch size
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(32)

# Step 5: Load BERT Model for Sequence Classification (binary sentiment classification)
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 6: Freeze all BERT layers except for the last layer
for layer in bert_model.layers[:-1]:  # Freeze all layers except the last one
    layer.trainable = False

# Step 7: Compile the BERT model
bert_model.compile(
    optimizer=Adam(learning_rate=2e-5),  # Learning rate for fine-tuning
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=[SparseCategoricalAccuracy()]
)

# Step 8: Fine-tune the BERT model (training for 3 epochs)
print("Training BERT Model (Fine-tuning only some layers)...")
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

bert_model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,  # Fine-tuning for 3 epochs
    batch_size=32,
    callbacks=[early_stopping]
)

# Step 9: Evaluate the BERT model
loss, accuracy = bert_model.evaluate(test_dataset)
print(f"BERT Test Accuracy (Fine-tuned some layers): {accuracy:.4f}")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'Variable' object has no attribute '_distribute_strategy'