In [None]:
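# Install required libraries (run once). Use %pip rather than !pip so the
# packages are installed into the environment of the running kernel.
# %pip install -q sentence-transformers torch scikit-learn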

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time

# Reproducibility: pin the global RNG state of torch and numpy
torch.manual_seed(42)
np.random.seed(42)

# Pick the compute device, preferring CUDA whenever torch can see a GPU
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda') if cuda_ready else torch.device('cpu')
print(f"Using device: {device}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

print("\nLibraries imported successfully!")

## 2. Load Dataset

In [None]:
# Load dataset
# The CSV path is relative, so it is resolved against the notebook's working directory.
print("Loading dataset...")
df = pd.read_csv('../data/cleaned_label.csv')

# Sample subset for faster training (optional - uncomment to use)
# df = df.sample(n=5000, random_state=42).reset_index(drop=True)

# Summarise what was loaded: size, column names, class balance and the first rows.
# 'sentiment_label' is the target column that the later split/training cells read.
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nSentiment distribution:")
print(df['sentiment_label'].value_counts())
print(f"\nFirst few rows:")
print(df.head())

## 3. Initialize SBERT Model

In [None]:
# Initialize SBERT model
# MODEL_NAME is the checkpoint identifier handed to SentenceTransformer; it is
# also echoed in the final summary cell.
MODEL_NAME = 'all-MiniLM-L6-v2'

print(f"Loading SBERT model: {MODEL_NAME}")
# No device argument is passed here, so device placement is left to the library.
model = SentenceTransformer(MODEL_NAME)

print(f"Model loaded successfully!")
print(f"Max sequence length: {model.max_seq_length}")
print(f"Embedding dimension: {model.get_sentence_embedding_dimension()}")

# Test model
# Smoke test: encode a single string and inspect the resulting vector.
sample_text = "This movie is absolutely fantastic!"
embedding = model.encode(sample_text)
print(f"\nSample text: {sample_text}")
print(f"Embedding shape: {embedding.shape}")
print(f"Embedding preview (first 10 values): {embedding[:10]}")

## 4. Prepare Data

In [None]:
# Stratified 80/20 hold-out split of the review texts and their sentiment labels
review_texts = df['review_text'].values
sentiment_targets = df['sentiment_label'].values

X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    sentiment_targets,
    stratify=sentiment_targets,
    random_state=42,
    test_size=0.2,
)

# Report the split sizes and the class balance inside each split
train_label_counts = pd.Series(y_train).value_counts()
test_label_counts = pd.Series(y_test).value_counts()

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"\nTrain label distribution:")
print(train_label_counts)
print(f"\nTest label distribution:")
print(test_label_counts)

## 5. Generate Sentence Embeddings

In [None]:
# Turn every review into a fixed-size sentence embedding
print("Generating sentence embeddings...")
print("This may take a few minutes...\n")

# Both splits are encoded with identical settings, so keep them in one place
encode_options = dict(batch_size=32, show_progress_bar=True, convert_to_numpy=True)

start_time = time.time()

# Training split
print("Encoding training data...")
train_texts = X_train.tolist()
train_embeddings = model.encode(train_texts, **encode_options)

# Held-out split
print("\nEncoding test data...")
test_texts = X_test.tolist()
test_embeddings = model.encode(test_texts, **encode_options)

# Wall-clock time covers the encoding of both splits
embedding_time = time.time() - start_time

print(f"\nEmbedding generation completed in {embedding_time:.2f} seconds ({embedding_time/60:.2f} minutes)")
print(f"Train embeddings shape: {train_embeddings.shape}")
print(f"Test embeddings shape: {test_embeddings.shape}")

## 6. Train Classifier

In [None]:
# Fit a logistic-regression head on top of the precomputed sentence embeddings
print("Training Logistic Regression classifier...")
print("="*80)

start_time = time.time()

# Hyper-parameters of the classifier, gathered in one mapping
lr_params = {'solver': 'lbfgs', 'C': 1.0, 'max_iter': 1000, 'random_state': 42}
classifier = LogisticRegression(**lr_params)
classifier.fit(train_embeddings, y_train)

# Elapsed time includes constructing and fitting the classifier
training_time = time.time() - start_time

print(f"\nClassifier training completed in {training_time:.2f} seconds")
print(f"Classifier: {classifier}")
print("="*80)

## 7. Evaluate Model

In [None]:
# Score the fitted classifier on the data it saw and on the held-out split
print("Evaluating model...\n")

train_predictions = classifier.predict(train_embeddings)
test_predictions = classifier.predict(test_embeddings)

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print("="*80)
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print("="*80)

# Per-class precision / recall / F1 on the held-out split
class_names = ['Negative', 'Positive']
report = classification_report(y_test, test_predictions, target_names=class_names)
print("\nClassification Report:")
print("-"*80)
print(report)

# Counts of true vs. predicted labels; reused by the plotting cell
cm = confusion_matrix(y_test, test_predictions)
print("\nConfusion Matrix:")
print(cm)

## 8. Visualize Results

In [None]:
# Two panels side by side: accuracy bars on the left, confusion matrix on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax1, ax2 = axes

# Left panel: train vs. test accuracy
accuracies = [train_accuracy, test_accuracy]
labels = ['Train', 'Test']
colors = ['#e74c3c', '#3498db']

ax1.bar(labels, accuracies, color=colors, alpha=0.7)
ax1.set(ylabel='Accuracy', title='SBERT - Train vs Test Accuracy', ylim=[0, 1])
ax1.grid(True, alpha=0.3, axis='y')

# Write each accuracy value just above its bar
for i, acc in enumerate(accuracies):
    ax1.text(i, acc + 0.02, f'{acc:.4f}', ha='center', fontweight='bold')

# Right panel: confusion matrix of the test predictions
class_names = ['Negative', 'Positive']
sns.heatmap(
    cm,
    ax=ax2,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax2.set(title='SBERT - Confusion Matrix', ylabel='True Label', xlabel='Predicted Label')

plt.tight_layout()
plt.show()

## 9. Embedding Space Visualization (t-SNE)

In [None]:
# Visualize embeddings with t-SNE (sample subset for speed)
from sklearn.manifold import TSNE

# Upper bound on how many test points are projected
sample_size = 1000
n_samples = min(sample_size, len(test_embeddings))
if n_samples < 2:
    raise ValueError("t-SNE visualization needs at least 2 test samples")

# Use a dedicated, seeded generator instead of the global numpy RNG so that
# re-running this cell always selects the same subset, no matter how many
# random draws happened earlier in the session.
rng = np.random.default_rng(42)
indices = rng.choice(len(test_embeddings), size=n_samples, replace=False)

sample_embeddings = test_embeddings[indices]
sample_labels = np.asarray(y_test)[indices]

print(f"Performing t-SNE on {len(sample_embeddings)} samples...")
print("This may take a minute...")

# scikit-learn requires perplexity < n_samples; cap it so that small test sets
# (30 samples or fewer) do not make TSNE raise.
perplexity = min(30, n_samples - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
embeddings_2d = tsne.fit_transform(sample_embeddings)

# Plot the 2-D projection, coloured by the true sentiment label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    c=sample_labels,
    cmap='RdYlGn',
    alpha=0.6,
    s=20
)
plt.colorbar(scatter, label='Sentiment (0=Neg, 1=Pos)')
plt.title('SBERT Sentence Embeddings Visualization (t-SNE)')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nt-SNE visualization complete!")

## 10. Test with Custom Reviews

In [None]:
def predict_sentiment_sbert(text, model, classifier):
    """Predict the sentiment label of one text and the classifier's confidence in it.

    Args:
        text: The raw review text to classify.
        model: Encoder exposing ``encode(list_of_texts, convert_to_numpy=True)``
            (e.g. a ``SentenceTransformer``).
        classifier: Fitted classifier exposing ``predict``, ``predict_proba``
            and ``classes_`` (e.g. scikit-learn ``LogisticRegression``).

    Returns:
        tuple: ``(prediction, confidence)`` where ``prediction`` is the predicted
        class label and ``confidence`` is the probability the classifier assigns
        to that label.

    Raises:
        ValueError: If the predicted label is not present in ``classifier.classes_``.
    """
    # Encode as a one-element batch; the classifier is then given a batch of one
    embedding = model.encode([text], convert_to_numpy=True)

    prediction = classifier.predict(embedding)[0]
    probabilities = classifier.predict_proba(embedding)[0]

    # The columns of predict_proba follow classifier.classes_, so locate the
    # predicted label there. Indexing the probabilities by the label itself is
    # only correct when the labels happen to be exactly 0..n-1.
    class_index = list(classifier.classes_).index(prediction)
    confidence = probabilities[class_index]

    return prediction, confidence

# Hand-written reviews used to sanity-check the classifier
test_reviews = [
    "This movie was absolutely fantastic! The acting was superb.",
    "Terrible waste of time. Poor acting and boring plot.",
    "It was okay, not great but not terrible either.",
    "One of the best films I've ever seen!",
    "Complete garbage. Don't waste your money.",
]

print("Testing custom reviews with SBERT:")
print("="*80)

# Classify each review in turn and print its label with the confidence
for i, review in enumerate(test_reviews, start=1):
    prediction, confidence = predict_sentiment_sbert(review, model, classifier)
    if prediction == 1:
        sentiment = "Positive ✓"
    else:
        sentiment = "Negative ✗"
    print(f"{i}. Review: {review}")
    print(f"   Prediction: {sentiment} (Confidence: {confidence:.2%})\n")

print("="*80)

## 11. Compare Sentence Similarities

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Four probe sentences: two positive reviews, one negative review, one off-topic
sentences = [
    "This movie is great and entertaining.",
    "This film is excellent and fun.",
    "This movie is terrible and boring.",
    "The weather is nice today.",
]

# Embed the sentences and compare every pair by cosine similarity
sentence_embeddings = model.encode(sentences)
similarity_matrix = cosine_similarity(sentence_embeddings)

# Short axis labels (S1, S2, ...) shared by both heatmap axes
tick_labels = [f"S{i+1}" for i in range(len(sentences))]

plt.figure(figsize=(10, 8))
sns.heatmap(
    similarity_matrix,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    annot=True,
    fmt='.3f',
    cmap='YlOrRd',
    vmin=0,
    vmax=1,
)
plt.title('SBERT - Sentence Similarity Matrix')
plt.tight_layout()
plt.show()

# Legend mapping the short labels back to the full sentences
print("Sentences:")
for i, sent in enumerate(sentences, 1):
    print(f"S{i}: {sent}")

# Fixed commentary text; it is not derived from the computed matrix
print("\nKey observations:")
observations = (
    "- S1 and S2 (both positive) have high similarity",
    "- S1/S2 and S3 (opposite sentiment) have lower similarity",
    "- S4 (unrelated) has low similarity with all movie reviews",
)
for observation in observations:
    print(observation)

## 12. Save Model and Classifier

In [None]:
import pickle

# Save SBERT model
# Both output paths are relative, so they resolve against the notebook's working directory.
model_save_path = './sbert_sentiment'
model.save(model_save_path)

# Save classifier
# The classifier is serialised with pickle; the resulting file should only be
# loaded again from a trusted location, since unpickling can execute code.
classifier_path = './sbert_classifier.pkl'
with open(classifier_path, 'wb') as f:
    pickle.dump(classifier, f)

# Recap of the run: model identity, test accuracy and the timings measured in
# the embedding and training cells.
print(f"SBERT model saved to: {model_save_path}")
print(f"Classifier saved to: {classifier_path}")
print("\n" + "="*80)
print("SBERT Summary:")
print("="*80)
print(f"Model: {MODEL_NAME}")
print(f"Embedding dimension: {model.get_sentence_embedding_dimension()}")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Embedding time: {embedding_time:.2f} seconds ({embedding_time/60:.2f} minutes)")
print(f"Classifier training time: {training_time:.2f} seconds")
# Total = embedding time + classifier fitting time (data loading is not included)
print(f"Total time: {(embedding_time + training_time):.2f} seconds ({(embedding_time + training_time)/60:.2f} minutes)")
print(f"Advantages: Fast inference, semantic understanding, sentence-level embeddings")
print("="*80)

## 13. Load Model (for future use)

In [None]:
# Example: How to load the saved model and classifier
# The triple-quoted block below is a bare string expression, not executed code:
# it only documents the reload steps and is visible in the cell source alone.
# Copy its contents into a cell to actually run them.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load classifier files that you created yourself.
"""
from sentence_transformers import SentenceTransformer
import pickle

# Load SBERT model
loaded_model = SentenceTransformer('./sbert_sentiment')

# Load classifier
with open('./sbert_classifier.pkl', 'rb') as f:
    loaded_classifier = pickle.load(f)

# Use for prediction
text = "This movie is amazing!"
embedding = loaded_model.encode([text])
prediction = loaded_classifier.predict(embedding)[0]
print(f"Prediction: {'Positive' if prediction == 1 else 'Negative'}")
"""

# Closing status messages for the notebook run
print("Model loading instructions provided above.")
print("\nSBERT training complete! ✓")