## 1. Import Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import pickle
import os

# Choose the compute device: CUDA GPU when one is usable, CPU otherwise
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print(f"Using device: {device}")
if has_cuda:
    # Report which physical GPU (index 0) will be used
    print(f"GPU: {torch.cuda.get_device_name(0)}")

print("\nLibraries imported successfully!")

## 2. Load Dataset

In [None]:
# Read the labelled review data from disk (path is relative to the notebook)
print("Loading dataset...")
df = pd.read_csv('../../data/cleaned_label.csv')

# Overview: size, column names, class balance and a preview of the rows
column_names = df.columns.tolist()
label_counts = df['sentiment_label'].value_counts()
preview_rows = df.head()

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {column_names}")
print(f"\nSentiment distribution:")
print(label_counts)
print(f"\nFirst few rows:")
print(preview_rows)

## 3. Initialize BERT Base Cased Model

In [None]:
# Checkpoint identifier - the cased variant keeps capitalization intact
MODEL_NAME = 'bert-base-cased'

print(f"Loading BERT Cased model: {MODEL_NAME}")
print("Note: This model preserves case information (uppercase/lowercase)")

# Fetch the pretrained tokenizer, then the encoder; the encoder is placed on
# the selected device and switched to evaluation mode
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

# Report the key architecture settings read from the loaded objects
bert_config = model.config
print(f"\nModel loaded successfully!")
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Hidden size: {bert_config.hidden_size}")
print(f"Number of layers: {bert_config.num_hidden_layers}")
print(f"Number of attention heads: {bert_config.num_attention_heads}")

## 4. Extract Embeddings Function

In [None]:
def get_bert_embedding(text, tokenizer, model, device, max_length=128):
    """
    Encode one text and return the hidden state of its first ([CLS]) token.

    Args:
        text: str - Text to embed; handed to the tokenizer unchanged
        tokenizer: tokenizer object providing ``encode_plus``
        model: model whose output exposes ``last_hidden_state``
        device: torch device the input tensors are moved to
        max_length: int - Length every sequence is padded/truncated to

    Returns:
        numpy array: 1-D vector holding the first-token state of the last layer
    """
    # Tokenizer settings: special tokens added, fixed-length padding,
    # truncation of over-long inputs, PyTorch tensors as output
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = tokenizer.encode_plus(text, **tokenizer_options)

    # Only ids and mask are forwarded to the model, both on the target device
    model_inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device),
    }

    # Forward pass without gradient tracking
    with torch.no_grad():
        last_hidden = model(**model_inputs).last_hidden_state

    # Position 0 of the sequence axis is the [CLS] token; flatten() yields an
    # independent 1-D copy
    first_token_state = last_hidden[:, 0, :]
    return first_token_state.cpu().numpy().flatten()

print("Embedding extraction function defined!")

# Smoke-test the extractor on a single mixed-case sentence
sample_text = "This MOVIE is GREAT!"
sample_embedding = get_bert_embedding(sample_text, tokenizer, model, device)
embedding_preview = sample_embedding[:10]

print(f"\nSample text: '{sample_text}'")
print(f"Sample embedding shape: {sample_embedding.shape}")
print(f"Sample embedding preview (first 10 values): {embedding_preview}")

## 5. Extract Embeddings for All Documents

In [None]:
# Parameters
MAX_LENGTH = 128  # Longer token sequences are truncated to this length
BATCH_SIZE = 32   # Number of documents sent through the model per forward pass

print("Extracting BERT Cased embeddings for all documents...")
print(f"Model: {MODEL_NAME}")
print(f"Max length: {MAX_LENGTH}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Total documents: {len(df)}\n")

# One (batch, hidden_size) array of [CLS] vectors per processed batch
cls_batches = []

# Real batching: each slice of BATCH_SIZE texts is tokenized together and run
# through the model in a single forward pass (the previous version looped over
# the batch and ran the model once per document, padded to MAX_LENGTH each time).
for start in tqdm(range(0, len(df), BATCH_SIZE)):
    # str() keeps non-string cells (e.g. NaN) as text so that every row still
    # yields exactly one embedding and stays aligned with the labels
    batch_texts = [str(text) for text in df['review_text'].iloc[start:start + BATCH_SIZE]]

    encoding = tokenizer(
        batch_texts,
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        padding=True,       # pad only up to the longest text in this batch
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # [CLS] is the first token of every sequence; copy() detaches the result
    # from the full hidden-state buffer so that buffer can be freed
    cls_batches.append(outputs.last_hidden_state[:, 0, :].cpu().numpy().copy())

# Stack all batches into one (num_documents, hidden_size) array
if cls_batches:
    embeddings = np.concatenate(cls_batches, axis=0)
else:
    # Empty dataset: keep a well-formed 2-D array so later cells can read .shape[1]
    embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)

# Every row of df must have exactly one embedding (labels are saved by position)
assert embeddings.shape[0] == len(df), "embedding count does not match number of documents"

print(f"\nEmbeddings extracted!")
print(f"Shape: {embeddings.shape}")
print(f"Each document: {embeddings.shape[1]}-dimensional vector")

## 6. Save Embeddings

In [None]:
# Destination folder and the three artifact paths inside it
output_dir = '../../models/bert_cased_embeddings'
os.makedirs(output_dir, exist_ok=True)

embeddings_path = os.path.join(output_dir, 'bert_cased_embeddings.npy')
labels_path = os.path.join(output_dir, 'labels.npy')
metadata_path = os.path.join(output_dir, 'metadata.pkl')

# Embedding matrix
np.save(embeddings_path, embeddings)
print(f"Embeddings saved: {embeddings_path}")

# Sentiment labels, in the same row order as the embeddings
sentiment_values = df['sentiment_label'].values
np.save(labels_path, sentiment_values)
print(f"Labels saved: {labels_path}")

# Description of how the embeddings were produced
metadata = dict(
    model_name=MODEL_NAME,
    model_type='bert-base-cased',
    case_sensitive=True,
    hidden_size=model.config.hidden_size,
    max_length=MAX_LENGTH,
    num_documents=len(embeddings),
    embedding_shape=embeddings.shape,
    extraction_method='[CLS] token from last_hidden_state',
)

with open(metadata_path, 'wb') as metadata_file:
    pickle.dump(metadata, metadata_file)
print(f"Metadata saved: {metadata_path}")

# Final report of everything written
banner = "="*80
print("\n" + banner)
print("All files saved successfully!")
print(banner)
print(f"\nOutput directory: {output_dir}")
print(f"Files:")
print(f"  - bert_cased_embeddings.npy    (Embeddings: {embeddings.shape})")
print(f"  - labels.npy                   (Sentiment labels)")
print(f"  - metadata.pkl                 (Model metadata)")

## 7. Verify Saved Embeddings

In [None]:
# Read the three artifacts back from disk
print("Loading saved embeddings...\n")

loaded_embeddings = np.load(embeddings_path)
loaded_labels = np.load(labels_path)

# The metadata file was written by this notebook itself
with open(metadata_path, 'rb') as metadata_file:
    loaded_metadata = pickle.load(metadata_file)

print(f"‚úì Embeddings loaded: {loaded_embeddings.shape}")
print(f"‚úì Labels loaded: {loaded_labels.shape}")
print(f"\n‚úì Metadata:")
for field, field_value in loaded_metadata.items():
    print(f"    {field}: {field_value}")

# Compare what was read back against the in-memory originals
print(f"\n‚úì Verification:")
embeddings_ok = np.allclose(embeddings, loaded_embeddings)
print(f"    Embeddings match: {embeddings_ok}")
labels_ok = np.array_equal(df['sentiment_label'].values, loaded_labels)
print(f"    Labels match: {labels_ok}")

## 8. Embedding Statistics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

print("Embedding Statistics:\n")

# Global summary numbers over every value in the embedding matrix
print(f"Shape: {embeddings.shape}")
summary_stats = (
    ("Mean", embeddings.mean()),
    ("Std", embeddings.std()),
    ("Min", embeddings.min()),
    ("Max", embeddings.max()),
)
for stat_name, stat_value in summary_stats:
    print(f"{stat_name}: {stat_value:.4f}")

# Two side-by-side histograms
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_values, ax_doc_means = axes[0], axes[1]

# Left panel: distribution of all individual embedding values
ax_values.hist(embeddings.flatten(), bins=100, alpha=0.7, color='blue')
ax_values.set_xlabel('Embedding Value')
ax_values.set_ylabel('Frequency')
ax_values.set_title('BERT Cased Embedding Value Distribution')
ax_values.grid(True, alpha=0.3)

# Right panel: distribution of the per-document mean (averaged over axis 1)
mean_embeddings = embeddings.mean(axis=1)
ax_doc_means.hist(mean_embeddings, bins=50, alpha=0.7, color='green')
ax_doc_means.set_xlabel('Mean Embedding Value')
ax_doc_means.set_ylabel('Frequency')
ax_doc_means.set_title('Mean Embedding per Document')
ax_doc_means.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nStatistics plotted!")

## 9. Compare Positive vs Negative Embeddings

In [None]:
from sklearn.decomposition import PCA

# Fixed seed so the sampled subset - and therefore the plot and the reported
# explained variance - is identical on every run of the notebook
PCA_SEED = 42

# Sample for visualization (all documents are used when there are fewer than sample_size)
sample_size = 1000
rng = np.random.default_rng(PCA_SEED)
indices = rng.choice(len(embeddings), size=min(sample_size, len(embeddings)), replace=False)

# Embeddings and labels are aligned by row position, so the same positional
# indices select matching rows from both
sample_embeddings = embeddings[indices]
sample_labels = df['sentiment_label'].iloc[indices].values

# PCA reduction to 2D; random_state pins any randomized solver PCA may choose
print(f"Performing PCA on {len(sample_embeddings)} samples...")
pca = PCA(n_components=2, random_state=PCA_SEED)
embeddings_2d = pca.fit_transform(sample_embeddings)

print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2%}")

# Scatter plot of the two principal components, coloured by sentiment label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    c=sample_labels,
    cmap='RdYlGn',
    alpha=0.6,
    s=30
)
plt.colorbar(scatter, label='Sentiment (0=Neg, 1=Pos)')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('BERT Cased Embeddings Visualization (PCA)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nVisualization complete!")

## 10. Case Sensitivity Test

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare embeddings of sentence pairs that differ only in capitalization
print("Testing case sensitivity of BERT Cased model:\n")

# Each tuple holds two casings of the same sentence
test_sentences = [
    ("this movie is great", "This movie is great"),
    ("the movie was EXCELLENT", "the movie was excellent"),
    ("I LOVED IT", "i loved it")
]

separator = "-" * 60
for first_variant, second_variant in test_sentences:
    emb1 = get_bert_embedding(first_variant, tokenizer, model, device)
    emb2 = get_bert_embedding(second_variant, tokenizer, model, device)

    # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here
    similarity = cosine_similarity([emb1], [emb2])[0][0]
    vectors_differ = not np.allclose(emb1, emb2)

    print(f"Sentence 1: '{first_variant}'")
    print(f"Sentence 2: '{second_variant}'")
    print(f"Cosine similarity: {similarity:.4f}")
    print(f"Different vectors: {vectors_differ}")
    print(separator)

print("\nNote: BERT Cased treats uppercase/lowercase differently!")

## 11. Summary

In [None]:
# Assemble the whole report first, then emit it line by line
rule = "="*80

summary_lines = [
    # Header
    "\n" + rule,
    "BERT BASE CASED EMBEDDING EXTRACTION SUMMARY",
    rule,
    # Model and embedding facts
    f"\nüìä Model: {MODEL_NAME}",
    f"üî§ Case Sensitive: YES (preserves capitalization)",
    f"üìè Embedding dimension: {embeddings.shape[1]}",
    f"üìÅ Total documents: {embeddings.shape[0]:,}",
    f"üíæ Total size: {embeddings.nbytes / (1024**2):.2f} MB",
    # Files written to the output directory
    f"\n‚úÖ Files saved:",
    f"   {output_dir}/",
    f"   ‚îú‚îÄ‚îÄ bert_cased_embeddings.npy  ({embeddings.shape})",
    f"   ‚îú‚îÄ‚îÄ labels.npy                 ({loaded_labels.shape})",
    f"   ‚îî‚îÄ‚îÄ metadata.pkl",
    # Suggested downstream use
    f"\nüí° Usage:",
    f"   These embeddings can be used with any classifier:",
    f"   - Logistic Regression",
    f"   - SVM",
    f"   - Random Forest",
    f"   - Neural Networks",
    # How to load the saved arrays
    f"\nüîß Load embeddings:",
    f"   X = np.load('{embeddings_path}')",
    f"   y = np.load('{labels_path}')",
    # Contrast with the uncased checkpoint
    f"\n‚ö° Difference from bert-base-uncased:",
    f"   - Preserves uppercase/lowercase information",
    f"   - Better for texts where case matters (e.g., proper nouns)",
    f"   - May perform better on sentiment (e.g., 'GREAT' vs 'great')",
    # Footer
    "\n" + rule,
    "‚úÖ BERT Cased embedding extraction complete!",
    rule,
]

for summary_line in summary_lines:
    print(summary_line)