## **0. Tải bộ dữ liệu**
**Lưu ý:** Nếu bạn không thể sử dụng lệnh gdown để tải bộ dữ liệu vì bị giới hạn số lượt tải, hãy tải bộ dữ liệu thủ công và upload lên Google Drive của mình. Sau đó, sử dụng lệnh dưới đây để copy file dữ liệu vào Colab:
```python
from google.colab import drive

drive.mount('/content/drive')
!cp /path/to/dataset/on/your/drive .
```

In [None]:
# Install the third-party packages this notebook imports (faiss, transformers,
# pandas, numpy, scikit-learn, tqdm). `-qq` runs pip in quiet mode.
# NOTE(review): torch and gdown are used below but not installed here —
# presumably they are preinstalled in the target runtime; confirm.
!pip install -qq faiss-cpu
!pip install -qq transformers
!pip install -qq pandas
!pip install -qq numpy
!pip install -qq scikit-learn
!pip install -qq tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# https://drive.google.com/file/d/1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R/view?usp=sharing
!gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

## **1. Import các thư viện cần thiết**

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import faiss
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

## **2. Đọc bộ dữ liệu**

In [None]:
# Location of the dataset CSV that the rest of the notebook reads.
DATASET_PATH = '/content/2cls_spam_text_cls.csv'
# Load the CSV into a DataFrame; later cells read its 'Message' and 'Category' columns.
df = pd.read_csv(DATASET_PATH)
# Bare expression as the last line of the cell so the notebook renders the table.
df

In [None]:
# Pull the message texts and their category labels out as plain Python lists.
messages = df['Message'].tolist()
labels = df['Category'].tolist()

## **3. Chuẩn bị embedding model và dữ liệu**

### **3.1. Load embedding model**

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and encoder for the chosen checkpoint.
MODEL_NAME = 'intfloat/multilingual-e5-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# .to() and .eval() both return the module itself, so the calls can be chained:
# move the weights to the device and switch to inference mode.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

print(f'Using device: {device}')
print(f'Model loaded: {MODEL_NAME}')

def average_pool(last_hidden_states, attention_mask):
    """Mean-pool token vectors over dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are set to 0.0 before summing,
    and the sum is divided by each row's mask total.
    """
    # Add a trailing axis so the mask broadcasts across the hidden dimension.
    is_padding = attention_mask.unsqueeze(-1).bool().logical_not()
    zeroed = last_hidden_states.masked_fill(is_padding, 0.0)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts

### **3.2. Tạo sentence embeddings**

In [None]:
def get_embeddings(texts, model, tokenizer, device, batch_size=32, prefix="passage: "):
    """Generate L2-normalized embeddings for a list of texts.

    Args:
        texts: Sequence of texts to embed.
        model: Encoder called as ``model(**batch)``; its output must expose
            ``last_hidden_state``.
        tokenizer: Tokenizer used to turn each batch into padded tensors.
        device: Device the tokenized tensors are moved to before the forward pass.
        batch_size: Number of texts encoded per forward pass.
        prefix: String prepended to every text before tokenization. Defaults to
            ``"passage: "`` (the previous hard-coded behaviour); pass
            ``"query: "`` to embed texts the same way ``classify_with_knn``
            embeds its query.

    Returns:
        A 2-D NumPy array with one normalized embedding row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is not positive.
    """
    # Fail early with a clear message instead of the opaque np.vstack error
    # that an empty batch list would produce.
    if len(texts) == 0:
        raise ValueError("texts must contain at least one item")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    embeddings = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[start:start + batch_size]

        # Prepend the role prefix to every text in the batch.
        batch_texts_with_prefix = [f"{prefix}{text}" for text in batch_texts]

        # Tokenize, padding to the longest item and truncating at 512 tokens.
        batch_dict = tokenizer(batch_texts_with_prefix,
                               max_length=512,
                               padding=True,
                               truncation=True,
                               return_tensors='pt')

        # Move every tensor to the target device.
        batch_dict = {name: tensor.to(device) for name, tensor in batch_dict.items()}

        # Forward pass without gradient tracking, then pool and normalize.
        with torch.no_grad():
            outputs = model(**batch_dict)
            batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            # Unit-length rows, so an inner product equals cosine similarity.
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
            embeddings.append(batch_embeddings.cpu().numpy())

    return np.vstack(embeddings)

In [None]:
# Encode the string categories as integer class ids.
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'Classes: {le.classes_}')

# Embed every message in the dataset.
print(f"Generating embeddings for {len(messages)} messages...")
X_embeddings = get_embeddings(messages, model, tokenizer, device)
print(f"Embeddings shape: {X_embeddings.shape}")

# One metadata record per message, in the same order as `messages`.
metadata = [
    {
        'index': position,
        'message': text,
        'label': raw_label,
        'label_encoded': y[position],
    }
    for position, (text, raw_label) in enumerate(zip(messages, labels))
]

print(f"Created metadata for {len(metadata)} documents")

### **3.3. Tạo FAISS index và chia dữ liệu**

In [None]:
# Hold out 10% of the samples for testing; the remaining 90% populate the index.
TEST_SIZE = 0.1
SEED = 42

# Split row positions (not the data itself) so embeddings, labels and metadata
# can all be selected with the same index lists. The split is stratified on y.
train_indices, test_indices = train_test_split(
    range(len(messages)), test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# Select the matching rows from each parallel structure.
X_train_emb, X_test_emb = X_embeddings[train_indices], X_embeddings[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

train_metadata = [metadata[position] for position in train_indices]
test_metadata = [metadata[position] for position in test_indices]

print(f"Train size: {len(X_train_emb)}")
print(f"Test size: {len(X_test_emb)}")
print(f"Train label distribution: {np.bincount(y_train)}")
print(f"Test label distribution: {np.bincount(y_test)}")

# Build a flat inner-product index over the training embeddings; the vectors
# were L2-normalized when generated, so inner product acts as cosine similarity.
_, embedding_dim = X_train_emb.shape
index = faiss.IndexFlatIP(embedding_dim)
index.add(X_train_emb.astype('float32'))

print(f"FAISS index created with {index.ntotal} vectors")

## **4. Implement classification với embedding similarity**

In [None]:
def classify_with_knn(query_text, model, tokenizer, device, index, train_metadata, k=1):
    """Classify a text by majority vote over its k nearest indexed neighbors.

    Args:
        query_text: Text to classify.
        model: Encoder called as ``model(**batch)``; its output must expose
            ``last_hidden_state``.
        tokenizer: Tokenizer used to encode the query.
        device: Device the tokenized tensors are moved to.
        index: FAISS index to search.
        train_metadata: List of dicts with 'label' and 'message' keys, addressed
            by the ids the index returns.
        k: Number of nearest neighbors to retrieve (must be >= 1).

    Returns:
        ``(final_prediction, neighbor_info)`` where ``final_prediction`` is the
        most frequent neighbor label (``np.unique`` sorts labels, so a tie goes
        to the label that sorts first) and ``neighbor_info`` is a list of dicts
        with 'score', 'label' and 'message' (messages longer than 100
        characters are truncated and suffixed with "...").

    Raises:
        ValueError: If ``k`` is smaller than 1 or the index returns no neighbor.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Embed the query with the "query: " prefix.
    query_with_prefix = f"query: {query_text}"
    batch_dict = tokenizer([query_with_prefix],
                           max_length=512,
                           padding=True,
                           truncation=True,
                           return_tensors='pt')

    # Loop names deliberately differ from the `k` parameter to avoid confusion.
    batch_dict = {name: tensor.to(device) for name, tensor in batch_dict.items()}

    with torch.no_grad():
        outputs = model(**batch_dict)
        query_embedding = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        query_embedding = F.normalize(query_embedding, p=2, dim=1)
        query_embedding = query_embedding.cpu().numpy().astype('float32')

    # Search in FAISS index
    scores, indices = index.search(query_embedding, k)

    # Collect labels and details of the retrieved neighbors.
    predictions = []
    neighbor_info = []

    for neighbor_idx, neighbor_score in zip(indices[0], scores[0]):
        # FAISS fills missing results with id -1 when the index holds fewer
        # than k vectors; indexing a list with -1 would silently pick the last
        # training item, so skip those slots.
        if neighbor_idx < 0:
            continue

        neighbor = train_metadata[int(neighbor_idx)]
        neighbor_label = neighbor['label']
        neighbor_message = neighbor['message']

        predictions.append(neighbor_label)
        neighbor_info.append({
            'score': float(neighbor_score),
            'label': neighbor_label,
            'message': (neighbor_message[:100] + "...") if len(neighbor_message) > 100 else neighbor_message
        })

    if not predictions:
        raise ValueError("index returned no neighbors; is the index empty?")

    # Majority vote for final prediction
    unique_labels, counts = np.unique(predictions, return_counts=True)
    final_prediction = unique_labels[np.argmax(counts)]

    return final_prediction, neighbor_info

def evaluate_knn_accuracy(test_embeddings, test_labels, test_metadata, index, train_metadata, k_values=(1, 3, 5)):
    """Evaluate k-NN accuracy for several k values using precomputed embeddings.

    Args:
        test_embeddings: Array with one query embedding per row.
        test_labels: Unused; the ground-truth label is read from
            ``test_metadata[i]['label']``. Kept so existing callers keep working.
        test_metadata: List of dicts with 'index', 'message' and 'label' keys,
            in the same order as the rows of ``test_embeddings``.
        index: FAISS index to search.
        train_metadata: List of dicts with 'label' and 'message' keys, addressed
            by the ids the index returns.
        k_values: Iterable of neighbor counts to evaluate (each must be >= 1).

    Returns:
        ``(results, all_errors)``: ``results`` maps each k to its accuracy and
        ``all_errors`` maps each k to a list of dicts describing every
        misclassified sample (message, true/predicted label, neighbors and the
        label distribution among them).

    Raises:
        ValueError: If there are no test embeddings, ``test_metadata`` has fewer
            entries than ``test_embeddings``, any k is smaller than 1, or the
            index returns no neighbor for a query.
    """
    total = len(test_embeddings)
    if total == 0:
        # Accuracy is undefined without samples (this used to surface as a
        # ZeroDivisionError after the loop).
        raise ValueError("test_embeddings is empty; nothing to evaluate")
    if len(test_metadata) < total:
        raise ValueError(
            f"test_metadata has {len(test_metadata)} entries but there are {total} test embeddings"
        )

    k_values = list(k_values)
    invalid_k = [k for k in k_values if k < 1]
    if invalid_k:
        raise ValueError(f"k values must be positive integers, got {invalid_k}")

    # Cast once up front instead of once per query per k.
    queries = np.ascontiguousarray(test_embeddings, dtype='float32')

    results = {}
    all_errors = {}

    for k in k_values:
        correct = 0
        errors = []

        for i in tqdm(range(total), desc=f"Evaluating k={k}"):
            query_embedding = queries[i:i+1]
            true_label = test_metadata[i]['label']
            true_message = test_metadata[i]['message']

            # Search in FAISS index
            scores, indices = index.search(query_embedding, k)

            # Collect labels and details of the retrieved neighbors.
            predictions = []
            neighbor_details = []
            for neighbor_idx, neighbor_score in zip(indices[0], scores[0]):
                # FAISS fills missing results with id -1 when the index holds
                # fewer than k vectors; skip them rather than letting -1 pick
                # the last training item.
                if neighbor_idx < 0:
                    continue

                neighbor = train_metadata[int(neighbor_idx)]
                predictions.append(neighbor['label'])
                neighbor_details.append({
                    'label': neighbor['label'],
                    'message': neighbor['message'],
                    'score': float(neighbor_score)
                })

            if not predictions:
                raise ValueError("index returned no neighbors; is the index empty?")

            # Majority vote (np.unique sorts labels, so ties go to the label
            # that sorts first).
            unique_labels, counts = np.unique(predictions, return_counts=True)
            predicted_label = unique_labels[np.argmax(counts)]

            if predicted_label == true_label:
                correct += 1
            else:
                # Collect error information
                errors.append({
                    'index': i,
                    'original_index': test_metadata[i]['index'],
                    'message': true_message,
                    'true_label': true_label,
                    'predicted_label': predicted_label,
                    'neighbors': neighbor_details,
                    'label_distribution': {label: int(count) for label, count in zip(unique_labels, counts)}
                })

        accuracy = correct / total
        error_count = total - correct

        results[k] = accuracy
        all_errors[k] = errors

        print(f"Accuracy with k={k}: {accuracy:.4f}")
        print(f"Number of errors with k={k}: {error_count}/{total} ({(error_count/total)*100:.2f}%)")

    return results, all_errors

## **5. Đánh giá accuracy trên test set**

In [None]:
%%time
# Run the k-NN evaluation on the held-out set for k = 1, 3 and 5.
print("Evaluating accuracy on test set...")
accuracy_results, error_results = evaluate_knn_accuracy(
    X_test_emb, y_test, test_metadata, index, train_metadata, k_values=[1, 3, 5]
)

# Print an accuracy table framed by separator lines.
separator = "="*50
print("\n" + separator)
print("ACCURACY RESULTS")
print(separator)
for k, accuracy in accuracy_results.items():
    print(f"Top-{k} accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(separator)

# Assemble the error report that is written to disk below.
import json
from datetime import datetime

test_count = len(X_test_emb)
error_analysis = {
    'timestamp': datetime.now().isoformat(),
    'model': MODEL_NAME,
    'test_size': test_count,
    'accuracy_results': accuracy_results,
    # One entry per evaluated k, keyed as "k_<value>".
    'errors_by_k': {
        f'k_{k}': {
            'total_errors': len(errors),
            'error_rate': len(errors) / test_count,
            'errors': errors,
        }
        for k, errors in error_results.items()
    },
}

# Write the report as UTF-8 JSON, keeping non-ASCII characters readable.
output_file = 'error_analysis.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(error_analysis, f, ensure_ascii=False, indent=2)

print(f"\n***Error analysis saved to: {output_file}***")
print()
print(f"***Summary:")
for k, errors in error_results.items():
    print(f"   k={k}: {len(errors)} errors out of {test_count} samples")


## **6. Pipeline classification cho user input**

In [None]:
def spam_classifier_pipeline(user_input, k=3):
    """
    Classify one message with the k-NN classifier and print a readable report.

    Relies on the notebook-level ``model``, ``tokenizer``, ``device``, ``index``
    and ``train_metadata`` objects.

    Args:
        user_input (str): Text to classify
        k (int): Number of nearest neighbors to consider

    Returns:
        dict: 'prediction' (winning label), 'neighbors' (details of the
        retrieved neighbors) and 'label_distribution' (label -> count among
        those neighbors)
    """

    # Header: each banner line is preceded by a blank line, with one more after.
    banners = (
        f"***Classifying: '{user_input}'",
        f"***Using top-{k} nearest neighbors",
    )
    for banner in banners:
        print()
        print(banner)
    print()

    # Retrieve the neighbors and the majority-vote label.
    prediction, neighbors = classify_with_knn(
        user_input, model, tokenizer, device, index, train_metadata, k=k
    )

    print(f"***Prediction: {prediction.upper()}")
    print()

    # List every neighbor with its rank, label, similarity score and text.
    print("***Top neighbors:")
    for rank, neighbor in enumerate(neighbors, start=1):
        print(f"{rank}. Label: {neighbor['label']} | Score: {neighbor['score']:.4f}")
        print(f"   Message: {neighbor['message']}")
        print()

    # Tally how many neighbors carry each label.
    neighbor_labels = [neighbor['label'] for neighbor in neighbors]
    label_counts = {}
    for label in set(neighbor_labels):
        label_counts[label] = neighbor_labels.count(label)

    return {
        'prediction': prediction,
        'neighbors': neighbors,
        'label_distribution': label_counts
    }

## **7. Test pipeline với các ví dụ**

In [None]:
# Sample messages to run through the pipeline; uncomment entries to try more.
test_examples = [
    "I am actually thinking a way of doing something useful",
    "FREE!! Click here to win $1000 NOW! Limited time offer!",
    # "Hey, can you pick me up at 5pm today?",
    # "URGENT: Your account will be suspended unless you verify your details NOW",
    # "Thanks for the meeting today, let's schedule the next one for next week",
    # "Congratulations! You've won a prize! Call this number to claim it"
]

print("Testing pipeline with different examples:")
print()

# Classify each example with its 3 nearest neighbors, numbering from 1.
for example_number, example in enumerate(test_examples, start=1):
    print(f"\n***Example {example_number}:")
    result = spam_classifier_pipeline(example, k=3)
    print()

In [None]:
# Interactive testing - edit the text and k value below, then re-run the cell
print("***Interactive Testing")
print()

# Change these two values to try the classifier on other inputs
user_text = "Win a free iPhone! Click here now!"
k_value = 5

print(f"***Testing with k={k_value}")
result = spam_classifier_pipeline(user_text, k=k_value)

# Usage hints, printed one per line.
for hint in (
    "***To test with different inputs:",
    "1. Change 'user_text' variable above",
    "2. Change 'k_value' for different number of neighbors",
    "3. Re-run this cell",
):
    print(hint)