In [None]:
# VNPT MONEY FAQ - FINE-TUNING NOTEBOOK
# =====================================
#
# This notebook uses the data in paraphrase_documents_clean.json to fine-tune the
# Vietnamese SBERT model with MNRL (Multiple Negatives Ranking Loss).
#
# HOW TO USE:
# 1. Run the dependency-install cell
# 2. Run the file-check cell
# 3. Run the parsing-test cell
# 4. Run the cell that writes the fine-tuning script
# 5. Run the fine-tuning step (execute the generated finetune_mnrl_kaggle.py)
#
# REQUIREMENTS:
# - GPU (T4 or P100)
# - A dataset containing paraphrase_documents_clean.json
#
# RESULT:
# - The model is saved in /kaggle/working/vnpt-sbert-mnrl/

print("üìò VNPT Money FAQ Fine-tuning Notebook")
print("=" * 50)
# Show the path the later cells actually read from (the Kaggle input mount).
print("‚úÖ S·ª≠ d·ª•ng d·ªØ li·ªáu t·ª´: /kaggle/input/vnpt-money-faq-data/paraphrase_documents_clean.json")
print("‚úÖ Base model: keepitreal/vietnamese-sbert")
print("‚úÖ Method: MNRL (Multiple Negatives Ranking Loss)")
print("=" * 50)

In [None]:
%%time
# Install the required packages with compatible versions.
# NOTE(review): only sentence-transformers is pinned here; torch and transformers
# resolve to whatever the environment provides — confirm they work with 2.2.2.
!pip install -q sentence-transformers==2.2.2 torch transformers

print("‚úÖ Packages installed!")

In [None]:
# Check that paraphrase_documents_clean.json is available from the Kaggle dataset
import os
import json

# Path on Kaggle
json_file = "/kaggle/input/vnpt-money-faq-data/paraphrase_documents_clean.json"

if os.path.exists(json_file):
    file_size = os.path.getsize(json_file) / 1024  # KB
    print(f"‚úÖ Found {json_file}")
    print(f"   Size: {file_size:.1f} KB")

    # Read the file and show basic information about the data
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"   Total documents: {len(data)}")
    print(f"\nüìã Sample document:")
    if len(data) > 0:
        sample = data[0]
        # Look fields up defensively: the parser in the next cell skips entries
        # with missing metadata, so this preview should not crash on them either.
        sample_meta = sample.get('metadata', {}) if isinstance(sample, dict) else {}
        print(f"   Question: {str(sample_meta.get('question', 'N/A'))[:80]}...")
        print(f"   Answer: {str(sample_meta.get('answer', 'N/A'))[:80]}...")
        print(f"   Section: {sample_meta.get('section', 'N/A')}")
        print(f"   Source: {sample_meta.get('source', 'N/A')}")
else:
    print(f"‚ùå File not found: {json_file}")
    print("   Please make sure you added the dataset with paraphrase_documents_clean.json")

In [None]:
# Test parsing paraphrase_documents_clean.json
import json
from typing import List, Dict

def parse_documents_from_json(file_path: str) -> List[Dict]:
    """Load question/answer documents from a paraphrase_documents_clean.json file.

    Each entry's ``metadata`` mapping supplies the fields. Entries whose stripped
    question or answer is empty are dropped; entries that raise while being read
    are reported on stdout and dropped.

    Args:
        file_path: Path of the JSON file to read (UTF-8).

    Returns:
        A list of dicts with keys ``question``, ``answer``, ``section``,
        ``sheet_name``, ``source`` and ``type``; the last four default to ``''``.
    """
    print(f"üìÇ Reading documents from: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    print(f"   Total entries: {len(entries)}")

    # Metadata fields copied through as-is, falling back to an empty string.
    passthrough_fields = ('section', 'sheet_name', 'source', 'type')
    parsed = []

    for number, entry in enumerate(entries, start=1):
        try:
            meta = entry.get('metadata', {})
            question_text = meta.get('question', '').strip()
            answer_text = meta.get('answer', '').strip()

            # Keep only entries that carry both a question and an answer.
            if question_text and answer_text:
                record = {'question': question_text, 'answer': answer_text}
                record.update((field, meta.get(field, '')) for field in passthrough_fields)
                parsed.append(record)
        except Exception as err:
            print(f"‚ö†Ô∏è Error parsing document {number}: {err}")

    print(f"‚úÖ Successfully parsed {len(parsed)} documents\n")
    return parsed

# Try the parser on the file from the Kaggle dataset
json_file = "/kaggle/input/vnpt-money-faq-data/paraphrase_documents_clean.json"
documents = parse_documents_from_json(json_file)

print(f"Total documents loaded: {len(documents)}\n")

# Preview at most the first three parsed documents
for number, doc in enumerate(documents[:3], start=1):
    print(f"Document {number}:")
    print(f"  Q: {doc['question'][:60]}...")
    print(f"  A: {doc['answer'][:60]}...")
    print(f"  Section: {doc['section']}")
    print(f"  Sheet: {doc['sheet_name']}")
    print()

In [None]:
%%writefile finetune_mnrl_kaggle.py
"""
Fine-tune v·ªõi MNRL tr√™n Kaggle GPU
"""
import logging
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from typing import List, Dict, Tuple
import os
import json
import random

# Set WANDB_DISABLED before any training code runs
# (presumably to keep Weights & Biases reporting off — TODO confirm it is honoured).
os.environ.update({"WANDB_DISABLED": "true"})

# INFO-level logging in "LEVEL:logger-name:message" form for the whole script.
logging.basicConfig(format='%(levelname)s:%(name)s:%(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)


def parse_documents_from_json(file_path: str) -> List[Dict]:
    """Load question/answer documents from a paraphrase_documents_clean.json file.

    Each entry's ``metadata`` mapping supplies the fields. Entries whose stripped
    question or answer is empty are dropped; entries that raise while being read
    are logged as warnings and dropped.

    Args:
        file_path: Path of the JSON file to read (UTF-8).

    Returns:
        A list of dicts with keys ``question``, ``answer``, ``section``,
        ``sheet_name``, ``source`` and ``type`` (the last four default to ``''``),
        or an empty list when ``file_path`` does not exist.
    """
    logger.info(f"üìÇ Reading documents from: {file_path}")

    if not os.path.exists(file_path):
        logger.error(f"‚ùå File not found: {file_path}")
        return []

    with open(file_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    logger.info(f"   Total entries: {len(entries)}")

    # Metadata fields copied through as-is, falling back to an empty string.
    passthrough_fields = ('section', 'sheet_name', 'source', 'type')
    parsed = []

    for number, entry in enumerate(entries, start=1):
        try:
            meta = entry.get('metadata', {})
            question_text = meta.get('question', '').strip()
            answer_text = meta.get('answer', '').strip()

            # Keep only entries that carry both a question and an answer.
            if question_text and answer_text:
                record = {'question': question_text, 'answer': answer_text}
                record.update((field, meta.get(field, '')) for field in passthrough_fields)
                parsed.append(record)
        except Exception as err:
            logger.warning(f"‚ö†Ô∏è Error parsing document {number}: {err}")

    logger.info(f"‚úÖ Successfully parsed {len(parsed)} documents")
    return parsed


class MNRLFineTuner:
    """Fine-tune a SentenceTransformer on (question, answer) pairs with MNRL.

    Typical use: construct (loads ``base_model``), call
    ``prepare_training_data`` to split documents into train/validation
    ``InputExample`` lists, then call ``fine_tune`` to train and write the
    model to ``output_path``.
    """

    def __init__(
        self,
        base_model: str = "keepitreal/vietnamese-sbert",
        output_path: str = "vnpt-sbert-mnrl",
    ):
        """Load the base model and log which compute device is available.

        Args:
            base_model: Model name or path passed to ``SentenceTransformer``.
            output_path: Directory later passed to ``fit`` as ``output_path``.
        """
        self.base_model = base_model
        self.output_path = output_path

        logger.info(f"Loading base model: {base_model}")
        self.model = SentenceTransformer(base_model)

        # Imported locally: torch is only needed here, for the device report.
        import torch
        if torch.cuda.is_available():
            logger.info(f"‚úÖ GPU detected: {torch.cuda.get_device_name(0)}")
            logger.info(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        else:
            logger.warning("‚ö†Ô∏è No GPU detected, using CPU")

    @staticmethod
    def _to_examples(docs: List[Dict]) -> List[InputExample]:
        """Turn documents into (question, answer) examples, skipping blank ones."""
        examples = []
        for doc in docs:
            question = doc['question'].strip()
            answer = doc['answer'].strip()
            if question and answer:
                examples.append(InputExample(texts=[question, answer]))
        return examples

    @staticmethod
    def _build_validation_evaluator(validation_examples: List[InputExample]):
        """Build a question -> answer retrieval evaluator from validation pairs.

        Every question becomes a query whose single relevant document is its own
        answer. Identical answer texts share one corpus entry, so questions that
        point at the same answer are all scored as correct hits for it. The
        corpus holds the validation answers only.
        """
        queries = {}
        corpus = {}
        relevant_docs = {}
        answer_ids = {}  # answer text -> corpus id, to de-duplicate the corpus

        for index, example in enumerate(validation_examples):
            question, answer = example.texts[0], example.texts[1]
            if answer not in answer_ids:
                answer_ids[answer] = f"a{len(answer_ids)}"
                corpus[answer_ids[answer]] = answer
            query_id = f"q{index}"
            queries[query_id] = question
            relevant_docs[query_id] = {answer_ids[answer]}

        return evaluation.InformationRetrievalEvaluator(
            queries=queries,
            corpus=corpus,
            relevant_docs=relevant_docs,
            name='validation',
        )

    def prepare_training_data(
        self,
        documents: List[Dict],
        validation_split: float = 0.15
    ) -> Tuple[List[InputExample], List[InputExample]]:
        """Shuffle the documents and split them into train/validation examples.

        The shuffle seeds the global ``random`` module with 42, so the split is
        reproducible. The last ``validation_split`` fraction of the shuffled
        list becomes the validation set.

        Args:
            documents: Dicts with at least ``question`` and ``answer`` strings.
            validation_split: Fraction of documents held out for validation.

        Returns:
            ``(training_examples, validation_examples)``; each example holds
            ``texts=[question, answer]``.
        """
        logger.info(f"\nPreparing MNRL training data from {len(documents)} documents...")
        logger.info(f"Validation split: {validation_split * 100:.0f}%")

        # Shuffle a copy so the caller's list keeps its order.
        random.seed(42)
        shuffled_docs = documents.copy()
        random.shuffle(shuffled_docs)

        split_idx = int(len(shuffled_docs) * (1 - validation_split))
        train_docs = shuffled_docs[:split_idx]
        val_docs = shuffled_docs[split_idx:]

        logger.info(f"   Train: {len(train_docs)} documents")
        logger.info(f"   Validation: {len(val_docs)} documents")

        training_examples = self._to_examples(train_docs)
        validation_examples = self._to_examples(val_docs)

        logger.info(f"‚úÖ Created {len(training_examples)} training examples")
        logger.info(f"‚úÖ Created {len(validation_examples)} validation examples\n")

        return training_examples, validation_examples

    def fine_tune(
        self,
        training_examples: List[InputExample],
        validation_examples: List[InputExample] = None,
        epochs: int = 6,
        batch_size: int = 32,
        learning_rate: float = 2e-5,
        warmup_steps: int = None,
    ):
        """Train ``self.model`` with MultipleNegativesRankingLoss and save it.

        Args:
            training_examples: (question, answer) pairs used for training.
            validation_examples: Optional held-out pairs. When given, the model
                is evaluated on question -> answer retrieval during training.
            epochs: Number of passes over the training data.
            batch_size: Training batch size.
            learning_rate: Learning rate passed to the optimizer.
            warmup_steps: Warmup steps; when ``None``, 10% of the total number
                of training steps is used.
        """
        train_dataloader = DataLoader(
            training_examples,
            shuffle=True,
            batch_size=batch_size
        )

        # Derive warmup from the dataloader length so the final partial batch of
        # every epoch is counted as a step too.
        if warmup_steps is None:
            total_steps = len(train_dataloader) * epochs
            warmup_steps = int(total_steps * 0.1)  # 10% of total steps

        logger.info(f"{'='*60}")
        logger.info(f"Starting MNRL fine-tuning")
        logger.info(f"{'='*60}")
        logger.info(f"Training examples: {len(training_examples)}")
        if validation_examples:
            logger.info(f"Validation examples: {len(validation_examples)}")
        logger.info(f"Epochs: {epochs}")
        logger.info(f"Batch size: {batch_size}")
        logger.info(f"Learning rate: {learning_rate}")
        logger.info(f"Warmup steps: {warmup_steps}")
        logger.info(f"{'='*60}\n")

        train_loss = losses.MultipleNegativesRankingLoss(
            model=self.model,
            scale=20.0
        )

        logger.info("Using MultipleNegativesRankingLoss (scale=20.0)\n")

        # The validation set contains positive pairs only. A correlation-based
        # similarity evaluator with every gold score fixed at 1.0 has no label
        # variance, so its score is undefined (NaN) and cannot select a best
        # model. A retrieval evaluator yields a finite score from the same pairs.
        # NOTE(review): with sentence-transformers 2.2.2 a NaN score appears to
        # mean save_best_model never writes the model — confirm against fit().
        evaluator = None
        if validation_examples:
            evaluator = self._build_validation_evaluator(validation_examples)
            logger.info("‚úÖ Validation evaluator created\n")

        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=epochs,
            warmup_steps=warmup_steps,
            optimizer_params={'lr': learning_rate},
            evaluator=evaluator,
            evaluation_steps=500,  # Evaluate every 500 steps
            output_path=self.output_path,
            save_best_model=True,
            show_progress_bar=True,
        )

        logger.info(f"\n{'='*60}")
        logger.info(f"‚úÖ Fine-tuning completed!")
        logger.info(f"‚úÖ Model saved to: {self.output_path}")
        logger.info(f"{'='*60}\n")


if __name__ == "__main__":
    # Script entry point: load the dataset, build examples, fine-tune, report.
    logger.info("\n" + "="*60)
    logger.info("VNPT MONEY FAQ - FINE-TUNING WITH MNRL")
    logger.info("="*60 + "\n")

    # Load from the Kaggle dataset
    json_file = "/kaggle/input/vnpt-money-faq-data/paraphrase_documents_clean.json"
    # Single source of truth for where the fine-tuned model is written.
    output_path = "/kaggle/working/vnpt-sbert-mnrl"

    # SystemExit is raised directly: the bare `exit` helper is injected by the
    # `site` module for interactive use and is not guaranteed to exist.
    if not os.path.exists(json_file):
        logger.error(f"‚ùå File not found: {json_file}")
        raise SystemExit(1)

    documents = parse_documents_from_json(json_file)

    if not documents:
        logger.error("‚ùå No documents found!")
        raise SystemExit(1)

    logger.info(f"\n‚úÖ Successfully loaded {len(documents)} documents")

    # `documents` is known to be non-empty here, so the first entry is safe to show.
    logger.info("\nüìã Sample document:")
    logger.info(f"   Q: {documents[0]['question'][:80]}...")
    logger.info(f"   A: {documents[0]['answer'][:80]}...")

    finetuner = MNRLFineTuner(
        base_model="keepitreal/vietnamese-sbert",
        output_path=output_path,
    )

    # Prepare data with validation split
    training_examples, validation_examples = finetuner.prepare_training_data(
        documents,
        validation_split=0.15  # 15% for validation
    )

    if not training_examples:
        logger.error("‚ùå No training examples created!")
        raise SystemExit(1)

    # Fine-tune with the tuned parameters
    finetuner.fine_tune(
        training_examples=training_examples,
        validation_examples=validation_examples,
        epochs=6,              # Increased from 3 to 6
        batch_size=32,         # Unchanged, suits the T4 GPU
        learning_rate=2e-5,    # Explicit learning rate
        warmup_steps=None,     # Auto-calculate (10% total steps)
    )

    logger.info("="*60)
    logger.info("‚úÖ ALL DONE!")
    logger.info(f"Model saved to: {output_path}")
    logger.info("Download from Output tab ‚Üí")
    logger.info("="*60)

In [None]:
# Test the model right on Kaggle before downloading it
import os
import subprocess
import sys

from sentence_transformers import SentenceTransformer, util

model_path = "/kaggle/working/vnpt-sbert-mnrl"
training_script = "finetune_mnrl_kaggle.py"

print("="*60)
print("TESTING FINE-TUNED MODEL")
print("="*60)

# None of the visible cells executes the script written by %%writefile, so on a
# fresh run there is no trained model to load. Run the script here when the
# saved model is missing; an existing model is reused as-is.
# NOTE(review): modules.json is assumed to be the marker file that
# SentenceTransformer.save() writes at the model root — confirm for 2.2.2.
if not os.path.exists(os.path.join(model_path, "modules.json")):
    print(f"\nNo saved model at {model_path}; running {training_script} first...")
    subprocess.run([sys.executable, training_script], check=True)

# Load model
print(f"\nüìÇ Loading model from: {model_path}")
model = SentenceTransformer(model_path)
print("‚úÖ Model loaded!\n")

# Test queries
test_queries = [
    "L√†m sao ƒë·ªÉ n·∫°p ti·ªÅn v√†o VNPT Money?",
    "T√¥i mu·ªën r√∫t ti·ªÅn v·ªÅ ng√¢n h√†ng",
    "Ph√≠ chuy·ªÉn ti·ªÅn l√† bao nhi√™u?",
]

print("üß™ Testing v·ªõi m·ªôt s·ªë c√¢u h·ªèi m·∫´u:")
print("-" * 60)

# Encode each query on its own and show the embedding shape and first values.
for query in test_queries:
    embedding = model.encode(query)
    print(f"\nQuery: {query}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"Sample values: {embedding[:5]}")

print("\n" + "="*60)
print("‚úÖ Model ho·∫°t ƒë·ªông t·ªët!")
print("="*60)

In [None]:
# ========================================
# FINAL STEP: COMPRESS THE MODEL AND DOWNLOAD IT
# ========================================

import shutil
import os

model_path = "/kaggle/working/vnpt-sbert-mnrl"

print("="*60)
print("üì¶ CHU·∫®N B·ªä T·∫¢I MODEL V·ªÄ M√ÅY LOCAL")
print("="*60)

if not os.path.exists(model_path):
    print(f"\n‚ùå Kh√¥ng t√¨m th·∫•y model t·∫°i {model_path}")
    print("   H√£y ch·∫°y cell fine-tuning tr∆∞·ªõc!\n")
else:
    # Compress the model into a ZIP
    print(f"\nüì¶ ƒêang n√©n model...")

    # Derive every archive-related name from model_path so the archive name,
    # its root and the zipped directory cannot drift apart.
    working_dir, zip_name = os.path.split(os.path.normpath(model_path))
    archive_base = os.path.join(working_dir, zip_name)

    # The archive is written next to the model directory and contains that
    # directory as its single top-level entry.
    shutil.make_archive(
        base_name=archive_base,
        format='zip',
        root_dir=working_dir,
        base_dir=zip_name
    )

    zip_file = f"{archive_base}.zip"
    size_mb = os.path.getsize(zip_file) / (1024 * 1024)

    print(f"‚úÖ ƒê√£ n√©n xong!")
    print(f"   File: {zip_file}")
    print(f"   Size: {size_mb:.1f} MB")

    print("\n" + "="*60)
    print("üíæ C√ÅCH T·∫¢I V·ªÄ M√ÅY:")
    print("="*60)

    # Option 1: download through the Kaggle UI
    print("\nüìå C√ÅCH 1: T·∫£i t·ª´ Kaggle UI (KHUY·∫æN NGH·ªä)")
    print("-" * 60)
    print("   1. Click v√†o bi·ªÉu t∆∞·ª£ng ‚¨áÔ∏è 'Download' ·ªü g√≥c ph·∫£i tr√™n")
    print(f"   2. Ch·ªçn file '{os.path.basename(zip_file)}'")
    print("   3. Gi·∫£i n√©n v·ªÅ th∆∞ m·ª•c project c·ªßa b·∫°n")

    # Option 2: download with the Kaggle API from a local machine
    print("\nüìå C√ÅCH 2: D√πng Kaggle API (T·ª± ƒë·ªông)")
    print("-" * 60)
    print("   Tr√™n m√°y local, ch·∫°y l·ªánh:")
    print("   ")
    print("   pip install kaggle")
    print(f"   kaggle kernels output <your-username>/vnpt-money-finetuning -p ./")
    print()
    print("   (C·∫ßn setup kaggle.json token tr∆∞·ªõc)")

    print("\n" + "="*60)
    print("‚úÖ S·∫¥N S√ÄNG T·∫¢I V·ªÄ!")
    print("="*60)