# VUG Ablation Studies Only - Kaggle

## 🧪 Mục đích
Chạy **chỉ Ablation Studies** cho VUG model để phân tích tầm quan trọng của từng component.

## 📋 5 Ablation Variants
1. **VUG_wo_constrain** - Loại bỏ constraint loss (L_constrain)
2. **VUG_wo_super** - Loại bỏ supervision loss (L_super)
3. **VUG_wo_user_attn** - Loại bỏ user-level attention (α_u^user = 0)
4. **VUG_wo_item_attn** - Loại bỏ item-level attention (α_u^item = 0)
5. **VUG_full** - Model đầy đủ (baseline)

## ⏱️ Thời gian dự kiến: tổng cộng 2-3 giờ

In [None]:
# Quick setup for the ablation studies only
import subprocess
import sys
import os
import time
import json
import gc
from pathlib import Path

# Install the minimal dependency set for the ablation runs.
# Each requirement is installed through the pip module of the interpreter that
# is executing this cell, so the packages land in the kernel's own environment.
essential_packages = [
    "recbole>=1.1.1",
    "torch>=1.9.0",
    "pandas>=1.3.0",
    "matplotlib>=3.4.0",
]
pip_install_cmd = [sys.executable, "-m", "pip", "install"]
for package in essential_packages:
    # check_call raises CalledProcessError when pip exits with a non-zero
    # status, which aborts the cell on the first failing install.
    subprocess.check_call([*pip_install_cmd, package])
    print(f"‚úÖ {package}")

print("üì¶ Essential packages installed for ablation studies")

In [None]:
# Set up the workspace and copy the VUG source into it
import shutil

# All ablation artefacts are written below this directory; it also becomes
# the current working directory for the rest of the notebook.
work_dir = Path("/kaggle/working/VUG_Ablation")
work_dir.mkdir(exist_ok=True)
os.chdir(work_dir)

# Pick the first entry under the Kaggle input directory whose name contains "vug"
kaggle_input = Path("/kaggle/input")
vug_dataset = None
for candidate in kaggle_input.glob("*vug*"):
    vug_dataset = candidate
    break

if vug_dataset is None:
    print("‚ùå VUG dataset not found. Make sure to add VUG dataset to notebook inputs")
else:
    # Mirror the top level of the dataset into the workspace: directories are
    # merged into any existing copy, plain files are copied with metadata.
    for item in vug_dataset.iterdir():
        if item.is_dir():
            shutil.copytree(item, work_dir / item.name, dirs_exist_ok=True)
        elif item.is_file():
            shutil.copy2(item, work_dir)
    print("‚úÖ VUG source copied for ablation studies")

# Put the workspace first on the import path so the copied sources are importable
sys.path.insert(0, str(work_dir))

# Report GPU availability and start from an empty CUDA cache
import torch
has_gpu = torch.cuda.is_available()
print(f"üéÆ GPU Available: {has_gpu}")
if has_gpu:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    torch.cuda.empty_cache()

In [None]:
# Define the 5 ablation configurations.
# Each entry maps a variant name to a human-readable description and the
# config overrides that are merged on top of `kaggle_config` for that run.
# Insertion order is the order in which the variants are listed below.
ablation_configs = {
    'VUG_wo_constrain': dict(
        description='üö´ Remove constraint loss (L_constrain)',
        config=dict(gen_weight=0.0),
    ),
    # NOTE(review): this variant zeroes gen_weight as well as enhance_weight,
    # i.e. it overlaps with VUG_wo_constrain -- confirm this is intended.
    'VUG_wo_super': dict(
        description='üö´ Remove supervision loss (L_super)',
        config=dict(gen_weight=0.0, enhance_weight=0.0),
    ),
    'VUG_wo_user_attn': dict(
        description='üö´ Remove user-level attention (Œ±_u^user = 0)',
        config=dict(user_weight_attn=0.0),
    ),
    # NOTE(review): presumably the item-level weight is (1 - user_weight_attn),
    # so 1.0 disables item-level attention -- verify against the VUG model.
    'VUG_wo_item_attn': dict(
        description='üö´ Remove item-level attention (Œ±_u^item = 0)',
        config=dict(user_weight_attn=1.0),
    ),
    'VUG_full': dict(
        description='‚úÖ Full VUG model (baseline)',
        config=dict(),
    ),
}

# Shared settings applied to every variant (variant overrides take precedence
# when the two dicts are merged by the run loop).
kaggle_config = dict(
    train_epochs=['BOTH:25', 'TARGET:12'],
    embedding_size=32,
    n_layers=1,
    train_batch_size=512,
    eval_batch_size=1024,
    eval_step=5,
    stopping_step=5,
    learning_rate=0.001,
    reg_weight=1e-3,
    lambda_source=0.8,
    lambda_target=0.8,
    drop_rate=0.2,
    connect_way='concat',
    is_transfer=True,
    enhance_mode='asrealsource',
)

print("üß™ Ablation Study Configurations:")
for name in ablation_configs:
    info = ablation_configs[name]
    print(f"  {name}: {info['description']}")

# Import and verify VUG; a failed import is reported but does not stop the cell
try:
    from recbole_cdr.quick_start import run_recbole_cdr
    from recbole_cdr.model.cross_domain_recommender.vug import VUG
except ImportError as e:
    print(f"\n‚ùå Import failed: {e}")
else:
    print("\n‚úÖ VUG model imported successfully")

In [None]:
# RUN ALL ABLATION STUDIES
# Trains every variant in `ablation_configs` one after another and stores one
# record per variant in `ablation_results` (status 'success' or 'failed').
# A failing variant is recorded and the loop continues with the next one.
ablation_results = {}
total_start = time.time()
n_variants = len(ablation_configs)
# Metric names looked up in the test result of each run.
target_metrics = ('HR@10', 'HR@20', 'NDCG@10', 'NDCG@20')

print("üß™ Starting VUG Ablation Studies")
print("="*60)

for i, (variant_name, variant_info) in enumerate(ablation_configs.items(), 1):

    print(f"\nüî¨ [{i}/{n_variants}] Running {variant_name}")
    print(f"üìù {variant_info['description']}")

    start_time = time.time()

    try:
        # Merge configs: variant overrides win over the shared Kaggle settings
        full_config = {**kaggle_config, **variant_info['config']}

        print("üöÄ Training started...")

        # Run VUG with the config modifications
        result = run_recbole_cdr(
            model='VUG',
            config_file_list=[
                './recbole_cdr/properties/dataset/Amazon.yaml',
                './recbole_cdr/properties/model/VUG.yaml'
            ],
            config_dict=full_config
        )

        runtime = time.time() - start_time

        # Extract metrics.
        # NOTE(review): assumes run_recbole_cdr returns a mapping whose
        # result['test_result']['rec'] is keyed by names such as 'HR@10' --
        # confirm against the actual return value of this fork.
        metrics = {}
        if 'test_result' in result and 'rec' in result['test_result']:
            test_metrics = result['test_result']['rec']
            metrics = {
                metric: float(test_metrics[metric])
                for metric in target_metrics
                if metric in test_metrics
            }
        if not metrics:
            # The run is still recorded as successful, but make the missing
            # numbers visible instead of silently producing an empty table.
            print(f"WARNING: {variant_name} returned none of the expected test metrics")

        # Store results
        ablation_results[variant_name] = {
            'variant': variant_name,
            'description': variant_info['description'],
            'runtime_minutes': runtime/60,
            'metrics': metrics,
            'status': 'success'
        }

        print(f"‚úÖ {variant_name} completed in {runtime/60:.1f} minutes")
        print("üìä Results:")
        for metric, value in metrics.items():
            print(f"   {metric}: {value:.4f}")

        # Save individual result
        with open(f'{variant_name}_result.json', 'w') as f:
            json.dump(ablation_results[variant_name], f, indent=2)

    except Exception as e:
        # Top-level boundary of one experiment: record the failure and move on
        print(f"‚ùå {variant_name} failed: {e}")
        ablation_results[variant_name] = {
            'variant': variant_name,
            'description': variant_info['description'],
            'runtime_minutes': (time.time() - start_time)/60,
            'error': str(e),
            'status': 'failed'
        }

    finally:
        # Cleanup runs after failures too, so a crashed variant (e.g. out of
        # GPU memory) does not leave its memory held for the next variant.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

total_time = time.time() - total_start
successful = sum(1 for r in ablation_results.values() if r['status'] == 'success')

print(f"\nüéâ Ablation Studies Completed!")
print(f"‚è±Ô∏è Total time: {total_time/3600:.2f} hours")
print(f"‚úÖ Successful: {successful}/{n_variants} variants")

In [None]:
# CREATE RESULTS TABLE & ANALYSIS
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create results table: one row per successful variant, with the variant's
# metrics spread into their own columns.
table_data = [
    {
        'Variant': variant_name,
        'Description': result['description'],
        'Runtime (min)': result['runtime_minutes'],
        **result['metrics'],
    }
    for variant_name, result in ablation_results.items()
    if result['status'] == 'success'
]

if table_data:
    df = pd.DataFrame(table_data)

    print("üìä VUG Ablation Results")
    print("="*70)
    pd.set_option('display.precision', 4)
    print(df.to_string(index=False))

    # Save CSV
    df.to_csv('ablation_results.csv', index=False)
    print("\nüíæ Results saved to ablation_results.csv")

    # Component importance analysis: relative drop of each ablated variant
    # versus the full model, averaged over HR@10 and NDCG@10.
    if 'VUG_full' in df['Variant'].values:
        print(f"\nüîç Component Importance Analysis")
        print("-"*40)

        baseline = df[df['Variant'] == 'VUG_full'].iloc[0]

        importance_scores = []
        for _, row in df.iterrows():
            if row['Variant'] == 'VUG_full':
                continue

            # Collect the per-metric percentage drops that are well defined
            drops = []
            for metric in ['HR@10', 'NDCG@10']:
                if metric not in df.columns:
                    continue
                base_value = baseline[metric]
                value = row[metric]
                # A missing value (NaN) or a zero baseline makes the relative
                # drop undefined; skip it rather than divide by zero or
                # propagate NaN into the ranking.
                if pd.isna(base_value) or pd.isna(value) or base_value == 0:
                    continue
                drops.append(((base_value - value) / base_value) * 100)

            component = row['Variant'].replace('VUG_wo_', '').replace('_', ' ').title()
            if not drops:
                # Nothing comparable: report it, but keep it out of the ranking
                print(f"{component}: n/a (no comparable metrics)")
                continue

            avg_drop = sum(drops) / len(drops)
            importance_scores.append((component, avg_drop))

            print(f"{component}: {avg_drop:+.2f}% avg drop")

        # Rank components by importance (largest average drop first)
        importance_scores.sort(key=lambda x: x[1], reverse=True)
        print(f"\nüèÜ Component Importance Ranking:")
        for i, (component, score) in enumerate(importance_scores, 1):
            print(f"  {i}. {component}: {score:.2f}% impact")

else:
    print("‚ö†Ô∏è No successful results to analyze")

In [None]:
# CREATE VISUALIZATION
# Draws one bar chart per metric on a 2x2 grid. Ablated variants are red, the
# full model is highlighted in dark green. The figure is saved as a PNG and
# shown inline. Needs a `df` with at least two rows.
if 'df' not in locals() or len(df) <= 1:
    print("üìä Insufficient data for visualization")

else:
    variant_names = list(df['Variant'])

    # Clean variant names for display
    display_names = [
        v.replace('VUG_wo_', 'w/o ').replace('VUG_', '').replace('_', ' ')
        for v in variant_names
    ]

    # Color bars: red for ablated, green for full
    colors = ['red' if 'wo_' in v else 'green' for v in variant_names]

    # Performance comparison plot
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('VUG Ablation Study - Component Impact Analysis', fontsize=16, fontweight='bold')

    metrics = ['HR@10', 'HR@20', 'NDCG@10', 'NDCG@20']

    for i, metric in enumerate(metrics):
        grid_row, grid_col = divmod(i, 2)
        ax = axes[grid_row, grid_col]

        # A metric without a column leaves its subplot empty
        if metric not in df.columns:
            continue

        bars = ax.bar(display_names, df[metric], color=colors, alpha=0.7)

        # Add value labels slightly above the top of each bar
        for bar, value in zip(bars, df[metric]):
            height = bar.get_height()
            label_x = bar.get_x() + bar.get_width()/2.
            label_y = height + height*0.01
            ax.text(label_x, label_y,
                    f'{value:.3f}', ha='center', va='bottom', fontsize=9)

        ax.set_title(f'{metric} Comparison', fontweight='bold')
        ax.set_ylabel(metric)
        ax.tick_params(axis='x', rotation=45, labelsize=9)
        ax.grid(axis='y', alpha=0.3)

        # Highlight full model
        if 'VUG_full' in variant_names:
            full_idx = variant_names.index('VUG_full')
            full_bar = bars[full_idx]
            full_bar.set_color('darkgreen')
            full_bar.set_alpha(1.0)

    plt.tight_layout()
    plt.savefig('ablation_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("üìà Visualization saved to ablation_comparison.png")

In [None]:
# PACKAGE & DOWNLOAD RESULTS
import zipfile
from datetime import datetime

# Create comprehensive results package: a plain-text report plus a zip
# archive holding the per-variant JSON files, the CSV table, the plot (when
# they exist) and the report itself.
now = datetime.now()  # single timestamp so file name and report date agree
timestamp = now.strftime("%Y%m%d_%H%M%S")
zip_filename = f"VUG_Ablation_Results_{timestamp}.zip"
total_variants = len(ablation_results)

# Create detailed report.
# `total_time` is the duration measured when the run loop finished; using it
# keeps analysis/idle time out of the reported runtime and makes the report
# and the console summary show the same number.
report_parts = [f"""VUG Ablation Study Results
{'='*40}

Date: {now.strftime('%Y-%m-%d %H:%M:%S')}
Platform: Kaggle
Total Runtime: {total_time/3600:.2f} hours
Successful Variants: {successful}/{total_variants}

Ablation Results:
{'-'*20}
"""]

for variant_name, result in ablation_results.items():
    if result['status'] == 'success':
        report_parts.append(f"\n{variant_name}:\n")
        report_parts.append(f"  {result['description']}\n")
        report_parts.append(f"  Runtime: {result['runtime_minutes']:.1f} min\n")
        for metric, value in result['metrics'].items():
            report_parts.append(f"  {metric}: {value:.4f}\n")

# List failed variants as well so the report accounts for every run
failed_results = {
    name: res for name, res in ablation_results.items()
    if res['status'] != 'success'
}
if failed_results:
    report_parts.append(f"\nFailed Variants:\n{'-'*20}\n")
    for name, res in failed_results.items():
        report_parts.append(f"\n{name}:\n  Error: {res.get('error', 'unknown')}\n")

report = "".join(report_parts)

# Save report; explicit UTF-8 because the descriptions contain non-ASCII
# characters and the locale default encoding may not be able to encode them.
with open('ablation_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)

# Create zip package (deflate-compressed instead of the default "stored")
with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:

    # Add individual results
    for variant_name in ablation_results:
        json_file = f'{variant_name}_result.json'
        if Path(json_file).exists():
            zipf.write(json_file)

    # Add summary files
    for summary_file in ('ablation_results.csv', 'ablation_comparison.png'):
        if Path(summary_file).exists():
            zipf.write(summary_file)

    zipf.write('ablation_report.txt')

print("üì¶ Results Package Created!")
print("="*40)
print(f"üìÅ Package: {zip_filename}")
print(f"üìä Contains: JSON results, CSV table, plots, report")
print(f"‚úÖ Successful experiments: {successful}/{total_variants}")
print(f"‚è±Ô∏è Total runtime: {total_time/3600:.2f} hours")
print(f"\nüí° Click on '{zip_filename}' above to download results")