<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [2]</a>'.</span>

## 1. Setup & Import Libraries

In [1]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml
import warnings
import sys

# Add project root to path.
# The root is located by walking upward from the working directory (the working
# directory itself is checked first) until a folder containing
# configs/params.yaml is found. A fixed `Path.cwd().parent` is only correct when
# the kernel starts in a direct sub-folder of the project; the recorded papermill
# run failed to find configs/params.yaml under the computed root, presumably
# because it was started from the project root itself (TODO confirm).
_start_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / 'configs' / 'params.yaml').is_file()
    ),
    _start_dir.parent,  # marker not found anywhere: keep the previous behaviour
)
# Make `src.*` importable regardless of where the kernel was started.
sys.path.insert(0, str(project_root))

# Sklearn
from sklearn.model_selection import train_test_split

# Import custom modules
from src.data.loader import load_raw_data, get_data_info
from src.data.cleaner import (
    clean_data, 
    drop_leakage_columns,
    handle_missing_values,
    handle_outliers,
    handle_adr_outliers,
    get_missing_summary,
    save_artifacts,
    encode_categorical,
    scale_numerical
)
from src.features.builder import (
    create_all_features,
    prepare_for_association_rules,
    get_feature_list
)

# Settings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Raise the pandas display limits so wide/long frames are not truncated.
for option_name, option_value in (('display.max_columns', 50), ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

# Plot settings: seaborn-like grid style, default figure size and font size.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 12})

# Confirm the setup cell ran and show which folder is treated as project root.
print("‚úÖ Libraries imported successfully!")
print(f"üìÅ Project root: {project_root}")

‚úÖ Libraries imported successfully!
üìÅ Project root: C:\Coding\DataMining


## 2. Load Configuration & Data

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [2]:
# Load configuration
# The YAML file is expected at <project_root>/configs/params.yaml.
config_path = project_root / 'configs' / 'params.yaml'
if not config_path.is_file():
    # Fail early with an actionable message: the bare open() error only shows
    # the missing path, not which root/working directory produced it.
    raise FileNotFoundError(
        f"Configuration file not found: {config_path} "
        f"(project_root={project_root}, cwd={Path.cwd()}). "
        "Check the directory the notebook is executed from."
    )
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# yaml.safe_load returns None for an empty document; surface that explicitly
# instead of failing later with an opaque TypeError on config['seed'].
if not isinstance(config, dict):
    raise ValueError(f"Configuration file {config_path} is empty or not a mapping")

# Extract parameters (a missing key raises KeyError naming the key).
SEED = config['seed']
TEST_SIZE = config['split']['test_size']
TARGET = config['target']

print(f"‚öôÔ∏è Configuration loaded:")
print(f"   - Random Seed: {SEED}")
print(f"   - Test Size: {TEST_SIZE}")
print(f"   - Target Column: {TARGET}")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Coding\\DataMining\\configs\\params.yaml'

In [None]:
# Load raw data
# Reads <project_root>/data/raw/hotel_bookings.csv into df_raw. Later cells read
# df_raw again for the before/after comparison, so it serves as the baseline.
data_path = project_root / 'data' / 'raw' / 'hotel_bookings.csv'
df_raw = pd.read_csv(data_path)

# Report size; memory_usage(deep=True) also inspects object columns, and
# dividing by 1024**2 converts bytes to MB.
print(f"üìä Raw Data Loaded:")
print(f"   Shape: {df_raw.shape}")
print(f"   Memory Usage: {df_raw.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

## 3. Thống Kê Trước Tiền Xử Lý

Trước khi xử lý, hãy xem tình trạng dữ liệu hiện tại.

In [None]:
# Overview before cleaning
# Prints the raw frame's shape, the class balance of the target column and the
# total number of missing cells. `missing_total` is reused by the summary cell.
print("="*70)
print("üìã TH·ªêNG K√ä TR∆Ø·ªöC TI·ªÄN X·ª¨ L√ù")
print("="*70)

print(f"\nüìä Shape: {df_raw.shape[0]:,} rows √ó {df_raw.shape[1]} columns")

# Target distribution
# Absolute counts and percentages per target value.
target_dist = df_raw[TARGET].value_counts()
target_pct = df_raw[TARGET].value_counts(normalize=True) * 100
print(f"\nüéØ Target Distribution ({TARGET}):")
# [0] / [1] are label lookups on the value_counts index, not positions;
# assumes the target is coded as integers 0 and 1 — TODO confirm.
print(f"   - Not Canceled (0): {target_dist[0]:,} ({target_pct[0]:.2f}%)")
print(f"   - Canceled (1): {target_dist[1]:,} ({target_pct[1]:.2f}%)")
print(f"   ‚Üí Imbalance Ratio: {target_dist[0]/target_dist[1]:.2f}:1")

# Missing values
# First sum() is per column, second sum() totals across all columns.
missing_total = df_raw.isnull().sum().sum()
print(f"\n‚ùå Missing Values Total: {missing_total:,}")

In [None]:
# Detailed missing values
# get_missing_summary comes from src.data.cleaner; its result is treated as a
# frame with a 'Missing_Pct' column that is rendered with two decimals.
missing_summary = get_missing_summary(df_raw)
if len(missing_summary) == 0:
    # Nothing to report.
    print("\n‚úÖ Kh√¥ng c√≥ missing values!")
else:
    print("\nüìã Chi ti·∫øt Missing Values:")
    display(missing_summary.style.format({'Missing_Pct': '{:.2f}%'}))

## 4. Tiền Xử Lý Dữ Liệu

### 4.1. Loại bỏ Data Leakage Columns

⚠️ **QUAN TRỌNG:** Các cột `reservation_status` và `reservation_status_date` chứa thông tin về kết quả đặt phòng (Check-Out/Canceled/No-Show) - đây là thông tin **sau khi** booking đã kết thúc, gây ra **data leakage**.

In [None]:
# Step 1: Drop leakage columns
print("üîí B∆Ø·ªöC 1: Lo·∫°i b·ªè Data Leakage Columns")
print("-"*50)

# Columns that describe the booking outcome and must not reach the model.
leakage_cols = ['reservation_status', 'reservation_status_date']
print("\n‚ö†Ô∏è C√°c c·ªôt g√¢y Data Leakage:")

# Report only the leakage columns that are actually present in the raw frame.
for col in (name for name in leakage_cols if name in df_raw.columns):
    print(f"   - {col}: {df_raw[col].nunique()} unique values")
    # The status column is additionally shown with its full value breakdown.
    if col == 'reservation_status':
        print(f"     Values: {df_raw[col].value_counts().to_dict()}")

# Remove the columns via the project helper and confirm the new shape.
df_step1 = drop_leakage_columns(df_raw, leakage_cols, verbose=True)
print(f"\n‚úÖ Shape sau khi drop: {df_step1.shape}")

### 4.2. Xử Lý Missing Values

In [None]:
# Step 2: Handle missing values
# Takes the leakage-free frame (df_step1) and produces df_step2.
print("üîß B∆Ø·ªöC 2: X·ª≠ l√Ω Missing Values")
print("-"*50)

# Use automatic strategy (defined in cleaner.py)
# NOTE(review): what 'auto' does per column is decided inside
# src.data.cleaner.handle_missing_values and is not visible here.
df_step2 = handle_missing_values(df_step1, strategy='auto', verbose=True)

# Verify no missing values remain
# Only prints the remaining count; it does not raise if the count is non-zero.
remaining_missing = df_step2.isnull().sum().sum()
print(f"\n‚úÖ Missing values c√≤n l·∫°i: {remaining_missing}")

### 4.3. Xử Lý Outliers

In [None]:
# Step 3: Handle outliers
# Two passes: a dedicated pass for the 'adr' column (df_step2 -> df_step3a),
# then a generic IQR pass over the result (df_step3a -> df_step3).
print("üìà B∆Ø·ªöC 3: X·ª≠ l√Ω Outliers")
print("-"*50)

# Special handling for ADR (Average Daily Rate)
# Show the raw range first so the effect of the bounds below can be judged.
print("\nüè∑Ô∏è ADR Statistics Before:")
print(f"   Min: {df_step2['adr'].min():.2f}")
print(f"   Max: {df_step2['adr'].max():.2f}")
print(f"   Mean: {df_step2['adr'].mean():.2f}")
print(f"   Negative values: {(df_step2['adr'] < 0).sum()}")

# Handle ADR outliers
# NOTE(review): the 0 / 5000 bounds are hard-coded here rather than read from
# params.yaml; whether out-of-range rows are dropped or clipped is decided by
# the helper in src.data.cleaner — confirm there.
df_step3a = handle_adr_outliers(df_step2, min_adr=0, max_adr=5000, verbose=True)

# Handle other outliers using IQR method with capping
# NOTE(review): this runs on the full data set before the train/test split in
# section 7, so any statistics the helper derives would include the future
# test rows — confirm this is acceptable.
df_step3 = handle_outliers(
    df_step3a,
    method='iqr',
    threshold=1.5,
    strategy='cap',
    verbose=True
)

print(f"\n‚úÖ Shape sau khi x·ª≠ l√Ω outliers: {df_step3.shape}")

## 5. Feature Engineering

Tạo các features mới từ dữ liệu đã làm sạch.

In [None]:
# Apply all feature engineering
# Build df_features from the cleaned frame (df_step3). The helper from
# src.features.builder receives the params.yaml path as a string.
df_features = create_all_features(df_step3, config_path=str(config_path), verbose=True)

In [None]:
# List all new features
# get_feature_list() is iterated as a mapping of category -> feature names.
feature_list = get_feature_list()

print("\nüìã DANH S√ÅCH FEATURES M·ªöI:")
print("="*50)
for category, features in feature_list.items():
    print(f"\nüîπ {category.upper()}:")
    for feat in features:
        # Flag declared features that did not end up in the engineered frame.
        if feat not in df_features.columns:
            print(f"   ‚úó {feat} (not created)")
            continue
        print(f"   ‚úì {feat}")

In [None]:
# Quick statistics of new features
# For each listed feature present in df_features, print either the share of
# ones (features with at most two distinct values) or the mean (all others).
new_features = [
    'total_guests', 'total_nights', 'is_family',
    'is_summer', 'is_peak_season',
    'has_canceled_before', 'is_returning_customer',
    'room_type_changed', 'deposit_required'
]

print("\nüìä TH·ªêNG K√ä FEATURES M·ªöI:")
print("="*60)

for feat in new_features:
    if feat not in df_features.columns:
        continue
    feature_values = df_features[feat]
    # Accept every numeric dtype (int32, uint8, bool, float32, ...). Comparing
    # against ['int64', 'float64'] silently skipped features stored with any
    # other width, e.g. int32, the default integer on Windows with NumPy < 2.
    if not pd.api.types.is_numeric_dtype(feature_values):
        continue
    if feature_values.nunique() <= 2:  # Binary feature
        # The mean of a 0/1 (or boolean) column is the share of ones.
        pct = feature_values.mean() * 100
        print(f"{feat:30s}: {pct:6.2f}% = 1")
    else:
        mean = feature_values.mean()
        print(f"{feat:30s}: mean = {mean:.2f}")

## 6. Thống Kê Sau Tiền Xử Lý

In [None]:
# Comparison: Before vs After
print("="*70)
print("üìä SO S√ÅNH TR∆Ø·ªöC - SAU TI·ªÄN X·ª¨ L√ù")
print("="*70)

# One row per metric: (label, value on the raw frame, value on the engineered
# frame). Counts are pre-formatted with thousands separators, memory in MB.
comparison = pd.DataFrame(
    [
        (
            'Rows',
            f"{df_raw.shape[0]:,}",
            f"{df_features.shape[0]:,}",
        ),
        (
            'Columns',
            df_raw.shape[1],
            df_features.shape[1],
        ),
        (
            'Missing Values',
            f"{df_raw.isnull().sum().sum():,}",
            f"{df_features.isnull().sum().sum():,}",
        ),
        (
            'Memory (MB)',
            f"{df_raw.memory_usage(deep=True).sum() / 1024**2:.2f}",
            f"{df_features.memory_usage(deep=True).sum() / 1024**2:.2f}",
        ),
    ],
    columns=['Metric', 'Before', 'After'],
)

display(comparison)

In [None]:
# Columns removed and added
# Set differences between the raw and the engineered column names; both sets
# are reused by the summary cell at the end of the notebook.
removed_cols = set(df_raw.columns).difference(df_features.columns)
added_cols = set(df_features.columns).difference(df_raw.columns)

# Print each group alphabetically so the listing is stable between runs.
print(f"\nüóëÔ∏è Columns REMOVED ({len(removed_cols)}):")
for col in sorted(removed_cols):
    print(f"   - {col}")

print(f"\n‚ûï Columns ADDED ({len(added_cols)}):")
for col in sorted(added_cols):
    print(f"   + {col}")

## 7. Train/Test Split

In [None]:
# Separate features and target
# y is the target column; X is every other column of the engineered frame.
y = df_features[TARGET]
X = df_features.drop(columns=TARGET)

print(f"üìä Features shape: {X.shape}")
print(f"üéØ Target shape: {y.shape}")
print(f"\nüéØ Target distribution:")
print(y.value_counts())

In [None]:
# Train/Test split with stratification
# test_size and random_state come from params.yaml (TEST_SIZE, SEED);
# stratify=y is passed so the split is stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=y
)

print(f"üìä TRAIN/TEST SPLIT (Stratified)")
print("="*50)
print(f"\nüîπ Training Set:")
print(f"   X_train: {X_train.shape}")
print(f"   y_train: {y_train.shape}")
# mean() of the target is reported as the canceled share;
# assumes a 0/1 coded target — TODO confirm.
print(f"   Canceled ratio: {y_train.mean()*100:.2f}%")

print(f"\nüîπ Test Set:")
print(f"   X_test: {X_test.shape}")
print(f"   y_test: {y_test.shape}")
print(f"   Canceled ratio: {y_test.mean()*100:.2f}%")

## 8. Xử Lý Imbalance (SMOTE)

Dataset có tỷ lệ huỷ ~37%, không quá nghiêm trọng nhưng vẫn nên xử lý.

In [None]:
# Try to import SMOTE
try:
    from imblearn.over_sampling import SMOTE
    SMOTE_AVAILABLE = True
    print("‚úÖ SMOTE is available")
except ImportError:
    SMOTE_AVAILABLE = False
    print("‚ö†Ô∏è imbalanced-learn not installed. Run: pip install imbalanced-learn")
    print("   Skipping SMOTE. Will use class_weight instead.")

In [None]:
# Select only numerical columns for SMOTE
# SMOTE requires numerical data, so the training columns are split by dtype:
# numeric ones, and object/category ones that still need encoding.
numerical_cols = list(X_train.select_dtypes(include=np.number).columns)
categorical_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)

print(f"üìä Numerical columns: {len(numerical_cols)}")
print(f"üìä Categorical columns: {len(categorical_cols)}")

# Preview at most the first five categorical column names.
if len(categorical_cols) > 0:
    print(f"\n‚ö†Ô∏è Categorical columns found: {categorical_cols[:5]}...")
    print("   Need to encode before SMOTE")

In [None]:
# Encode categorical for SMOTE (if any)
if categorical_cols:
    # One-hot encode categorical columns
    X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
    X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)
    
    # Align columns (ensure same columns in train and test)
    missing_cols = set(X_train_encoded.columns) - set(X_test_encoded.columns)
    for col in missing_cols:
        X_test_encoded[col] = 0
    
    extra_cols = set(X_test_encoded.columns) - set(X_train_encoded.columns)
    X_test_encoded = X_test_encoded.drop(columns=list(extra_cols))
    
    # Ensure same column order
    X_test_encoded = X_test_encoded[X_train_encoded.columns]
    
    print(f"‚úÖ Encoded shapes:")
    print(f"   X_train_encoded: {X_train_encoded.shape}")
    print(f"   X_test_encoded: {X_test_encoded.shape}")
else:
    X_train_encoded = X_train
    X_test_encoded = X_test

In [None]:
# Apply SMOTE if available
if SMOTE_AVAILABLE:
    print("üîÑ Applying SMOTE...")
    print("-"*50)
    
    # Drop non-numeric columns for SMOTE
    X_train_numeric = X_train_encoded.select_dtypes(include=[np.number])
    
    # Handle any remaining missing values
    X_train_numeric = X_train_numeric.fillna(X_train_numeric.median())
    
    smote = SMOTE(random_state=SEED)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_numeric, y_train)
    
    print(f"\nüìä Before SMOTE:")
    print(f"   X_train: {X_train_numeric.shape}")
    print(f"   Class 0: {(y_train == 0).sum():,}")
    print(f"   Class 1: {(y_train == 1).sum():,}")
    
    print(f"\nüìä After SMOTE:")
    print(f"   X_train_resampled: {X_train_resampled.shape}")
    print(f"   Class 0: {(y_train_resampled == 0).sum():,}")
    print(f"   Class 1: {(y_train_resampled == 1).sum():,}")
else:
    X_train_resampled = X_train_encoded.select_dtypes(include=[np.number])
    y_train_resampled = y_train
    print("‚ö†Ô∏è SMOTE skipped. Using original data.")

## 9. Chuẩn Bị Data cho Association Rules

In [None]:
# Prepare data for association rules
# Built from the full engineered frame (df_features), not from the train split.
# The transformation itself lives in src.features.builder and is not visible here.
df_association = prepare_for_association_rules(df_features, verbose=True)

# Show the resulting size and the first ten column names as a sanity check.
print(f"\nüìä Association Rules Data:")
print(f"   Shape: {df_association.shape}")
print(f"   Sample columns: {list(df_association.columns)[:10]}")

## 10. Lưu Dữ Liệu Đã Xử Lý

In [None]:
# Create processed data directory
# <project_root>/data/processed; parents=True creates missing parent folders
# and exist_ok=True makes the cell safe to re-run.
processed_dir = project_root / 'data' / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)

print(f"üìÅ Saving processed data to: {processed_dir}")
print("="*50)

In [None]:
# Save all datasets
# Every file is written with index=False, so matching X/y files line up by row
# position only; the original row index is not stored.

# 1. Full processed data (with features, before encoding)
df_features.to_csv(processed_dir / 'hotel_bookings_processed.csv', index=False)
print(f"‚úÖ Saved: hotel_bookings_processed.csv ({df_features.shape})")

# 2. Train set (original)
# Features and target are joined column-wise on their shared index.
train_df = pd.concat([X_train, y_train], axis=1)
train_df.to_csv(processed_dir / 'train.csv', index=False)
print(f"‚úÖ Saved: train.csv ({train_df.shape})")

# 3. Test set (original)
test_df = pd.concat([X_test, y_test], axis=1)
test_df.to_csv(processed_dir / 'test.csv', index=False)
print(f"‚úÖ Saved: test.csv ({test_df.shape})")

# 4. Train set encoded (for modeling)
X_train_encoded.to_csv(processed_dir / 'X_train_encoded.csv', index=False)
y_train.to_csv(processed_dir / 'y_train.csv', index=False)
print(f"‚úÖ Saved: X_train_encoded.csv ({X_train_encoded.shape})")
print(f"‚úÖ Saved: y_train.csv ({y_train.shape})")

# 5. Test set encoded (for modeling)
X_test_encoded.to_csv(processed_dir / 'X_test_encoded.csv', index=False)
y_test.to_csv(processed_dir / 'y_test.csv', index=False)
print(f"‚úÖ Saved: X_test_encoded.csv ({X_test_encoded.shape})")
print(f"‚úÖ Saved: y_test.csv ({y_test.shape})")

# 6. Resampled training data (if SMOTE was applied)
# NOTE(review): X_train_resampled is restricted to a dtype-selected subset of
# the encoded columns, while X_test_encoded above is written with all encoded
# columns; consumers may need to align the two column sets.
if SMOTE_AVAILABLE:
    X_train_resampled.to_csv(processed_dir / 'X_train_resampled.csv', index=False)
    # Wrapping in pd.Series with name=TARGET sets the CSV header to the target name.
    pd.Series(y_train_resampled, name=TARGET).to_csv(processed_dir / 'y_train_resampled.csv', index=False)
    print(f"‚úÖ Saved: X_train_resampled.csv ({X_train_resampled.shape})")
    print(f"‚úÖ Saved: y_train_resampled.csv ({len(y_train_resampled)},)")

# 7. Association rules data
df_association.to_csv(processed_dir / 'association_rules_data.csv', index=False)
print(f"‚úÖ Saved: association_rules_data.csv ({df_association.shape})")

## 11. Summary

In [None]:
# Final recap of the notebook: cleaning steps, feature counts, split sizes,
# optional SMOTE effect and the CSV files present in the processed folder.
print("\n" + "="*70)
print("üìù SUMMARY - TI·ªÄN X·ª¨ L√ù & FEATURE ENGINEERING")
print("="*70)

# Measure the missing cells that are actually left instead of printing a
# hard-coded 0, so the summary cannot claim a clean frame that is not clean.
missing_after = int(df_features.isnull().sum().sum())

print("\n‚úÖ TI·ªÄN X·ª¨ L√ù HO√ÄN TH√ÄNH:")
# sorted() gives a stable order; a set intersection has no defined order.
print(f"   1. Lo·∫°i b·ªè Data Leakage columns: {sorted(removed_cols & set(leakage_cols))}")
print(f"   2. X·ª≠ l√Ω Missing values: {missing_total:,} ‚Üí {missing_after:,}")
print(f"   3. X·ª≠ l√Ω Outliers: IQR method v·ªõi capping")

print(f"\n‚úÖ FEATURE ENGINEERING:")
print(f"   - Columns ban ƒë·∫ßu: {df_raw.shape[1]}")
print(f"   - Columns sau x·ª≠ l√Ω: {df_features.shape[1]}")
print(f"   - Features m·ªõi: {len(added_cols)}")

print(f"\n‚úÖ TRAIN/TEST SPLIT:")
# Percentages are the configured split, not recomputed from the row counts.
print(f"   - Train: {len(y_train):,} samples ({(1-TEST_SIZE)*100:.0f}%)")
print(f"   - Test: {len(y_test):,} samples ({TEST_SIZE*100:.0f}%)")

if SMOTE_AVAILABLE:
    print(f"\n‚úÖ SMOTE RESAMPLING:")
    print(f"   - Before: {len(y_train):,} samples")
    print(f"   - After: {len(y_train_resampled):,} samples")

print(f"\n‚úÖ FILES ƒê√É L∆ØU:")
# Lists every CSV currently in the folder (including files left by earlier
# runs); sorted() makes the listing order deterministic.
for csv_path in sorted(processed_dir.glob('*.csv')):
    size_mb = csv_path.stat().st_size / 1024**2
    print(f"   - {csv_path.name}: {size_mb:.2f} MB")

print("\n" + "="*70)
print("üéâ NOTEBOOK HO√ÄN TH√ÄNH!")
print("   Ti·∫øp theo: 03_mining_or_clustering.ipynb")
print("="*70)