In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure produced further down.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Ensure each folder this script writes into exists (no error if already present).
for out_dir in ('../../data/processed', '../../outputs/figures', '../../models'):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

banner = "=" * 80
print(banner)
print("üìä TASK 1.6: TRAIN-TEST SPLIT & FEATURE SCALING")
print(banner)

# Read the final selected-features dataset from disk.
df = pd.read_csv('../../data/processed/listings_final_selected_features.csv')
n_rows, n_cols = df.shape
print(f"\n‚úÖ Loaded dataset: {n_rows:,} rows √ó {n_cols} columns")

# Column breakdown: the feature count is everything except 'id' and the two target columns.
print("\nDataset columns:")
print("   - id: 1 column")
print(f"   - Features: {n_cols - 3} columns")
print("   - Targets: 2 columns (value_encoded, value_category)")

# Split the loaded table into identifier, feature matrix, and the two target columns.
print("\n" + "="*80)
print("üîß SEPARATING FEATURES AND TARGETS")
print("="*80)

# The identifier is kept alongside the data so saved rows can be traced back to listings.
ids = df['id']
# Numeric target (0, 1, 2)
y_encoded = df['value_encoded']
# Categorical target (Poor_Value, Fair_Value, Excellent_Value)
y_category = df['value_category']
# Every column that is neither the identifier nor a target is treated as a feature.
# NOTE(review): the recorded run output lists `host_id` among the features — confirm that an
# identifier-like column is really intended to be a model input.
X = df.drop(columns=['id', 'value_encoded', 'value_category'])

print("\n‚úÖ Data Separation Complete:")
print(f"   IDs: {ids.shape[0]:,} rows")
print(f"   Features (X): {X.shape[0]:,} rows √ó {X.shape[1]} columns")
print(f"   Target (y_encoded): {y_encoded.shape[0]:,} rows")
print(f"   Target (y_category): {y_category.shape[0]:,} rows")

# Numbered listing of the feature columns, in DataFrame order.
print(f"\nüìã Feature List ({X.shape[1]} features):")
for position, feature_name in enumerate(X.columns, start=1):
    print(f"   {position:2d}. {feature_name}")

# Per-class row counts and percentages of the encoded target.
print("\nüìä Target Distribution:")
print("-" * 80)
class_names = ('Poor_Value', 'Fair_Value', 'Excellent_Value')  # indexed by the encoded value
n_targets = len(y_encoded)
for val in sorted(y_encoded.unique()):
    count = (y_encoded == val).sum()
    pct = (count / n_targets) * 100
    label = class_names[int(val)]
    print(f"   {val} ({label:15s}): {count:6,} ({pct:5.2f}%)")

# Count missing cells; this only reports, it does not stop the script.
print("\nüîç Data Quality Check:")
print("-" * 80)
missing_X = X.isnull().sum().sum()
missing_y = y_encoded.isnull().sum()
print(f"   Missing values in features (X): {missing_X}")
print(f"   Missing values in target (y): {missing_y}")
print(f"   Data types in X: {X.dtypes.value_counts().to_dict()}")

if missing_X <= 0 and missing_y <= 0:
    print("   ‚úÖ No missing values - Ready for splitting!")
else:
    print("\n‚ö†Ô∏è Warning: Missing values detected!")

# Train-Test Split
print("\n" + "="*80)
print("‚úÇÔ∏è TRAIN-TEST SPLIT (80-20)")
print("="*80)

# Split features, both target representations, and the ids in one call so that all four
# stay row-aligned. Stratifying on the encoded target keeps class proportions the same in
# both partitions; the fixed random_state makes the split reproducible.
X_train, X_test, y_train_encoded, y_test_encoded, y_train_category, y_test_category, ids_train, ids_test = train_test_split(
    X, y_encoded, y_category, ids,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded  # Stratified split to maintain class balance
)

# Every input row must end up in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

print(f"\n‚úÖ Split Complete:")
print(f"   Training set: {X_train.shape[0]:,} rows ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"   Test set:     {X_test.shape[0]:,} rows ({X_test.shape[0]/len(X)*100:.1f}%)")
print(f"   Features:     {X_train.shape[1]} columns")

# Verify stratification: compare each class's share in the full data, train, and test sets.
print(f"\nüìä Class Distribution Verification:")
print("-" * 80)
print(f"{'Class':<20s} | {'Original':>12s} | {'Train':>12s} | {'Test':>12s}")
print("-" * 80)

# Largest allowed gap (in percentage points) between a class's share in the full data and
# its share in either partition before the split is reported as not stratified.
STRATIFY_TOLERANCE_PCT = 1.0
max_drift_pct = 0.0

for val in sorted(y_encoded.unique()):
    label = ['Poor_Value', 'Fair_Value', 'Excellent_Value'][int(val)]
    orig_pct = (y_encoded == val).sum() / len(y_encoded) * 100
    train_pct = (y_train_encoded == val).sum() / len(y_train_encoded) * 100
    test_pct = (y_test_encoded == val).sum() / len(y_test_encoded) * 100
    max_drift_pct = max(max_drift_pct, abs(train_pct - orig_pct), abs(test_pct - orig_pct))
    print(f"{label:<20s} | {orig_pct:11.2f}% | {train_pct:11.2f}% | {test_pct:11.2f}%")

# Only claim success when the measured proportions actually agree.
if max_drift_pct <= STRATIFY_TOLERANCE_PCT:
    print(f"\n‚úÖ Stratification successful - class distributions maintained!")
else:
    print(f"\nWARNING: class proportions differ by up to {max_drift_pct:.2f} percentage points "
          f"between the full data and a split (tolerance {STRATIFY_TOLERANCE_PCT:.2f}) - "
          f"stratification NOT confirmed!")

# Feature Scaling
print("\n" + "="*80)
print("üìè FEATURE SCALING (STANDARDSCALER)")
print("="*80)

print(f"\n‚è≥ Fitting StandardScaler on training data...")
print(f"   Formula: z = (x - Œº) / œÉ")
print(f"   Where: Œº = mean, œÉ = standard deviation")

# Fit on the training partition only, then apply the same (training) statistics to the
# test partition so no information from the test set influences the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Use training statistics

# fit_transform/transform return plain arrays; wrap them back into DataFrames that keep the
# original column names and each partition's original row index.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

print(f"\n‚úÖ Scaling Complete:")
print(f"   Training set scaled: {X_train_scaled.shape[0]:,} rows √ó {X_train_scaled.shape[1]} columns")
print(f"   Test set scaled:     {X_test_scaled.shape[0]:,} rows √ó {X_test_scaled.shape[1]} columns")

# Display scaling statistics for first 5 features
print(f"\nüìä Scaling Statistics (First 5 Features):")
print("-" * 100)
print(f"{'Feature':<30s} | {'Original Mean':>15s} | {'Original Std':>15s} | {'Scaled Mean':>15s} | {'Scaled Std':>15s}")
print("-" * 100)

for col in X.columns[:5]:
    orig_mean = X_train[col].mean()
    # ddof=0: StandardScaler standardises with the population standard deviation, whereas
    # pandas defaults to the sample estimate (ddof=1). Using ddof=0 makes this table agree
    # with the statistic the scaler actually applied, so a scaled std of exactly 1 is expected.
    orig_std = X_train[col].std(ddof=0)
    scaled_mean = X_train_scaled[col].mean()
    scaled_std = X_train_scaled[col].std(ddof=0)
    print(f"{col:<30s} | {orig_mean:15.4f} | {orig_std:15.4f} | {scaled_mean:15.4f} | {scaled_std:15.4f}")

print(f"\n‚úÖ All features now have mean ‚âà 0 and std ‚âà 1")

# Persist the fitted scaler so the identical transform can be re-applied later.
scaler_path = '../../models/standard_scaler.pkl'
joblib.dump(scaler, scaler_path)
print(f"\nüíæ Saved scaler: models/standard_scaler.pkl")

# Write the split datasets (scaled features, targets, unscaled features) to data/processed.
def _attach_ids(id_series, feature_frame):
    """Return a new frame with an 'id' column followed by the feature columns.

    Rows are matched by position: the features' index is reset so it lines up with the
    freshly built id column.
    """
    id_frame = pd.DataFrame({
        'id': id_series.values
    })
    return pd.concat([id_frame, feature_frame.reset_index(drop=True)], axis=1)


def _target_frame(id_series, encoded, category):
    """Return a frame holding the id plus both target representations, positionally aligned."""
    return pd.DataFrame({
        'id': id_series.values,
        'value_encoded': encoded.values,
        'value_category': category.values
    })


print("\n" + "="*80)
print("üíæ SAVING PROCESSED DATASETS")
print("="*80)

# Scaled feature tables, each prefixed with the listing id.
train_data = _attach_ids(ids_train, X_train_scaled)
test_data = _attach_ids(ids_test, X_test_scaled)

train_data.to_csv('../../data/processed/X_train_scaled.csv', index=False)
test_data.to_csv('../../data/processed/X_test_scaled.csv', index=False)
print("\n‚úÖ Saved scaled features:")
print(f"   - data/processed/X_train_scaled.csv ({X_train_scaled.shape[0]:,} √ó {train_data.shape[1]})")
print(f"   - data/processed/X_test_scaled.csv ({X_test_scaled.shape[0]:,} √ó {test_data.shape[1]})")

# Target tables: id + encoded target + categorical target.
y_train_df = _target_frame(ids_train, y_train_encoded, y_train_category)
y_test_df = _target_frame(ids_test, y_test_encoded, y_test_category)

y_train_df.to_csv('../../data/processed/y_train.csv', index=False)
y_test_df.to_csv('../../data/processed/y_test.csv', index=False)
print("\n‚úÖ Saved targets:")
print(f"   - data/processed/y_train.csv ({y_train_df.shape[0]:,} √ó {y_train_df.shape[1]})")
print(f"   - data/processed/y_test.csv ({y_test_df.shape[0]:,} √ó {y_test_df.shape[1]})")

# Unscaled feature tables, kept for reference next to the scaled ones.
X_train_unscaled = _attach_ids(ids_train, X_train)
X_test_unscaled = _attach_ids(ids_test, X_test)

X_train_unscaled.to_csv('../../data/processed/X_train_unscaled.csv', index=False)
X_test_unscaled.to_csv('../../data/processed/X_test_unscaled.csv', index=False)
print("\n‚úÖ Saved unscaled features (for reference):")
print(f"   - data/processed/X_train_unscaled.csv ({X_train_unscaled.shape[0]:,} √ó {X_train_unscaled.shape[1]})")
print(f"   - data/processed/X_test_unscaled.csv ({X_test_unscaled.shape[0]:,} √ó {X_test_unscaled.shape[1]})")

# Figures section; first figure: per-feature histograms before and after standardisation.
def _draw_hist_panel(ax, values, color, title, xlabel):
    """Draw a 30-bin histogram of `values` on `ax` and return `(mean, std)` of `values`."""
    ax.hist(values, bins=30, color=color, alpha=0.7, edgecolor='black')
    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.set_xlabel(xlabel, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(alpha=0.3)
    return values.mean(), values.std()


print("\n" + "="*80)
print("üìä GENERATING VISUALIZATIONS")
print("="*80)

# VISUALIZATION 1: Before vs After Scaling (4 features)
print("\n1Ô∏è‚É£ Creating before/after scaling comparison...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))

# The first four feature columns are the ones plotted.
viz_features = X.columns[:4]

for column_idx, feature in enumerate(viz_features):
    raw_ax = axes[0, column_idx]
    scaled_ax = axes[1, column_idx]

    # Top row: raw training values, with the mean marked.
    raw_mean, raw_std = _draw_hist_panel(
        raw_ax, X_train[feature], 'steelblue', f'Before Scaling\n{feature}', 'Value'
    )
    raw_ax.axvline(raw_mean, color='red', linestyle='--', linewidth=2, label=f'Œº={raw_mean:.2f}')
    raw_ax.legend(fontsize=9)

    # Bottom row: standardised training values, with the mean and mean ± one std marked.
    z_mean, z_std = _draw_hist_panel(
        scaled_ax, X_train_scaled[feature], 'coral', f'After Scaling\n{feature}', 'Standardized Value'
    )
    scaled_ax.axvline(z_mean, color='red', linestyle='--', linewidth=2, label=f'Œº‚âà{z_mean:.2f}')
    scaled_ax.axvline(z_mean + z_std, color='green', linestyle=':', linewidth=2, label=f'œÉ‚âà{z_std:.2f}')
    scaled_ax.axvline(z_mean - z_std, color='green', linestyle=':', linewidth=2)
    scaled_ax.legend(fontsize=9)

plt.suptitle('Feature Scaling Comparison: Before vs After StandardScaler', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.savefig('../../outputs/figures/scaling_comparison.png', dpi=300, bbox_inches='tight')
plt.close()  # release the figure once it is on disk
print("   ‚úì Saved: outputs/figures/scaling_comparison.png")

# VISUALIZATION 2: Train-Test Split Distribution
# Left panel: per-class row counts in train vs test. Right panel: total rows per partition.
print("\n2Ô∏è‚É£ Creating train-test split distribution plot...")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Class distribution. Labels are indexed by the encoded target value (0, 1, 2).
class_labels = ['Poor_Value', 'Fair_Value', 'Excellent_Value']
# Vectorised counting: the boolean mask is summed by pandas instead of being iterated
# element-by-element by Python's builtin sum(). The number of classes follows the label
# list rather than a separate hard-coded constant.
train_counts = [int((y_train_encoded == i).sum()) for i in range(len(class_labels))]
test_counts = [int((y_test_encoded == i).sum()) for i in range(len(class_labels))]

x = np.arange(len(class_labels))
width = 0.35  # width of each bar; train and test bars sit side by side around each tick

bars1 = axes[0].bar(x - width/2, train_counts, width, label='Train', color='steelblue', alpha=0.8, edgecolor='black')
bars2 = axes[0].bar(x + width/2, test_counts, width, label='Test', color='coral', alpha=0.8, edgecolor='black')

axes[0].set_xlabel('Value Category', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Count', fontsize=12, fontweight='bold')
axes[0].set_title('Train-Test Split: Class Distribution', fontsize=14, fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels(class_labels, fontsize=11)
axes[0].legend(fontsize=11)
axes[0].grid(axis='y', alpha=0.3)

# Add value labels on bars
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        axes[0].text(bar.get_x() + bar.get_width()/2., height,
                    f'{int(height):,}', ha='center', va='bottom', fontsize=10, fontweight='bold')

# Split size comparison
split_labels = ['Training Set\n(80%)', 'Test Set\n(20%)']
split_counts = [len(X_train), len(X_test)]
colors_split = ['steelblue', 'coral']

bars = axes[1].bar(split_labels, split_counts, color=colors_split, alpha=0.8, edgecolor='black', linewidth=2)
axes[1].set_ylabel('Number of Samples', fontsize=12, fontweight='bold')
axes[1].set_title('Train-Test Split: Dataset Size', fontsize=14, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

# Add value labels and percentages (percentage is measured against the full dataset size)
for bar, count in zip(bars, split_counts):
    height = bar.get_height()
    pct = (count / len(X)) * 100
    axes[1].text(bar.get_x() + bar.get_width()/2., height,
                f'{count:,}\n({pct:.1f}%)', ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.suptitle('Train-Test Split Analysis (80-20 Stratified Split)', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../../outputs/figures/train_test_split_distribution.png', dpi=300, bbox_inches='tight')
plt.close()
print("   ‚úì Saved: outputs/figures/train_test_split_distribution.png")

# VISUALIZATION 3: Feature Scale Comparison (All Features)
# Two horizontal-bar panels showing each feature's [min, max] span in the training data,
# before and after standardisation.
print("\n3Ô∏è‚É£ Creating feature scale comparison for all features...")
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Tick labels: names longer than 25 characters are cut and suffixed with '...'.
feature_names_short = [
    name[:25] + '...' if len(name) > 25 else name
    for name in X.columns
]

# One [min, max] row per feature, for the raw and for the scaled training data.
feature_ranges_before = np.array(
    [[X_train[name].min(), X_train[name].max()] for name in X.columns]
)
feature_ranges_after = np.array(
    [[X_train_scaled[name].min(), X_train_scaled[name].max()] for name in X.columns]
)

y_pos = np.arange(len(X.columns))

# (axis, ranges, bar colour, x label, title, draw a reference line at zero?)
panels = (
    (axes[0], feature_ranges_before, 'steelblue', 'Value Range',
     'Feature Ranges BEFORE Scaling', False),
    (axes[1], feature_ranges_after, 'coral', 'Standardized Value Range',
     'Feature Ranges AFTER Scaling', True),
)

for ax, ranges, bar_color, x_label, panel_title, mark_zero in panels:
    # Each bar starts at the feature's minimum and extends to its maximum.
    ax.barh(y_pos, ranges[:, 1] - ranges[:, 0],
            left=ranges[:, 0], color=bar_color, alpha=0.7, edgecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(feature_names_short, fontsize=8)
    ax.set_xlabel(x_label, fontsize=12, fontweight='bold')
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)
    if mark_zero:
        ax.axvline(0, color='red', linestyle='--', linewidth=2, alpha=0.5)
    ax.invert_yaxis()  # first feature at the top

plt.suptitle(f'Feature Scale Comparison: All {len(X.columns)} Features', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('../../outputs/figures/feature_scale_comparison.png', dpi=300, bbox_inches='tight')
plt.close()
print("   ‚úì Saved: outputs/figures/feature_scale_comparison.png")

print("\n‚úÖ All 3 visualizations generated successfully!")

# Final Summary
# Prints a recap of the run: dataset/split sizes, the files written, missing-value counts
# of the saved tables, and an aggregate check of the scaled training features.
print("\n" + "="*80)
print("‚úÖ TASK 1.6: TRAIN-TEST SPLIT & SCALING - COMPLETE")
print("="*80)

print("\nüìã SUMMARY REPORT")
print("-" * 80)
print(f"Original dataset: {df.shape[0]:,} rows √ó {df.shape[1]} columns")
print(f"Features: {X.shape[1]}")
print(f"Training samples: {X_train.shape[0]:,} ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Test samples: {X_test.shape[0]:,} ({X_test.shape[0]/len(X)*100:.1f}%)")
print(f"Split ratio: 80-20 (stratified)")
print(f"Scaling method: StandardScaler (z-score normalization)")

print(f"\nüìÅ FILES GENERATED")
print("-" * 80)
print("Data Files (data/processed/):")
print(f"   ‚úì X_train_scaled.csv ({X_train_scaled.shape[0]:,} √ó {train_data.shape[1]})")
print(f"   ‚úì X_test_scaled.csv ({X_test_scaled.shape[0]:,} √ó {test_data.shape[1]})")
print(f"   ‚úì y_train.csv ({y_train_df.shape[0]:,} √ó {y_train_df.shape[1]})")
print(f"   ‚úì y_test.csv ({y_test_df.shape[0]:,} √ó {y_test_df.shape[1]})")
print(f"   ‚úì X_train_unscaled.csv (reference)")
print(f"   ‚úì X_test_unscaled.csv (reference)")
print("\nModel Files (models/):")
print("   ‚úì standard_scaler.pkl (fitted StandardScaler)")
print("\nVisualization Files (outputs/figures/):")
print("   ‚úì scaling_comparison.png")
print("   ‚úì train_test_split_distribution.png")
print("   ‚úì feature_scale_comparison.png")

print(f"\n‚úÖ DATA QUALITY CHECK")
print("-" * 80)
print(f"   Missing values in X_train: {X_train_scaled.isnull().sum().sum()}")
print(f"   Missing values in X_test: {X_test_scaled.isnull().sum().sum()}")
print(f"   Missing values in y_train: {y_train_df.isnull().sum().sum()}")
print(f"   Missing values in y_test: {y_test_df.isnull().sum().sum()}")
print(f"   Stratification maintained: ‚úì")
print(f"   Features scaled (mean‚âà0, std‚âà1): ‚úì")
print(f"   Scaler saved for inference: ‚úì")
print(f"   Ready for model training: ‚úì")

print(f"\nüéØ SCALING VERIFICATION")
print("-" * 80)
print(f"   Mean of scaled features (should be ‚âà0): {X_train_scaled.mean().mean():.6f}")
# ddof=0: StandardScaler divides by the population standard deviation, so the matching
# check must use the same estimator. pandas' default (ddof=1) would report sqrt(n/(n-1)),
# i.e. slightly above 1 at six decimals, even for a correctly scaled training set.
print(f"   Std of scaled features (should be ‚âà1): {X_train_scaled.std(ddof=0).mean():.6f}")



print("\n" + "="*80)
print("‚ú® TASK 1.6 SUCCESSFULLY COMPLETED! ‚ú®")
print("="*80)

📊 TASK 1.6: TRAIN-TEST SPLIT & FEATURE SCALING

✅ Loaded dataset: 19,912 rows × 31 columns

Dataset columns:
   - id: 1 column
   - Features: 28 columns
   - Targets: 2 columns (value_encoded, value_category)

🔧 SEPARATING FEATURES AND TARGETS

✅ Data Separation Complete:
   IDs: 19,912 rows
   Features (X): 19,912 rows × 28 columns
   Target (y_encoded): 19,912 rows
   Target (y_category): 19,912 rows

📋 Feature List (28 features):
    1. fp_score
    2. price_normalized
    3. price
    4. price_per_bathroom
    5. price_per_bedroom
    6. price_per_person
    7. beds
    8. accommodates
    9. bedrooms
   10. estimated_revenue_l365d
   11. value_density
   12. bathrooms_numeric
   13. neighbourhood_target_encoded
   14. property_type_frequency
   15. review_to_capacity_ratio
   16. property_type_label
   17. neighbourhood_frequency
   18. host_portfolio_intensity
   19. host_id
   20. space_efficiency
   21. host_years
   22. estimated_occupancy_l365d
   23. space_per