#  IMPORTANT NOTE

**This task has been updated to work with landlord-controlled features only.**

## What Changed?

**Previous Version (DEPRECATED):**
- Used `listings_final_selected_features.csv` with review-based features 
- Included review_scores_*, number_of_reviews, etc. (data leakage) 
- Achieved unrealistic ~99% accuracy 

**Current Version (UPDATED):**
- Uses `listings_landlord_features_only.csv` from updated Task 1.5 
- Only landlord-controlled features (no review data) 
- Realistic accuracy expected (~65-75%) 
- Model can predict value for NEW listings 

## Why This Matters

The original feature set included review scores and booking history that don't exist when a landlord first posts a listing. This created **circular logic** - using reviews to predict labels derived from reviews.

The updated pipeline ensures our model can make **realistic predictions** for new listings using only information available at posting time.

---

## Task 1.6: Train-Test Split & Feature Scaling (Updated)

The final step of Week 1 prepares our **landlord-controlled features** for modeling. We implement an 80-20 stratified split so that each value category (Poor, Fair, Excellent) keeps its proportional representation in both the training and test sets—critical for unbiased model evaluation. StandardScaler then standardizes each feature to zero mean and unit variance (as measured on the training set), which matters for scale-sensitive algorithms such as SVM and for gradient-based methods such as neural networks. Crucially, we fit the scaler exclusively on the training data and only then apply it to the test set, preventing data leakage. Both scaled and unscaled datasets are saved alongside the serialized scaler, giving us flexibility for tree-based models that don't require scaling while keeping everything reproducible for deployment.

**Key Update:** This task now processes only landlord-controlled features (no review data), ensuring our model can predict value for NEW listings where reviews don't exist yet.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
# Suppress all warnings for the rest of the run.
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style first, then the seaborn palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Make sure every directory this task writes into exists.
for output_dir in ('../../data/processed', '../../outputs/figures', '../../models'):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

banner = "=" * 80
print(banner, " TASK 1.6: TRAIN-TEST SPLIT & FEATURE SCALING (UPDATED)", banner, sep="\n")
print("\n Using landlord-controlled features only (no review data)")

# Read the landlord-controlled feature table written by the updated Task 1.5.
df = pd.read_csv('../../data/processed/listings_landlord_features_only.csv')
n_rows, n_cols = df.shape
print(f"\n Loaded dataset: {n_rows:,} rows × {n_cols} columns")
print("   Source: listings_landlord_features_only.csv (from Task 1.5)")

# Report how the columns break down into IDs, features and the target.
print("\nDataset composition:")
id_cols = ['id', 'host_id']
available_ids = [name for name in id_cols if name in df.columns]
n_id_cols = len(available_ids)
print(f"   - ID columns: {n_id_cols} ({', '.join(available_ids)})")
# Everything that is neither an ID column nor the single target column is a feature.
print(f"   - Features: {n_cols - n_id_cols - 1} columns")
print("   - Target: 1 column (value_category)")

# Separate features and targets
print("\n" + "="*80)
print(" SEPARATING FEATURES AND TARGETS")
print("="*80)

# Split the table into ID columns (if any are present), the feature matrix,
# and the categorical target.
ids = df[available_ids] if available_ids else None
X = df.drop(columns=available_ids + ['value_category'])
y_category = df['value_category']  # Categorical target

# Integer-encode the target; the encoded column drives the stratified split.
category_mapping = {'Excellent_Value': 0, 'Fair_Value': 1, 'Poor_Value': 2}
y_encoded = y_category.map(category_mapping)

# Series.map leaves NaN wherever a label is missing or absent from the mapping.
# Stop here with an explicit message instead of letting the NaNs surface later
# as an unrelated-looking error (e.g. a KeyError in the distribution report).
unencodable = y_encoded.isnull()
if unencodable.any():
    unexpected_labels = sorted(y_category[unencodable].dropna().astype(str).unique())
    raise ValueError(
        f"Cannot encode {int(unencodable.sum()):,} row(s) of 'value_category': "
        f"{int(y_category.isnull().sum()):,} missing, "
        f"unexpected labels: {unexpected_labels}. "
        f"Expected one of {list(category_mapping)}."
    )

print(f"\n Data Separation Complete:")
if ids is not None:
    print(f"   IDs: {ids.shape[0]:,} rows × {ids.shape[1]} columns")
print(f"   Features (X): {X.shape[0]:,} rows × {X.shape[1]} columns")
print(f"   Target (y_category): {y_category.shape[0]:,} rows")
print(f"   Target (y_encoded): {y_encoded.shape[0]:,} rows")

# Display feature names
print(f"\n Feature List ({X.shape[1]} landlord-controlled features):")
for i, col in enumerate(X.columns, 1):
    print(f"   {i:2d}. {col}")

# Display target distribution, one line per label in alphabetical order.
print(f"\n Target Distribution:")
print("-" * 80)
for category, count in y_category.value_counts().sort_index().items():
    pct = (count / len(y_category)) * 100
    encoded_val = category_mapping[category]
    print(f"   {encoded_val} ({category:15s}): {count:6,} ({pct:5.2f}%)")

# Check for missing values
print(f"\n Data Quality Check:")
print("-" * 80)
missing_X = X.isnull().sum().sum()
missing_y = y_category.isnull().sum()  # always 0 here: missing targets raise above
print(f"   Missing values in features (X): {missing_X}")
print(f"   Missing values in target (y): {missing_y}")
print(f"   Data types in X: {X.dtypes.value_counts().to_dict()}")

# Missing feature values are reported but do not stop the run.
if missing_X > 0 or missing_y > 0:
    print(f"\n  Warning: Missing values detected!")
else:
    print(f"\n No missing values - Ready for splitting!")

# Train-Test Split
print("\n" + "="*80)
print("  TRAIN-TEST SPLIT (80-20)")
print("="*80)

# Split every aligned object in a single call so they all share the same row
# partition; the ID frame is only included when ID columns were found.
split_inputs = [X, y_encoded, y_category]
if ids is not None:
    split_inputs.append(ids)

split_outputs = train_test_split(
    *split_inputs,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,  # keep the class proportions equal in both parts
)

# train_test_split returns a (train, test) pair per input, in input order.
(X_train, X_test,
 y_train_encoded, y_test_encoded,
 y_train_category, y_test_category) = split_outputs[:6]
if ids is not None:
    ids_train, ids_test = split_outputs[6:]
else:
    ids_train = ids_test = None

total_rows = len(X)
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print("\n Split Complete:")
print(f"   Training set: {n_train_rows:,} rows ({n_train_rows/total_rows*100:.1f}%)")
print(f"   Test set:     {n_test_rows:,} rows ({n_test_rows/total_rows*100:.1f}%)")
print(f"   Features:     {X_train.shape[1]} columns")

# Compare the class shares of the full data, the training part and the test part.
print("\n Class Distribution Verification:")
rule = "-" * 80
print(rule)
print(" | ".join([f"{'Class':<20s}", f"{'Original':>12s}", f"{'Train':>12s}", f"{'Test':>12s}"]))
print(rule)


def _class_pct(encoded, code):
    """Return the percentage of entries in `encoded` equal to `code`."""
    return (encoded == code).sum() / len(encoded) * 100


reverse_mapping = dict(zip(category_mapping.values(), category_mapping.keys()))
for code in sorted(y_encoded.unique()):
    shares = [_class_pct(part, code) for part in (y_encoded, y_train_encoded, y_test_encoded)]
    print(f"{reverse_mapping[code]:<20s} | " + " | ".join(f"{share:11.2f}%" for share in shares))

print("\n Stratification successful - class distributions maintained!")

# Feature Scaling
print("\n" + "="*80)
print(" FEATURE SCALING (STANDARDSCALER)")
print("="*80)

print("\n Fitting StandardScaler on training data...")
print("   Formula: z = (x - μ) / σ")
print("   Where: μ = mean, σ = standard deviation")


def _as_feature_frame(values, like):
    """Wrap scaled values in a DataFrame with the feature names and `like`'s row index."""
    return pd.DataFrame(values, columns=X.columns, index=like.index)


# The scaler learns its statistics from the training rows only; the test rows
# are transformed with those same training statistics.
scaler = StandardScaler()
X_train_scaled = _as_feature_frame(scaler.fit_transform(X_train), X_train)
X_test_scaled = _as_feature_frame(scaler.transform(X_test), X_test)

print("\n Scaling Complete:")
print(f"   Training set scaled: {X_train_scaled.shape[0]:,} rows × {X_train_scaled.shape[1]} columns")
print(f"   Test set scaled:     {X_test_scaled.shape[0]:,} rows × {X_test_scaled.shape[1]} columns")

# Before/after mean and std for the first five features.
print("\n Scaling Statistics (First 5 Features):")
wide_rule = "-" * 100
print(wide_rule)
print(f"{'Feature':<30s} | {'Original Mean':>15s} | {'Original Std':>15s} | {'Scaled Mean':>15s} | {'Scaled Std':>15s}")
print(wide_rule)

for col in X.columns[:5]:
    raw_values = X_train[col]
    scaled_values = X_train_scaled[col]
    print(
        f"{col:<30s} | {raw_values.mean():15.4f} | {raw_values.std():15.4f}"
        f" | {scaled_values.mean():15.4f} | {scaled_values.std():15.4f}"
    )

print("\n✅ All features now have mean ≈ 0 and std ≈ 1")

# Persist the fitted scaler so it can be reloaded later.
scaler_path = '../../models/scaler_landlord.pkl'
joblib.dump(scaler, scaler_path)
print("\n Saved scaler: models/scaler_landlord.pkl")

# Save Processed Datasets
print("\n" + "="*80)
print(" SAVING PROCESSED DATASETS")
print("="*80)

# Save unscaled features (row index is not written).
X_train.to_csv('../../data/processed/X_train_landlord.csv', index=False)
X_test.to_csv('../../data/processed/X_test_landlord.csv', index=False)
print(f"\n Saved unscaled features:")
print(f"   - data/processed/X_train_landlord.csv ({X_train.shape[0]:,} × {X_train.shape[1]})")
print(f"   - data/processed/X_test_landlord.csv ({X_test.shape[0]:,} × {X_test.shape[1]})")

# Save scaled features
X_train_scaled.to_csv('../../data/processed/X_train_scaled_landlord.csv', index=False)
X_test_scaled.to_csv('../../data/processed/X_test_scaled_landlord.csv', index=False)
print(f"\n Saved scaled features:")
print(f"   - data/processed/X_train_scaled_landlord.csv ({X_train_scaled.shape[0]:,} × {X_train_scaled.shape[1]})")
print(f"   - data/processed/X_test_scaled_landlord.csv ({X_test_scaled.shape[0]:,} × {X_test_scaled.shape[1]})")


def _build_target_frame(encoded, category, id_frame):
    """Combine the encoded and categorical targets, prefixed by the ID columns.

    Args:
        encoded: Series of integer-encoded target values.
        category: Series of categorical target labels, aligned with `encoded`.
        id_frame: DataFrame of ID columns aligned with the targets, or None.

    Returns:
        DataFrame with the ID columns (if any) first, in their original order,
        followed by 'value_encoded' and 'value_category'.
    """
    frame = pd.DataFrame({
        'value_encoded': encoded.values,
        'value_category': category.values,
    })
    if id_frame is not None:
        # Insert at increasing positions: always inserting at 0 would reverse
        # the ID column order relative to the separately saved ID files.
        for position, col in enumerate(id_frame.columns):
            frame.insert(position, col, id_frame[col].values)
    return frame


# Save targets
y_train_df = _build_target_frame(y_train_encoded, y_train_category, ids_train)
y_test_df = _build_target_frame(y_test_encoded, y_test_category, ids_test)

y_train_df.to_csv('../../data/processed/y_train_landlord.csv', index=False)
y_test_df.to_csv('../../data/processed/y_test_landlord.csv', index=False)
print(f"\n Saved targets:")
print(f"   - data/processed/y_train_landlord.csv ({y_train_df.shape[0]:,} × {y_train_df.shape[1]})")
print(f"   - data/processed/y_test_landlord.csv ({y_test_df.shape[0]:,} × {y_test_df.shape[1]})")

# Save IDs separately for NLP merging
if ids_train is not None:
    ids_train.to_csv('../../data/processed/train_ids.csv', index=False)
    ids_test.to_csv('../../data/processed/test_ids.csv', index=False)
    print(f"\n Saved IDs for NLP merging:")
    print(f"   - data/processed/train_ids.csv")
    print(f"   - data/processed/test_ids.csv")

# Generate Visualizations
print("\n" + "="*80)
print(" GENERATING VISUALIZATIONS")
print("="*80)


def _draw_histogram_panel(ax, values, heading, feature_name, x_label, bar_color,
                          mean_prefix, mark_std):
    """Draw one histogram panel with a dashed line at the mean.

    When `mark_std` is true, dotted lines are also drawn one standard deviation
    either side of the mean, with the upper one carrying the legend entry.
    """
    ax.hist(values, bins=30, color=bar_color, alpha=0.7, edgecolor='black')
    ax.set_title(f'{heading}\n{feature_name}', fontsize=11, fontweight='bold')
    ax.set_xlabel(x_label, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(alpha=0.3)

    center = values.mean()
    ax.axvline(center, color='red', linestyle='--', linewidth=2, label=f'{mean_prefix}{center:.2f}')
    if mark_std:
        spread = values.std()
        ax.axvline(center + spread, color='green', linestyle=':', linewidth=2, label=f'σ≈{spread:.2f}')
        ax.axvline(center - spread, color='green', linestyle=':', linewidth=2)
    ax.legend(fontsize=9)


# VISUALIZATION 1: Before vs After Scaling (4 features)
print("\n Creating before/after scaling comparison...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))

# The first four feature columns are the ones plotted.
viz_features = X.columns[:4]

# Each grid column is one feature: raw values on top, scaled values below.
for (raw_ax, scaled_ax), feature in zip(axes.T, viz_features):
    _draw_histogram_panel(
        raw_ax, X_train[feature], 'Before Scaling', feature,
        x_label='Value', bar_color='steelblue', mean_prefix='μ=', mark_std=False,
    )
    _draw_histogram_panel(
        scaled_ax, X_train_scaled[feature], 'After Scaling', feature,
        x_label='Standardized Value', bar_color='coral', mean_prefix='μ≈', mark_std=True,
    )

plt.suptitle('Feature Scaling Comparison: Before vs After StandardScaler (Landlord Features)', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.savefig('../../outputs/figures/scaling_comparison_landlord.png', dpi=300, bbox_inches='tight')
plt.close()
print("    Saved: outputs/figures/scaling_comparison_landlord.png")

# VISUALIZATION 2: Train-Test Split Distribution
print("\n Creating train-test split distribution plot...")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Class distribution. Labels and codes come from category_mapping (ordered by
# code) so the chart cannot drift from the encoding, and the counts use the
# vectorized Series.sum() rather than the builtin sum(), which iterates the
# Series one element at a time.
classes_by_code = sorted(category_mapping.items(), key=lambda item: item[1])
class_labels = [label for label, _ in classes_by_code]
train_counts = [int((y_train_encoded == code).sum()) for _, code in classes_by_code]
test_counts = [int((y_test_encoded == code).sum()) for _, code in classes_by_code]

x = np.arange(len(class_labels))
width = 0.35

# Side-by-side bars per class: train to the left of the tick, test to the right.
bars1 = axes[0].bar(x - width/2, train_counts, width, label='Train', color='steelblue', alpha=0.8, edgecolor='black')
bars2 = axes[0].bar(x + width/2, test_counts, width, label='Test', color='coral', alpha=0.8, edgecolor='black')

axes[0].set_xlabel('Value Category', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Count', fontsize=12, fontweight='bold')
axes[0].set_title('Train-Test Split: Class Distribution', fontsize=14, fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels(class_labels, fontsize=11)
axes[0].legend(fontsize=11)
axes[0].grid(axis='y', alpha=0.3)

# Add value labels on bars
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        axes[0].text(bar.get_x() + bar.get_width()/2., height,
                    f'{int(height):,}', ha='center', va='bottom', fontsize=10, fontweight='bold')

# Split size comparison
split_labels = ['Training Set\n(80%)', 'Test Set\n(20%)']
split_counts = [len(X_train), len(X_test)]
colors_split = ['steelblue', 'coral']

bars = axes[1].bar(split_labels, split_counts, color=colors_split, alpha=0.8, edgecolor='black', linewidth=2)
axes[1].set_ylabel('Number of Samples', fontsize=12, fontweight='bold')
axes[1].set_title('Train-Test Split: Dataset Size', fontsize=14, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

# Add value labels and the share of all rows actually held by each part.
for bar, count in zip(bars, split_counts):
    height = bar.get_height()
    pct = (count / len(X)) * 100
    axes[1].text(bar.get_x() + bar.get_width()/2., height,
                f'{count:,}\n({pct:.1f}%)', ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.suptitle('Train-Test Split Analysis (80-20 Stratified, Landlord Features Only)', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../../outputs/figures/train_test_split_distribution_landlord.png', dpi=300, bbox_inches='tight')
plt.close()
print("    Saved: outputs/figures/train_test_split_distribution_landlord.png")

# VISUALIZATION 3: Feature Scale Comparison (All Features)
print("\n Creating feature scale comparison for all features...")
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# One [min, max] row per feature, taken from the training rows before and
# after scaling.
feature_ranges_before = np.array([[X_train[col].min(), X_train[col].max()] for col in X.columns])
feature_ranges_after = np.array([[X_train_scaled[col].min(), X_train_scaled[col].max()] for col in X.columns])

# Tick labels: names longer than 25 characters are cut and given an ellipsis.
feature_names_short = [col[:25] + '...' if len(col) > 25 else col for col in X.columns]
y_pos = np.arange(len(X.columns))


def _draw_range_panel(ax, ranges, bar_color, x_label, title):
    """Draw one horizontal bar per feature spanning its [min, max] range."""
    lows = ranges[:, 0]
    highs = ranges[:, 1]
    ax.barh(y_pos, highs - lows, left=lows, color=bar_color, alpha=0.7, edgecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(feature_names_short, fontsize=8)
    ax.set_xlabel(x_label, fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)


# Left panel: ranges before scaling.
_draw_range_panel(axes[0], feature_ranges_before, 'steelblue',
                  'Value Range', 'Feature Ranges BEFORE Scaling')
axes[0].invert_yaxis()

# Right panel: ranges after scaling, with a reference line at zero.
_draw_range_panel(axes[1], feature_ranges_after, 'coral',
                  'Standardized Value Range', 'Feature Ranges AFTER Scaling')
axes[1].axvline(0, color='red', linestyle='--', linewidth=2, alpha=0.5)
axes[1].invert_yaxis()

plt.suptitle(f'Feature Scale Comparison: All {len(X.columns)} Landlord-Controlled Features', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('../../outputs/figures/feature_scale_comparison_landlord.png', dpi=300, bbox_inches='tight')
plt.close()
print("    Saved: outputs/figures/feature_scale_comparison_landlord.png")

print("\n All 3 visualizations generated successfully!")

# Final Summary
def _emit(*lines):
    """Print each given line on its own row."""
    print(*lines, sep="\n")


heavy_rule = "=" * 80
light_rule = "-" * 80
row_total = len(X)

_emit("\n" + heavy_rule, " TASK 1.6: TRAIN-TEST SPLIT & SCALING - COMPLETE", heavy_rule)

# Headline numbers for the split and the scaling step.
_emit(
    "\n SUMMARY REPORT",
    light_rule,
    f"Original dataset: {df.shape[0]:,} rows × {df.shape[1]} columns",
    f"Features: {X.shape[1]} (landlord-controlled only)",
    f"Training samples: {X_train.shape[0]:,} ({X_train.shape[0]/row_total*100:.1f}%)",
    f"Test samples: {X_test.shape[0]:,} ({X_test.shape[0]/row_total*100:.1f}%)",
    "Split ratio: 80-20 (stratified)",
    "Scaling method: StandardScaler (z-score normalization)",
)

# Inventory of everything written to disk, with the shape of each data file.
_emit("\n FILES GENERATED", light_rule, "Data Files (data/processed/):")
saved_tables = (
    ("X_train_landlord.csv", X_train),
    ("X_test_landlord.csv", X_test),
    ("X_train_scaled_landlord.csv", X_train_scaled),
    ("X_test_scaled_landlord.csv", X_test_scaled),
    ("y_train_landlord.csv", y_train_df),
    ("y_test_landlord.csv", y_test_df),
)
for file_name, table in saved_tables:
    print(f"    {file_name} ({table.shape[0]:,} × {table.shape[1]})")
if ids_train is not None:
    _emit("    train_ids.csv (for NLP merging)", "    test_ids.csv (for NLP merging)")
_emit(
    "\nModel Files (models/):",
    "    scaler_landlord.pkl (fitted StandardScaler)",
    "\nVisualization Files (outputs/figures/):",
    "    scaling_comparison_landlord.png",
    "    train_test_split_distribution_landlord.png",
    "    feature_scale_comparison_landlord.png",
)

# Missing-value counts; the feature counts are taken from the scaled frames.
_emit("\n DATA QUALITY CHECK", light_rule)
checked_tables = (
    ("X_train", X_train_scaled),
    ("X_test", X_test_scaled),
    ("y_train", y_train_df),
    ("y_test", y_test_df),
)
for table_name, table in checked_tables:
    print(f"    Missing values in {table_name}: {table.isnull().sum().sum()}")
_emit(
    "    Stratification maintained",
    "    Features scaled (mean≈0, std≈1)",
    "    Scaler saved for inference",
    "    Ready for model training",
)

# Average the per-feature mean and std of the scaled training frame.
_emit(
    "\n SCALING VERIFICATION",
    light_rule,
    f"   Mean of scaled features (should be ≈0): {X_train_scaled.mean().mean():.6f}",
    f"   Std of scaled features (should be ≈1): {X_train_scaled.std().mean():.6f}",
)

_emit(
    "\n  IMPORTANT NOTE",
    light_rule,
    "   This dataset contains ONLY landlord-controlled features.",
    "   Review-based features have been removed to prevent data leakage.",
    "   Expected model accuracy: ~65-75% (realistic for new listings)",
    "   Previous accuracy with reviews: ~99% (unrealistic, circular logic)",
)

_emit("\n" + heavy_rule, "  TASK 1.6 SUCCESSFULLY COMPLETED! ", heavy_rule)
