# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Load the dataset.
# NOTE: the path is hard-coded and absolute; adjust it when running this
# script outside the environment it was written in.
df = pd.read_csv('/content/Bengaluru_house_price_cleaned.csv')

# Quick inspection: dimensions, sample rows, dtypes, null counts, statistics.
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nBasic Statistics:")
print(df.describe())

# ==================== DATA PREPROCESSING ====================

# Section banner
section_rule = "=" * 50
print("\n" + section_rule)
print("DATA PREPROCESSING")
print(section_rule)

# A missing society name becomes the explicit category 'Unknown'
df['society'] = df['society'].fillna('Unknown')

# Missing bath / balcony values are imputed with each column's own median
for count_col in ('bath', 'balcony'):
    df[count_col] = df[count_col].fillna(df[count_col].median())

# Handle total_sqft - convert ranges to average
def convert_sqft(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Args:
        x: A raw cell value: a number, a numeric string such as ``"1200"``,
            a range string such as ``"1000 - 1200"``, or a missing value.

    Returns:
        The numeric value; for a ``low-high`` range, the mean of the two
        bounds. ``np.nan`` is returned for missing input and for anything
        that cannot be parsed (e.g. values carrying a unit suffix).
    """
    if pd.isna(x):
        return np.nan
    text = str(x).strip()

    # Plain numbers first, so signed or exponent forms ("-5", "1e-3") are not
    # mistaken for ranges just because they contain a '-'.
    try:
        return float(text)
    except ValueError:
        pass

    low, dash, high = text.partition('-')
    if not dash:
        return np.nan
    # A malformed range ("abc - def", "1-2-3") yields NaN instead of raising,
    # so a single bad cell cannot abort the whole column conversion.
    try:
        return (float(low) + float(high)) / 2
    except ValueError:
        return np.nan

# Parse every total_sqft entry to a number, then impute the entries that are
# missing or could not be parsed with the median of the parsed values.
parsed_sqft = df['total_sqft'].apply(convert_sqft)
df['total_sqft'] = parsed_sqft.fillna(parsed_sqft.median())

# Extract number of bedrooms from 'size' column
def extract_bhk(x):
    """Return the leading room count from a ``size`` entry.

    Args:
        x: A raw cell value such as ``"2 BHK"``, ``"4 Bedroom"`` or
            ``"1 RK"``, or a missing value.

    Returns:
        The integer before the unit label, or ``np.nan`` when the value is
        missing, carries none of the recognised labels, or does not start
        with an integer token.
    """
    if pd.isna(x):
        return np.nan
    text = str(x).strip()
    if not any(label in text for label in ('BHK', 'RK', 'Bedroom')):
        return np.nan
    # `text` contains a label, so it is non-empty and split() yields at least
    # one token; a non-numeric first token ("BHK", "Studio RK") maps to NaN
    # rather than raising and aborting the column-wide apply().
    try:
        return int(text.split()[0])
    except ValueError:
        return np.nan

# Bedroom count parsed from 'size'; entries that yield NaN get the median count
bhk_parsed = df['size'].apply(extract_bhk)
df['bhk'] = bhk_parsed.fillna(bhk_parsed.median())

# Price per square foot. 'price' is reported in lakhs, so the 100000 factor
# expresses the ratio in rupees per sqft.
df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']

# Keep only rows whose price per sqft lies inside the 1st-99th percentile band
ppsf = df['price_per_sqft']
ppsf_floor = ppsf.quantile(0.01)
ppsf_ceiling = ppsf.quantile(0.99)
within_band = (ppsf >= ppsf_floor) & (ppsf <= ppsf_ceiling)
df = df[within_band]

# Keep only rows offering at least 300 sqft per bedroom
sqft_per_bedroom = df['total_sqft'] / df['bhk']
df = df[sqft_per_bedroom >= 300]

print(f"\nDataset shape after cleaning: {df.shape}")

# ==================== FEATURE ENGINEERING ====================

print("\n" + "="*50)
print("FEATURE ENGINEERING")
print("="*50)

# `df` is the result of boolean-mask filtering; take an explicit copy so the
# encoded columns added below are written to an independent frame.
df = df.copy()

# Select important categorical features
categorical_features = ['location', 'area_type', 'availability', 'zone_name']
# 'price_per_sqft' is deliberately NOT a model input: it is computed as
# price * 100000 / total_sqft, so together with 'total_sqft' it encodes the
# target exactly (target leakage) and would not be known at prediction time.
# It is only used upstream for outlier filtering.
numerical_features = ['total_sqft', 'bath', 'balcony', 'bhk']

# Encode categorical variables as integer codes; the fitted encoders are kept
# per column so the mapping can be reused or inverted later.
# NOTE(review): the encoders are fitted on all rows before the train/test
# split, and integer codes impose an arbitrary order on categories.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col + '_encoded'] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Prepare features and target
feature_cols = [col + '_encoded' for col in categorical_features] + numerical_features
X = df[feature_cols]
y = df['price']

print(f"Features: {X.columns.tolist()}")
print("Target: price (in lakhs)")
print(f"Number of samples: {len(X)}")

# ==================== TRAIN-TEST SPLIT ====================

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Standardise features: the scaler is fitted on the training rows only and
# then applied unchanged to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ==================== MODEL TRAINING ====================

print("\n" + "="*50)
print("MODEL TRAINING & EVALUATION")
print("="*50)

# Candidate regressors, keyed by the display name used in all reports below
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=10),
    'Lasso Regression': Lasso(alpha=1),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
}

# One dict of metrics per model; becomes the comparison table afterwards
results = []

for name, model in models.items():
    # Train model
    model.fit(X_train_scaled, y_train)

    # Predictions on both partitions (train score exposes overfitting)
    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)

    # Evaluation metrics
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_mae = mean_absolute_error(y_test, y_pred_test)

    # 5-fold cross-validated R² on the training partition
    cv_scores = cross_val_score(model, X_train_scaled, y_train,
                                cv=5, scoring='r2')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    # The metric labels use a real superscript two; the 'Test R²' key is read
    # back below when the best model is selected, so the spellings must match.
    results.append({
        'Model': name,
        'Train R²': train_r2,
        'Test R²': test_r2,
        'RMSE': test_rmse,
        'MAE': test_mae,
        'CV R² (mean)': cv_mean,
        'CV R² (std)': cv_std
    })

    print(f"\n{name}:")
    print(f"  Train R²: {train_r2:.4f}")
    print(f"  Test R²: {test_r2:.4f}")
    print(f"  RMSE: {test_rmse:.2f} lakhs")
    print(f"  MAE: {test_mae:.2f} lakhs")
    print(f"  CV R² Score: {cv_mean:.4f} (+/- {cv_std:.4f})")

# Results summary
results_df = pd.DataFrame(results)
print("\n" + "="*50)
print("MODEL COMPARISON SUMMARY")
print("="*50)
print(results_df.to_string(index=False))

# Select best model (highest test R²)
# NOTE(review): choosing on the test score makes that score an optimistic
# estimate for the chosen model; 'CV R² (mean)' is the unbiased alternative.
best_model_name = results_df.loc[results_df['Test R²'].idxmax(), 'Model']
best_model = models[best_model_name]
print(f"\n🏆 Best Model: {best_model_name}")

# ==================== FEATURE IMPORTANCE ====================

# Only estimators that expose `feature_importances_` are reported; for any
# other best model this whole section is skipped.
if hasattr(best_model, 'feature_importances_'):
    print("\n" + "="*50)
    print("FEATURE IMPORTANCE (Top 10)")
    print("="*50)

    # Pair each feature name with its importance, most important first
    importance_columns = {
        'Feature': feature_cols,
        'Importance': best_model.feature_importances_,
    }
    feature_importance = pd.DataFrame(importance_columns)
    feature_importance = feature_importance.sort_values('Importance', ascending=False)

    top_features = feature_importance.head(10)
    print(top_features.to_string(index=False))

    # Horizontal bar chart of the same top-10 rows
    plt.figure(figsize=(10, 6))
    plt.barh(top_features['Feature'], top_features['Importance'])
    plt.xlabel('Importance')
    plt.title(f'Top 10 Feature Importances - {best_model_name}')
    # Flip the y axis so the first (largest) bar is drawn at the top
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    print("\nFeature importance plot saved as 'feature_importance.png'")

# ==================== PREDICTIONS VISUALIZATION ====================

# Test-set predictions of the selected model and their errors
y_pred_final = best_model.predict(X_test_scaled)
residuals = y_test - y_pred_final

plt.figure(figsize=(12, 5))

# Left panel: actual vs predicted, with the y = x line as the ideal
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred_final, alpha=0.5)
actual_span = [y_test.min(), y_test.max()]
plt.plot(actual_span, actual_span, 'r--', lw=2, label='Perfect Prediction')
plt.xlabel('Actual Price (lakhs)')
plt.ylabel('Predicted Price (lakhs)')
plt.title('Actual vs Predicted Prices')
plt.legend()

# Right panel: residuals against predictions, with a zero-error reference line
plt.subplot(1, 2, 2)
plt.scatter(y_pred_final, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--', lw=2)
plt.xlabel('Predicted Price (lakhs)')
plt.ylabel('Residuals')
plt.title('Residual Plot')

plt.tight_layout()
plt.savefig('predictions.png', dpi=300, bbox_inches='tight')
print("\nPredictions plot saved as 'predictions.png'")

# ==================== SAMPLE PREDICTIONS ====================

print("\n" + "="*50)
print("SAMPLE PREDICTIONS")
print("="*50)

# Seeded generator so the same rows are shown on every run, in line with the
# fixed random_state=42 used for the split and the models.
sample_rng = np.random.default_rng(42)
# Sampling without replacement fails when asked for more rows than exist, so
# cap the sample at the size of the test set.
sample_size = min(5, len(X_test))
sample_indices = sample_rng.choice(len(X_test), size=sample_size, replace=False)
for idx in sample_indices:
    # Positional lookup: y_test keeps the original row labels after the split,
    # while y_pred_final is a plain array aligned with X_test's row order.
    actual = y_test.iloc[idx]
    predicted = y_pred_final[idx]
    error = abs(actual - predicted)
    print(f"Actual: ₹{actual:.2f} lakhs | Predicted: ₹{predicted:.2f} lakhs | Error: ₹{error:.2f} lakhs")

print("\n" + "="*50)
print("MODEL TRAINING COMPLETE!")
print("="*50)