# House Price Prediction - Exploratory Data Analysis

This notebook provides a comprehensive exploratory data analysis (EDA) of the synthetic house price dataset.

## Table of Contents
1. [Data Loading and Overview](#data-loading)
2. [Data Quality Assessment](#data-quality)
3. [Univariate Analysis](#univariate)
4. [Bivariate Analysis](#bivariate)
5. [Correlation Analysis](#correlation)
6. [Feature Engineering Insights](#feature-engineering)
7. [Key Findings](#findings)

In [None]:
# Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from scipy import stats
# Keep cell output free of library warnings.
# NOTE(review): this silences *every* warning (deprecations, numeric
# RuntimeWarnings, ...) for the whole session - confirm that is intended.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn-flavoured matplotlib style with the "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Lift pandas' display limits so wide frames and long cell values are not truncated
for option_name in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

## 1. Data Loading and Overview <a id="data-loading"></a>

In [None]:
# Read the house-price workbook; the path is relative to the notebook's folder
data = pd.read_excel('../data/house_data_more_logic.xlsx')

# Report the frame's dimensions, then per-column dtypes and non-null counts
print("Dataset shape: {}".format(data.shape))
print("\nDataset info:")
data.info()

In [None]:
# Preview the first ten records (rendered as a rich table by the notebook)
data.head(n=10)

In [None]:
# Summary statistics for every column; include="all" adds the non-numeric
# columns (count / unique / top / freq) next to the numeric aggregates
data.describe(include="all")

## 2. Data Quality Assessment <a id="data-quality"></a>

In [None]:
# Missing-value audit: null count per column and its share of all rows
missing_data = data.isna().sum()
missing_percentage = missing_data.div(len(data)).mul(100)

# Put both measures side by side, one row per column of `data`
missing_df = pd.concat(
    [missing_data, missing_percentage],
    axis=1,
    keys=['Missing Count', 'Percentage'],
)

# Only columns that actually contain nulls are listed
print("Missing Values Analysis:")
has_missing = missing_df['Missing Count'].gt(0)
print(missing_df.loc[has_missing])

# Counts are non-negative, so "no column has any" is the same as "total is zero"
if not has_missing.any():
    print("✅ No missing values found in the dataset!")

In [None]:
# Count rows that are exact repeats of an earlier row
duplicates = data.duplicated().sum()
print("Number of duplicate rows: {}".format(duplicates))

# A zero count is falsy, so this fires only when nothing is duplicated
if not duplicates:
    print("✅ No duplicate rows found!")

## 3. Univariate Analysis <a id="univariate"></a>

In [None]:
# Analyze target variable (price): raw distribution, spread, normality check
# and the log-transformed distribution, followed by headline statistics.

# Missing prices would make the histogram raise and poison the Q-Q fit,
# so every panel works on the non-null values only.
price_values = data['price'].dropna()

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Histogram
axes[0,0].hist(price_values, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
axes[0,0].set_title('Price Distribution')
axes[0,0].set_xlabel('Price')
axes[0,0].set_ylabel('Frequency')

# Box plot
axes[0,1].boxplot(price_values)
axes[0,1].set_title('Price Box Plot')
axes[0,1].set_ylabel('Price')

# Q-Q plot against a normal distribution
# (`stats` is scipy.stats, imported with the other libraries at the top)
stats.probplot(price_values, dist="norm", plot=axes[1,0])
axes[1,0].set_title('Q-Q Plot (Price vs Normal Distribution)')

# Log-transformed price. The logarithm is only defined for positive values;
# zero or negative prices would yield -inf/NaN and make the histogram fail.
positive_prices = price_values[price_values > 0]
log_price = np.log(positive_prices)
axes[1,1].hist(log_price, bins=30, alpha=0.7, color='lightcoral', edgecolor='black')
axes[1,1].set_title('Log-Transformed Price Distribution')
axes[1,1].set_xlabel('Log(Price)')
axes[1,1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Make any exclusion from the log panel visible instead of silent
n_excluded = len(price_values) - len(positive_prices)
if n_excluded:
    print(f"Note: {n_excluded} non-positive price(s) excluded from the log-transformed histogram.")

print("Price Statistics:")
print(f"Mean: ${price_values.mean():,.2f}")
print(f"Median: ${price_values.median():,.2f}")
print(f"Std: ${price_values.std():,.2f}")
print(f"Min: ${price_values.min():,.2f}")
print(f"Max: ${price_values.max():,.2f}")

In [None]:
# Distribution of each numerical feature, one histogram per panel.
# `numerical_cols` is reused by later cells.
numerical_cols = [
    'num_rooms',
    'surface',
    'num_bathrooms',
    'age_of_house',
    'proximity_to_city_center_km',
    'garden_size_sqm',
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))
axes = axes.ravel()  # flatten the 2x3 grid so panels pair up with columns

for ax, col in zip(axes, numerical_cols):
    # "age_of_house" -> "Age Of House"
    label = col.replace("_", " ").title()
    ax.hist(data[col], bins=20, alpha=0.7, edgecolor='black')
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Frequency of each level of the categorical features, one bar chart per panel.
# `categorical_cols` is reused by later cells.
categorical_cols = ['has_kitchen', 'neighborhood_quality', 'renovated', 'has_parking', 'property_type']

# Size the grid from the number of features instead of hard-coding 2x3, so
# adding or removing a column neither drops a chart nor leaves blank panels.
n_grid_cols = 3
n_grid_rows = -(-len(categorical_cols) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, categorical_cols):
    label = col.replace("_", " ").title()
    value_counts = data[col].value_counts()

    # Levels are cast to str so numeric codes are drawn as discrete categories
    bars = ax.bar(value_counts.index.astype(str), value_counts.values, alpha=0.7)
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Count')

    # Add value labels on bars; padding is in points, so the gap looks the
    # same whether the counts are in the tens or in the thousands
    ax.bar_label(bars, labels=[str(count) for count in value_counts.values], padding=2)

# Remove every subplot that did not receive a chart
for unused_ax in axes[len(categorical_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 4. Bivariate Analysis <a id="bivariate"></a>

In [None]:
# Price vs numerical features: scatter plot with a least-squares trend line
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    label = col.replace("_", " ").title()
    ax.scatter(data[col], data['price'], alpha=0.6)
    ax.set_xlabel(label)
    ax.set_ylabel('Price')
    ax.set_title(f'Price vs {label}')

    # Add trend line. Fit only on rows where both values are present
    # (np.polyfit cannot handle NaN) and skip the fit when x has fewer than
    # two distinct values, since a slope is undefined there.
    valid = data[[col, 'price']].dropna()
    if valid[col].nunique() >= 2:
        z = np.polyfit(valid[col], valid['price'], 1)
        p = np.poly1d(z)
        # Two end points are enough for a straight line; drawing through every
        # unsorted x value retraces the line and smears the dash pattern.
        x_ends = np.array([valid[col].min(), valid[col].max()])
        ax.plot(x_ends, p(x_ends), "r--", alpha=0.8)

plt.tight_layout()
plt.show()

In [None]:
# Price spread within each level of the categorical features (box plots)
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))
axes = axes.ravel()

# zip() stops at the shorter sequence, so a column is drawn only if a panel exists for it
for ax, col in zip(axes, categorical_cols):
    label = col.replace("_", " ").title()
    data.boxplot(column='price', by=col, ax=ax)
    ax.set_title(f'Price by {label}')
    ax.set_xlabel(label)
    ax.set_ylabel('Price')

# Five features on a six-panel grid: drop the panel left empty
fig.delaxes(axes[5])

# pandas' grouped boxplot writes its own figure-level title; blank it out
plt.suptitle('')
plt.tight_layout()
plt.show()

## 5. Correlation Analysis <a id="correlation"></a>

In [None]:
# Pairwise correlations between all numeric columns
numerical_data = data.select_dtypes(include='number')
correlation_matrix = numerical_data.corr()

# Heatmap of the matrix. The mask hides the upper triangle and the diagonal,
# so each pair of features appears exactly once.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Rank the features by the strength (absolute value) of their correlation
# with price; the sign is kept in the printed value. Reused by later cells.
price_correlations = (
    correlation_matrix['price']
    .drop('price')
    .sort_values(key=lambda values: values.abs(), ascending=False)
)
print("\nCorrelations with Price (sorted by absolute value):")
for feature, corr in price_correlations.items():
    print("{:30s}: {:6.3f}".format(feature, corr))

## 6. Feature Engineering Insights <a id="feature-engineering"></a>

In [None]:
# Create some derived features for analysis.
# Work on a copy so the raw `data` frame stays untouched for the other cells.
data_enhanced = data.copy()

# Price per square meter
data_enhanced['price_per_sqm'] = data_enhanced['price'] / data_enhanced['surface']

# Room density (rooms per square meter)
data_enhanced['room_density'] = data_enhanced['num_rooms'] / data_enhanced['surface']

# Total rooms (num_rooms + num_bathrooms)
data_enhanced['total_rooms'] = data_enhanced['num_rooms'] + data_enhanced['num_bathrooms']

# Has garden (binary): 1 when the garden size is strictly positive
data_enhanced['has_garden'] = (data_enhanced['garden_size_sqm'] > 0).astype(int)

# Age categories: New = [0, 10], Medium = (10, 25], Old = (25, inf).
# pd.cut builds right-closed intervals and excludes the lowest edge by default,
# so without include_lowest=True a brand-new house (age 0) becomes NaN; the
# open-ended last bin likewise keeps houses older than 50 years from being
# silently dropped into NaN.
data_enhanced['age_category'] = pd.cut(data_enhanced['age_of_house'],
                                     bins=[0, 10, 25, np.inf],
                                     labels=['New', 'Medium', 'Old'],
                                     include_lowest=True)

print("Enhanced features created:")
print("- price_per_sqm: Price per square meter")
print("- room_density: Rooms per square meter")
print("- total_rooms: Total number of rooms")
print("- has_garden: Binary indicator for garden presence")
print("- age_category: Categorical age groups")

In [None]:
# Visual check of the derived features on a 2x2 grid:
# three grouped box plots plus one scatter plot.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# (panel, value column, grouping column, panel title, x-axis label)
boxplot_panels = [
    (axes[0,0], 'price_per_sqm', 'property_type',
     'Price per Sqm by Property Type', 'Property Type'),
    (axes[0,1], 'price', 'age_category',
     'Price by Age Category', 'Age Category'),
    (axes[1,0], 'price', 'has_garden',
     'Price by Garden Presence', 'Has Garden (0=No, 1=Yes)'),
]
for panel, value_col, group_col, panel_title, x_label in boxplot_panels:
    data_enhanced.boxplot(column=value_col, by=group_col, ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)

# Room density vs price
density_panel = axes[1,1]
density_panel.scatter(data_enhanced['room_density'], data_enhanced['price'], alpha=0.6)
density_panel.set_xlabel('Room Density')
density_panel.set_ylabel('Price')
density_panel.set_title('Price vs Room Density')

# pandas' grouped boxplot writes its own figure-level title; blank it out
plt.suptitle('')
plt.tight_layout()
plt.show()

## 7. Key Findings <a id="findings"></a>

In [None]:
# Summary statistics by key categorical variables.
# Relies on `data` and on `price_correlations` from the correlation cell.
print("=" * 60)
print("KEY FINDINGS FROM EXPLORATORY DATA ANALYSIS")
print("=" * 60)

print("\n1. PRICE DISTRIBUTION:")
print(f"   - Mean price: ${data['price'].mean():,.0f}")
print(f"   - Price range: ${data['price'].min():,.0f} - ${data['price'].max():,.0f}")
print(f"   - Standard deviation: ${data['price'].std():,.0f}")

print("\n2. STRONGEST PRICE PREDICTORS:")
top_correlations = price_correlations.head(5)
for i, (feature, corr) in enumerate(top_correlations.items(), 1):
    print(f"   {i}. {feature.replace('_', ' ').title()}: {corr:.3f}")

print("\n3. PROPERTY TYPE ANALYSIS:")
property_stats = data.groupby('property_type')['price'].agg(['mean', 'count'])
# Walk the columns in parallel rather than using iterrows(): iterrows() upcasts
# each mixed int/float row to float, which printed counts as e.g. "120.0".
# This also avoids a loop variable named `stats`, which would clobber the
# scipy `stats` module for any cell that is re-run afterwards.
for prop_type, mean_price, n_properties in zip(property_stats.index,
                                               property_stats['mean'],
                                               property_stats['count']):
    print(f"   - {prop_type}: Avg ${mean_price:,.0f} ({n_properties} properties)")

print("\n4. NEIGHBORHOOD QUALITY IMPACT:")
neighborhood_stats = data.groupby('neighborhood_quality')['price'].mean()
quality_labels = {0: 'Low', 1: 'Medium', 2: 'High'}
for quality, avg_price in neighborhood_stats.items():
    # Fall back to the raw value when a level has no friendly label
    quality_name = quality_labels.get(quality, quality)
    print(f"   - {quality_name} Quality: Avg ${avg_price:,.0f}")

print("\n5. RENOVATION IMPACT:")
renovation_stats = data.groupby('renovated')['price'].mean()
renovation_labels = {0: 'Not Renovated', 1: 'Recently Renovated'}
for status, avg_price in renovation_stats.items():
    print(f"   - {renovation_labels.get(status, status)}: Avg ${avg_price:,.0f}")
# The premium only exists when both groups are present in the data
avg_renovated = renovation_stats.get(1)
avg_not_renovated = renovation_stats.get(0)
if avg_renovated is not None and avg_not_renovated is not None:
    price_diff = avg_renovated - avg_not_renovated
    print(f"   - Renovation premium: ${price_diff:,.0f}")

print("\n6. DATA QUALITY:")
total_missing = data.isnull().sum().sum()
total_duplicates = data.duplicated().sum()
print(f"   - Dataset size: {len(data)} properties")
print(f"   - Missing values: {total_missing}")
print(f"   - Duplicate rows: {total_duplicates}")
# Derive the verdict from the counts above instead of asserting it unconditionally
if total_missing == 0 and total_duplicates == 0:
    print("   - Data quality: Excellent ✅")
else:
    print("   - Data quality: Needs attention ⚠️")

print("\n" + "=" * 60)