In [None]:
# House Purchase Prediction - Exploratory Data Analysis
# Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

# --- Plot appearance ---
# The matplotlib style sheet is applied first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting/data libraries.
warnings.filterwarnings('ignore')

# --- pandas display ---
# Show all columns, cap printed rows at 100, and render floats with two decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', lambda number: '%.2f' % number),
):
    pd.set_option(option_name, option_value)

# Report the library versions in use so a run can be reproduced later.
print("✅ Libraries imported successfully!")
for library_label, library_module in (("Pandas", pd), ("NumPy", np)):
    print(f"{library_label} version: {library_module.__version__}")

In [None]:
# Read the CSV into `df`, the frame that every later cell works from.
print("Loading dataset...")
df = pd.read_csv('../data/global_house_purchase_dataset.csv')

# Unpack the shape once so the report line below stays short.
n_rows, n_columns = df.shape
print("✅ Dataset loaded successfully!")
print(f"Shape: {n_rows:,} rows × {n_columns} columns")
print("\nFirst 5 rows:")

# Bare last expression: the notebook renders it as a table.
df.head()

Unnamed: 0,property_id,country,city,property_type,furnishing_status,property_size_sqft,price,constructed_year,previous_owners,rooms,...,customer_salary,loan_amount,loan_tenure_years,monthly_expenses,down_payment,emi_to_income_ratio,satisfaction_score,neighbourhood_rating,connectivity_score,decision
0,1,France,Marseille,Farmhouse,Semi-Furnished,991,412935,1989,6,6,...,10745,193949,15,6545,218986,0.16,1,5,6,0
1,2,South Africa,Cape Town,Apartment,Semi-Furnished,1244,224538,1990,4,8,...,16970,181465,20,8605,43073,0.08,9,1,2,0
2,3,South Africa,Johannesburg,Farmhouse,Semi-Furnished,4152,745104,2019,5,2,...,21914,307953,30,2510,437151,0.09,6,8,1,0
3,4,Germany,Frankfurt,Farmhouse,Semi-Furnished,3714,1110959,2008,1,3,...,17980,674720,15,8805,436239,0.33,2,6,6,0
4,5,South Africa,Johannesburg,Townhouse,Fully-Furnished,531,99041,2007,6,3,...,17676,65833,25,8965,33208,0.03,3,3,4,0


# 1. Dataset Overview

Let's understand the structure and basic statistics of our dataset.


In [None]:
# Structural summary of the frame, printed by DataFrame.info() under a banner.
section_rule = "=" * 70
print(section_rule, "DATASET INFORMATION", section_rule, sep="\n")
df.info()


In [None]:
# Summary statistics; the bare last expression is rendered by the notebook.
section_rule = "=" * 70
print(f"{section_rule}\nDESCRIPTIVE STATISTICS\n{section_rule}")
df.describe()


In [None]:
# Count nulls per column and express each count as a percentage of all rows.
section_rule = "=" * 70
print(section_rule, "MISSING VALUES ANALYSIS", section_rule, sep="\n")

missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# One row per column of `df`, worst offenders first.
missing_df = pd.concat(
    [missing.rename('Missing_Count'), missing_pct.rename('Percentage')],
    axis=1,
).sort_values('Missing_Count', ascending=False)

# Counts are non-negative, so "any count above zero" is the same test as "sum is non-zero".
has_missing = missing_df['Missing_Count'] > 0
if has_missing.any():
    print(missing_df[has_missing])
else:
    print("✅ No missing values found!")


In [None]:
# Count rows that are exact repeats of an earlier row.
section_rule = "=" * 70
print(section_rule, "DUPLICATE ANALYSIS", section_rule, sep="\n")

duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

if duplicates:
    # The share is only computed when there is something to report.
    duplicate_share = duplicates / len(df) * 100
    print(f"⚠️ Found {duplicates} duplicate rows ({duplicate_share:.2f}%)")
else:
    print("✅ No duplicates found!")


# 2. Target Variable Analysis

Our target variable is `decision` (0 = Not Buy, 1 = Buy). Let's analyze its distribution.


In [None]:
# Target variable distribution
print("="*70)
print("TARGET VARIABLE DISTRIBUTION")
print("="*70)

# value_counts() orders its rows by frequency, not by class value. Reindex by class
# so every count stays paired with its own label and colour even when "Buy" is the
# majority class; a class that never occurs is reported as 0 instead of vanishing.
class_labels = ['Not Buy (0)', 'Buy (1)']
colors = ['#e74c3c', '#2ecc71']
target_counts = df['decision'].value_counts().reindex([0, 1], fill_value=0)
target_pct = target_counts / target_counts.sum() * 100

print("\nCounts:")
print(target_counts)
print("\nPercentages:")
print(target_pct)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
bars = axes[0].bar(class_labels, target_counts.values, color=colors)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_title('Target Variable Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Decision', fontsize=12)
# bar_label anchors the text to each bar with a padding in points, so the labels
# sit just above the bars at any dataset size (a fixed data-unit offset does not).
axes[0].bar_label(bars, labels=[f'{v:,}' for v in target_counts.values],
                  padding=3, fontweight='bold')

# Pie chart
axes[1].pie(target_counts.values, labels=class_labels,
            autopct='%1.1f%%', colors=colors, startangle=90)
axes[1].set_title('Target Variable Proportion', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Class imbalance check
minority_count = target_counts.min()
if minority_count == 0:
    # Only one class is present: the ratio would be a division by zero.
    imbalance_ratio = float('inf')
    print("\n⚠️ Only one class is present in the target; the imbalance ratio is undefined.")
else:
    imbalance_ratio = target_counts.max() / minority_count
    print(f"\n📊 Class Imbalance Ratio: {imbalance_ratio:.2f}:1")
    if imbalance_ratio > 3:
        print("⚠️ Significant class imbalance detected. Consider using stratified sampling.")
    else:
        print("✅ Classes are relatively balanced.")


# 3. Numerical Features Analysis

Let's analyze the distribution of numerical features.


In [None]:
# Split the columns by dtype: object columns are treated as categorical, numeric
# columns as numerical. The identifier and the target are excluded from the
# numerical list.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
numerical_cols = [
    column_name
    for column_name in df.select_dtypes(include=[np.number]).columns
    if column_name not in ('property_id', 'decision')
]

print(f"📊 Numerical Features: {len(numerical_cols)}")
print(numerical_cols)
print(f"\n📝 Categorical Features: {len(categorical_cols)}")
print(categorical_cols)


In [None]:
# Histograms of the key numerical features, each annotated with its mean and median.
key_features = ['property_size_sqft', 'price', 'customer_salary',
                'loan_amount', 'monthly_expenses', 'down_payment']

fig, axes = plt.subplots(3, 2, figsize=(15, 12))
axes = axes.ravel()

# Pair each panel with one feature (six panels, six features).
for panel, feature in zip(axes, key_features):
    values = df[feature]
    mean_value = values.mean()
    median_value = values.median()

    panel.hist(values, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
    panel.set_title(f'Distribution of {feature}', fontsize=12, fontweight='bold')
    panel.set_xlabel(feature, fontsize=10)
    panel.set_ylabel('Frequency', fontsize=10)

    # Dashed reference lines: red for the mean, green for the median.
    panel.axvline(mean_value, color='red', linestyle='--', linewidth=2,
                  label=f'Mean: {mean_value:.0f}')
    panel.axvline(median_value, color='green', linestyle='--', linewidth=2,
                  label=f'Median: {median_value:.0f}')
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# One vertical box plot per key feature, laid out on a 3x2 grid.
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
axes = axes.ravel()

for panel, feature in zip(axes, key_features):
    panel.boxplot(df[feature], vert=True)
    panel.set_title(f'Box Plot: {feature}', fontsize=12, fontweight='bold')
    panel.set_ylabel(feature, fontsize=10)
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("📊 Box plots show the spread and potential outliers in key features.")


# 4. Categorical Features Analysis

Analyze the distribution of categorical variables.


In [None]:
# Categorical features distribution
# Size the grid from the number of categorical columns rather than assuming exactly
# four: a fixed 2x2 grid raises IndexError with a fifth column and leaves empty
# panels with fewer. squeeze=False keeps `axes` two-dimensional for any row count.
n_panels = len(categorical_cols)
n_grid_rows = max(1, (n_panels + 1) // 2)
fig, axes = plt.subplots(n_grid_rows, 2, figsize=(16, 5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for idx, col in enumerate(categorical_cols):
    value_counts = df[col].value_counts()

    # Show top 10 if too many categories
    if len(value_counts) > 10:
        value_counts = value_counts.head(10)
        title_suffix = " (Top 10)"
    else:
        title_suffix = ""

    bars = axes[idx].barh(range(len(value_counts)), value_counts.values, color='coral')
    axes[idx].set_yticks(range(len(value_counts)))
    axes[idx].set_yticklabels(value_counts.index)
    axes[idx].set_title(f'{col} Distribution{title_suffix}', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Count', fontsize=10)
    axes[idx].grid(axis='x', alpha=0.3)

    # Add value labels. bar_label pads in points, so the text sits next to the bar
    # whatever the magnitude of the counts.
    axes[idx].bar_label(bars, labels=[f'{v:,}' for v in value_counts.values], padding=3)

# Hide any panel that has no column to show (odd number of columns, or none).
for spare_ax in axes[n_panels:]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Print unique counts
print("\n📊 Unique Values in Categorical Features:")
for col in categorical_cols:
    print(f"  {col}: {df[col].nunique()} unique values")


# 5. Correlation Analysis

Understand relationships between numerical features.


In [None]:
# Pairwise correlations between the numerical features and the target.
corr_columns = numerical_cols + ['decision']
correlation_matrix = df[corr_columns].corr()

# Heatmap styling gathered in one place; the diverging palette is centred on zero.
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    linewidths=0.5,
    cbar_kws={'label': 'Correlation'},
)

plt.figure(figsize=(16, 14))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Numerical Features', fontsize=16, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print("🔥 Red indicates positive correlation, 🧊 Blue indicates negative correlation")


In [None]:
# Top correlations with target variable
# `target_corr` stays sorted by signed value (most positive first) for the printout.
target_corr = correlation_matrix['decision'].sort_values(ascending=False)
print("="*70)
print("TOP CORRELATIONS WITH TARGET (decision)")
print("="*70)
print(target_corr)

# Visualize top correlations
# Rank by absolute value: taking the head of the signed ordering would only ever
# show the most positive features and silently omit strong negative relationships.
feature_corr = target_corr.drop('decision')
strength_order = feature_corr.abs().sort_values(ascending=False).index
top_corr = feature_corr.reindex(strength_order).head(10)

plt.figure(figsize=(10, 6))
# Green bars for positive correlations, red for zero or negative.
colors = ['green' if x > 0 else 'red' for x in top_corr.values]
plt.barh(range(len(top_corr)), top_corr.values, color=colors, alpha=0.7)
plt.yticks(range(len(top_corr)), top_corr.index)
plt.xlabel('Correlation with Decision', fontsize=12)
plt.title('Top 10 Features Correlated with Purchase Decision', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()


# 6. Feature Relationships with Target

Analyze how features differ between buyers and non-buyers.


In [None]:
# Compare key features between buyers and non-buyers
comparison_features = ['customer_salary', 'price', 'loan_amount', 'down_payment',
                      'monthly_expenses', 'property_size_sqft']

fig, axes = plt.subplots(3, 2, figsize=(15, 12))
axes = axes.ravel()

# Build the class masks once instead of re-filtering the frame for every feature.
not_buy_mask = df['decision'] == 0
buy_mask = df['decision'] == 1

for idx, col in enumerate(comparison_features):
    # Box plot for each decision class. NaNs are dropped because a single NaN makes
    # matplotlib draw an empty box for that group.
    data_to_plot = [df.loc[not_buy_mask, col].dropna(),
                    df.loc[buy_mask, col].dropna()]
    bp = axes[idx].boxplot(data_to_plot, patch_artist=True)
    # Tick labels are set on the axis rather than through boxplot(labels=...), which
    # is deprecated in recent matplotlib; this form works on old and new versions.
    axes[idx].set_xticklabels(['Not Buy (0)', 'Buy (1)'])

    # Color the boxes
    bp['boxes'][0].set_facecolor('#e74c3c')
    bp['boxes'][1].set_facecolor('#2ecc71')

    axes[idx].set_title(f'{col} by Decision', fontsize=12, fontweight='bold')
    axes[idx].set_ylabel(col, fontsize=10)
    axes[idx].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("📊 Box plots show how features differ between buyers and non-buyers.")


In [None]:
# Statistical comparison between buyers and non-buyers
print("="*70)
print("STATISTICAL COMPARISON: BUYERS VS NON-BUYERS")
print("="*70)

# Build the class masks once; .loc with a mask avoids chained boolean indexing.
buyer_mask = df['decision'] == 1
non_buyer_mask = df['decision'] == 0

for col in comparison_features:
    buyers = df.loc[buyer_mask, col]
    non_buyers = df.loc[non_buyer_mask, col]

    print(f"\n{col}:")
    print(f"  Non-Buyers: Mean={non_buyers.mean():.2f}, Median={non_buyers.median():.2f}, Std={non_buyers.std():.2f}")
    print(f"  Buyers:     Mean={buyers.mean():.2f}, Median={buyers.median():.2f}, Std={buyers.std():.2f}")

    # T-test (Welch's variant): equal_var=False drops the equal-variance assumption,
    # which is not safe for two groups of different size and spread. nan_policy='omit'
    # keeps a single missing value from turning the whole result into NaN.
    t_stat, p_value = stats.ttest_ind(buyers, non_buyers, equal_var=False, nan_policy='omit')
    print(f"  T-test p-value: {p_value:.6f}", end="")
    if p_value < 0.05:
        print(" ✅ Significantly different")
    else:
        print(" ❌ Not significantly different")


# 7. Feature Engineering Exploration

Let's explore some potential engineered features.


In [None]:
# Create engineered features
# Year that property ages are measured against; named so it can be updated in one place.
REFERENCE_YEAR = 2025

# A zero denominator would turn a ratio into +/-inf, which then distorts the box
# plots and correlations computed from these columns. Mapping zeros to NaN marks
# those rows as "undefined" instead.
safe_price = df['price'].replace(0, np.nan)
safe_size_sqft = df['property_size_sqft'].replace(0, np.nan)

# assign() returns a new frame, so `df` itself is left untouched.
df_temp = df.assign(
    # Affordability score
    affordability_score=df['customer_salary'] / safe_price,
    # Loan to price ratio
    loan_to_price_ratio=df['loan_amount'] / safe_price,
    # Down payment ratio
    down_payment_ratio=df['down_payment'] / safe_price,
    # Price per sqft
    price_per_sqft=df['price'] / safe_size_sqft,
    # Property age
    property_age=REFERENCE_YEAR - df['constructed_year'],
    # Risk score
    risk_score=df['crime_cases_reported'] + df['legal_cases_on_property'],
)

print("✅ Engineered features created!")


In [None]:
# Visualize engineered features vs target
engineered_features = ['affordability_score', 'loan_to_price_ratio',
                       'down_payment_ratio', 'price_per_sqft',
                       'property_age', 'risk_score']

fig, axes = plt.subplots(3, 2, figsize=(15, 12))
axes = axes.ravel()

# Build the class masks once instead of re-filtering the frame for every feature.
not_buy_mask = df_temp['decision'] == 0
buy_mask = df_temp['decision'] == 1

for idx, col in enumerate(engineered_features):
    # Ratio features can contain inf (division by zero) or NaN; either one breaks
    # the box for that group, so only finite values are plotted.
    finite_values = df_temp[col].replace([np.inf, -np.inf], np.nan).dropna()
    data_to_plot = [finite_values[not_buy_mask.reindex(finite_values.index)],
                    finite_values[buy_mask.reindex(finite_values.index)]]
    bp = axes[idx].boxplot(data_to_plot, patch_artist=True)
    # Tick labels are set on the axis rather than through boxplot(labels=...), which
    # is deprecated in recent matplotlib; this form works on old and new versions.
    axes[idx].set_xticklabels(['Not Buy (0)', 'Buy (1)'])

    bp['boxes'][0].set_facecolor('#e74c3c')
    bp['boxes'][1].set_facecolor('#2ecc71')

    axes[idx].set_title(f'{col} by Decision', fontsize=12, fontweight='bold')
    axes[idx].set_ylabel(col, fontsize=10)
    axes[idx].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("📊 Engineered features show strong relationship with purchase decision!")


In [None]:
# Correlation of engineered features with target
engineered_corr = df_temp[engineered_features + ['decision']].corr()['decision'].sort_values(ascending=False)

print("="*70)
print("ENGINEERED FEATURES CORRELATION WITH TARGET")
print("="*70)
print(engineered_corr)

# Remove the target's self-correlation by label. After a descending sort 'decision'
# (always 1.0) is the FIRST entry, so slicing off the last element would drop the
# most negatively correlated feature and plot the target against itself.
feature_corr = engineered_corr.drop('decision')
bar_positions = range(len(feature_corr))

plt.figure(figsize=(10, 6))
# Green bars for positive correlations, red for zero or negative.
colors = ['green' if x > 0 else 'red' for x in feature_corr.values]
plt.barh(bar_positions, feature_corr.values, color=colors, alpha=0.7)
plt.yticks(bar_positions, feature_corr.index)
plt.xlabel('Correlation with Decision', fontsize=12)
plt.title('Engineered Features Correlation with Purchase Decision', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()


# 8. Categorical Features vs Target

Analyze how categorical features relate to purchase decisions.


In [None]:
# Purchase rate by categorical features
# Size the grid from the number of categorical columns rather than assuming exactly
# four: a fixed 2x2 grid raises IndexError with a fifth column and leaves empty
# panels with fewer. squeeze=False keeps `axes` two-dimensional for any row count.
n_panels = len(categorical_cols)
n_grid_rows = max(1, (n_panels + 1) // 2)
fig, axes = plt.subplots(n_grid_rows, 2, figsize=(16, 5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for idx, col in enumerate(categorical_cols):
    # Calculate purchase rate for each category (mean of the 0/1 target per group)
    purchase_rate = df.groupby(col)['decision'].mean().sort_values(ascending=False)

    # Show top 10 if too many, and say so in the title so the truncation is visible
    if len(purchase_rate) > 10:
        purchase_rate = purchase_rate.head(10)
        title_suffix = " (Top 10)"
    else:
        title_suffix = ""

    axes[idx].barh(range(len(purchase_rate)), purchase_rate.values * 100, color='teal')
    axes[idx].set_yticks(range(len(purchase_rate)))
    axes[idx].set_yticklabels(purchase_rate.index)
    axes[idx].set_title(f'Purchase Rate by {col}{title_suffix}', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Purchase Rate (%)', fontsize=10)
    axes[idx].grid(axis='x', alpha=0.3)

    # Add value labels (the x axis is in percent, so a 1-point offset is scale-safe)
    for i, v in enumerate(purchase_rate.values):
        axes[idx].text(v*100 + 1, i, f'{v*100:.1f}%', va='center')

# Hide any panel that has no column to show (odd number of columns, or none).
for spare_ax in axes[n_panels:]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

print("📊 Purchase rate varies significantly across categories!")


# 9. Key Insights and Recommendations

Summary of findings from the exploratory data analysis.


In [None]:
# Summary insights
print("="*70)
print("KEY INSIGHTS FROM EDA")
print("="*70)

# Recompute the data-quality facts here so the summary reports what is actually in
# `df` rather than asserting a clean dataset unconditionally.
total_missing = int(df.isnull().sum().sum())
total_duplicates = int(df.duplicated().sum())

print("\n1. DATASET OVERVIEW:")
print(f"   • Total samples: {len(df):,}")
print(f"   • Features: {df.shape[1]}")
if total_missing == 0:
    print("   • No missing values ✅")
else:
    print(f"   • Missing values: {total_missing:,} cells ⚠️")
if total_duplicates == 0:
    print("   • No duplicates ✅")
else:
    print(f"   • Duplicate rows: {total_duplicates:,} ⚠️")

print("\n2. TARGET VARIABLE:")
not_buy_pct = (df['decision']==0).sum() / len(df) * 100
buy_pct = (df['decision']==1).sum() / len(df) * 100
print(f"   • Not Buy: {not_buy_pct:.1f}%")
print(f"   • Buy: {buy_pct:.1f}%")
# The ratio is undefined when one class is absent, so guard the division.
minority_pct = min(not_buy_pct, buy_pct)
if minority_pct > 0:
    print(f"   • Class imbalance: {max(not_buy_pct, buy_pct) / minority_pct:.2f}:1")
else:
    print("   • Class imbalance: undefined (only one class present)")

print("\n3. MOST CORRELATED FEATURES WITH TARGET:")
# Rank by absolute correlation so a strong negative relationship counts as
# "most correlated" too; the printed value keeps its sign.
feature_corr = target_corr.drop('decision')
top_3_corr = feature_corr.reindex(feature_corr.abs().sort_values(ascending=False).index).head(3)
for feat, corr_val in top_3_corr.items():
    print(f"   • {feat}: {corr_val:.3f}")

print("\n4. ENGINEERED FEATURES:")
print("   • Created 6 new features (affordability, ratios, age, risk)")
print("   • Strong predictive power observed")

print("\n5. RECOMMENDATIONS:")
print("   ✅ Use stratified sampling for train/val/test split")
print("   ✅ Include engineered features in modeling")
print("   ✅ Consider ensemble methods (Random Forest, Gradient Boosting)")
print("   ✅ Feature scaling recommended for numerical features")
print("   ✅ Encode categorical variables (Label/One-Hot encoding)")

print("\n" + "="*70)
print("EDA COMPLETE! Ready for modeling.")
print("="*70)
