## 2. Data Loading and Exploration

In [None]:
# Load the Boston Housing bunch and assemble a single DataFrame: the feature
# columns first, with the target appended as the last column, 'MEDV'.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the environment pins a version that still provides it.
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names).assign(MEDV=boston.target)

# Quick confirmation of what was loaded
load_summary = [
    "🏠 Boston Housing dataset loaded!",
    f"Dataset shape: {df.shape}",
    f"Features: {list(boston.feature_names)}",
    "Target: MEDV (Median home value in $1000s)",
]
print("\n".join(load_summary))

In [None]:
# Peek at the top of the table; leaving the frame as the cell's last
# expression lets the notebook render it as a table.
print("🔍 First 5 rows:")
df.head(n=5)

In [None]:
# Headline numbers for the dataset, followed by per-column summary statistics
n_rows, n_cols = df.shape
medv = df['MEDV']

print("📊 Dataset Information:")
print(f"Total houses: {n_rows}")
print(f"Features: {n_cols - 1}")  # every column except the MEDV target
print(f"Price range: ${medv.min():.1f}k - ${medv.max():.1f}k")
print(f"Average price: ${medv.mean():.1f}k")
print(f"Missing values: {df.isna().sum().sum()}")

print("\n📈 Statistical Summary:")
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Distribution of the target: histogram with the mean marked (left) next to a
# box plot (right), then the headline statistics as text.
medv = df['MEDV']
mean_price = medv.mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
hist_ax, box_ax = axes

# Left panel: histogram with a dashed line at the mean
hist_ax.hist(medv, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
hist_ax.axvline(mean_price, color='red', linestyle='--',
                label=f'Mean: ${mean_price:.1f}k')
hist_ax.set_title('🏠 Distribution of House Prices', fontweight='bold', fontsize=14)
hist_ax.set_xlabel('Price ($1000s)')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()  # must come after axvline so the mean label is picked up

# Right panel: box plot of the same values
box_ax.boxplot(medv)
box_ax.set_title('📦 House Price Box Plot', fontweight='bold', fontsize=14)
box_ax.set_ylabel('Price ($1000s)')

plt.tight_layout()
plt.show()

print("📊 Price Statistics:")
print(f"• Mean: ${mean_price:.1f}k")
print(f"• Median: ${medv.median():.1f}k")
print(f"• Std Dev: ${medv.std():.1f}k")
print(f"• Skewness: {medv.skew():.2f}")

In [None]:
# Feature correlations with price.
# Correlation of every feature with MEDV (DataFrame.corr defaults), ordered
# strongest-first by absolute value. MEDV's self-correlation is removed by
# label rather than by position, so the result does not depend on it sorting
# first when another column ties at |r| = 1.
plt.figure(figsize=(12, 8))
correlations = df.corr()['MEDV'].drop('MEDV').sort_values(key=abs, ascending=False)

# Colour encodes the sign: red for negative, green for non-negative
colors = ['red' if x < 0 else 'green' for x in correlations.values]

bars = plt.barh(range(len(correlations)), correlations.values, color=colors, alpha=0.7)
plt.yticks(range(len(correlations)), correlations.index)
plt.xlabel('Correlation with House Price')
plt.title('🔗 Feature Correlations with House Prices', fontweight='bold', fontsize=16)
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)

# Value labels: placed just past the end of each bar, on the side it points to
for bar in bars:
    width = bar.get_width()
    points_right = width > 0
    plt.text(width + (0.01 if points_right else -0.01), bar.get_y() + bar.get_height() / 2,
             f'{width:.3f}', ha='left' if points_right else 'right', va='center',
             fontweight='bold')

plt.tight_layout()
plt.show()

# `correlations` is ordered by descending |r|, so after filtering by sign the
# strongest entries are at the head for BOTH signs.
print("🔍 Top Positive Correlations:")
positive_corr = correlations[correlations > 0].head(3)
for feature, corr in positive_corr.items():
    print(f"• {feature}: {corr:.3f}")

print("\n🔍 Top Negative Correlations:")
# head(3), not tail(3): tail would list the three WEAKEST negative correlations
negative_corr = correlations[correlations < 0].head(3)
for feature, corr in negative_corr.items():
    print(f"• {feature}: {corr:.3f}")

In [None]:
# Scatter plots for key relationships: a 2x2 grid, one feature per panel,
# each plotted against the price column.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('🔍 Key Feature Relationships with House Prices', fontsize=16, fontweight='bold')

# (column, point colour, x-axis label, panel title), in row-major panel order
panel_specs = [
    ('RM', 'blue', 'Average Number of Rooms (RM)', '🏠 Rooms vs Price'),
    ('LSTAT', 'red', '% Lower Status Population (LSTAT)', '📉 Lower Status % vs Price'),
    ('CRIM', 'orange', 'Crime Rate (CRIM)', '🚨 Crime Rate vs Price'),
    ('DIS', 'green', 'Distance to Employment Centers (DIS)', '🏢 Distance to Work vs Price'),
]

# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
for panel_ax, (column, point_color, x_label, panel_title) in zip(axes.flat, panel_specs):
    panel_ax.scatter(df[column], df['MEDV'], alpha=0.6, color=point_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Price ($1000s)')
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Scatter plots for key relationships (2x2 grid of feature-vs-price panels).
# NOTE(review): this cell repeats the preceding scatter-plot cell (same four
# panels, same figure) — confirm whether the duplicate is intentional or can
# be removed.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
fig.suptitle('🔍 Key Feature Relationships with House Prices', fontsize=16, fontweight='bold')

price_label = 'Price ($1000s)'

# Grid position -> what to draw there
grid_layout = {
    (0, 0): dict(column='RM', color='blue',
                 xlabel='Average Number of Rooms (RM)',
                 title='🏠 Rooms vs Price'),
    (0, 1): dict(column='LSTAT', color='red',
                 xlabel='% Lower Status Population (LSTAT)',
                 title='📉 Lower Status % vs Price'),
    (1, 0): dict(column='CRIM', color='orange',
                 xlabel='Crime Rate (CRIM)',
                 title='🚨 Crime Rate vs Price'),
    (1, 1): dict(column='DIS', color='green',
                 xlabel='Distance to Employment Centers (DIS)',
                 title='🏢 Distance to Work vs Price'),
}

for (row, col), spec in grid_layout.items():
    cell_ax = axes[row, col]
    cell_ax.scatter(df[spec['column']], df['MEDV'], alpha=0.6, color=spec['color'])
    cell_ax.set(xlabel=spec['xlabel'], ylabel=price_label, title=spec['title'])

plt.tight_layout()
plt.show()