# Lab Exam - Set 7

This notebook contains implementations for all questions in Set 7.

## Question 25: NumPy Arrays - Element-wise Computations and Matrix Operations

**Concepts:**
- **Element-wise operations**: Operations applied to each element individually (addition, multiplication, square, etc.)
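- **Broadcasting**: NumPy's ability to perform element-wise operations on arrays of different but compatible shapes by virtually stretching the smaller one (e.g., adding a scalar or a row vector to every row of a matrix)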
- **Matrix operations**: Linear algebra operations (dot product, matrix multiplication, transpose, inverse)
- **Dot product**: Sum of element-wise multiplication of two arrays
- **Matrix multiplication**: Row-by-column multiplication using `@` or `np.matmul()`

In [None]:
import numpy as np

# Section banner for the element-wise demonstrations
rule = "=" * 60
print(rule)
print("ELEMENT-WISE COMPUTATIONS")
print(rule)

# Two equal-length 1-D operands shared by every example in this section
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.array([10, 20, 30, 40, 50])

print("\nArray 1:", arr1)
print("Array 2:", arr2)

# Binary arithmetic: each output element combines the two inputs at that index
addition = np.add(arr1, arr2)
subtraction = np.subtract(arr2, arr1)
multiplication = np.multiply(arr1, arr2)
division = np.divide(arr2, arr1)  # true division, so the result is floating point

# Unary transforms applied to every element independently
power = np.square(arr1)
sqrt = np.sqrt(arr2)
exp = np.exp(arr1)

# Report the results in the order they were computed; a label starting with
# "\n" opens a new visual group in the output
for label, result in (
    ("\nElement-wise Addition:", addition),
    ("Element-wise Subtraction:", subtraction),
    ("Element-wise Multiplication:", multiplication),
    ("Element-wise Division:", division),
    ("\nElement-wise Square (arr1^2):", power),
    ("Element-wise Square Root (sqrt(arr2)):", sqrt),
    ("Element-wise Exponential (e^arr1):", exp),
):
    print(label, result)

# Broadcasting example
banner = "=" * 60
print(f"\n{banner}")
print("BROADCASTING")
print(banner)

# Operands of three different shapes: a (3, 3) matrix, a plain scalar and a
# length-3 vector
matrix = np.arange(1, 10).reshape(3, 3)
scalar = 10
vector = np.arange(1, 4)

print("\nOriginal Matrix:")
print(matrix)

# Scalar broadcasting: the scalar is combined with every element
scalar_sum = matrix + scalar
print("\nAdd scalar 10 to all elements:")
print(scalar_sum)

# Vector broadcasting: the vector is lined up with the last axis, so the same
# three values are added to each row
row_sum = matrix + vector
print("\nAdd vector [1, 2, 3] to each row:")
print(row_sum)

section_rule = "=" * 60
print(f"\n{section_rule}")
print("MATRIX OPERATIONS")
print(section_rule)

# Rectangular operands whose inner dimensions agree (2x3 times 3x2)
matrix_a = np.arange(1, 7).reshape(2, 3)
matrix_b = np.arange(7, 13).reshape(3, 2)

print("\nMatrix A (2x3):")
print(matrix_a)
print("\nMatrix B (3x2):")
print(matrix_b)

# Matrix product: each output entry is a row of A dotted with a column of B
mat_mult = matrix_a @ matrix_b  # same operation as np.matmul(matrix_a, matrix_b)
print("\nMatrix Multiplication (A @ B) - Result (2x2):")
print(mat_mult)

# Dot product of two 1-D vectors (a single scalar)
vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])
dot_product = vec1.dot(vec2)
print("\nDot product of [1,2,3] and [4,5,6]:", dot_product)

# Transpose swaps the row and column axes
print("\nTranspose of Matrix A:")
print(np.transpose(matrix_a))

# Determinant, inverse and eigen-decomposition are only defined for square
# matrices, so a separate 2x2 example is used from here on
square_matrix = np.array([[4, 7],
                          [2, 6]])
print("\nSquare Matrix (2x2):")
print(square_matrix)

det = np.linalg.det(square_matrix)
print("\nDeterminant:", det)

inverse = np.linalg.inv(square_matrix)
print("\nInverse Matrix:")
print(inverse)

# Sanity check: a matrix times its inverse should be the identity. The product
# is rounded only for display, to hide floating-point noise.
identity = np.matmul(square_matrix, inverse)
print("\nVerification (A @ A_inverse = I):")
print(identity.round(2))

# Eigen-decomposition: column i of `eigenvectors` pairs with eigenvalues[i]
eigenvalues, eigenvectors = np.linalg.eig(square_matrix)
print("\nEigenvalues:", eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

## Question 26: Data Visualization - Bar Plots and Histograms for Price Features

**Concepts:**
- **Bar plot**: Categorical data visualization showing values with rectangular bars
- **Histogram**: Shows frequency distribution of continuous numerical data
- **Price analysis**: Understanding price distributions, ranges, and patterns
- **Matplotlib & Seaborn**: Python libraries for creating visualizations
- **Bins**: Intervals that group continuous data in histograms

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for better-looking plots: seaborn's 'whitegrid' theme is applied
# globally, so it affects every figure created after this call
sns.set_style('whitegrid')

# Create sample price-related dataset
np.random.seed(42)
n_samples = 100

# Draw every column in a fixed order so the seeded random stream, and therefore
# each generated value, is reproducible from run to run.
products = np.random.choice(['Electronics', 'Clothing', 'Food', 'Books', 'Furniture'], n_samples)
prices = np.random.normal(500, 200, n_samples).clip(50, 2000)  # Prices between 50 and 2000
original_prices = np.random.normal(600, 250, n_samples).clip(100, 2500)
shipping_costs = np.random.uniform(10, 100, n_samples)
ratings = np.random.uniform(1, 5, n_samples)

# The list price is sampled independently of the selling price, so it can come
# out lower. Floor it at the selling price so that no product carries a
# negative discount (a "discount" that would actually be a surcharge).
original_prices = np.maximum(original_prices, prices)

data = {
    'Product': products,
    'Price': prices,
    'Original_Price': original_prices,
    'Shipping_Cost': shipping_costs,
    'Rating': ratings,
}

df = pd.DataFrame(data)
# Derived columns: amount saved relative to the list price, and what the buyer
# pays once shipping is included
df['Discount'] = df['Original_Price'] - df['Price']
df['Total_Cost'] = df['Price'] + df['Shipping_Cost']

print("Sample Data:")
print(df.head(10))
print("\nDataset Info:")
print(df.describe())

# Create figure with multiple subplots: histograms on the top row, bar charts
# on the bottom row
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Price Analysis - Bar Plots and Histograms', fontsize=16, fontweight='bold')

# Panels 1-3: one histogram per price column. Each row of the table holds
# (column, title, x-label, bar colour, reference value, line colour, legend prefix);
# the reference value is drawn as a dashed vertical line.
histogram_panels = [
    ('Price', 'Price Distribution', 'Price ($)',
     'skyblue', df['Price'].mean(), 'red', 'Mean'),
    ('Total_Cost', 'Total Cost Distribution (Price + Shipping)', 'Total Cost ($)',
     'lightcoral', df['Total_Cost'].median(), 'green', 'Median'),
    ('Discount', 'Discount Distribution', 'Discount ($)',
     'lightgreen', df['Discount'].mean(), 'red', 'Avg'),
]
for panel, (column, title, x_label, fill, reference, line_color, prefix) in zip(axes[0], histogram_panels):
    panel.hist(df[column], bins=20, color=fill, edgecolor='black', alpha=0.7)
    panel.set_title(title, fontweight='bold')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.axvline(reference, color=line_color, linestyle='--', linewidth=2,
                  label=f'{prefix}: ${reference:.2f}')
    panel.legend()

# Per-category aggregates behind panels 4-6
avg_price_by_product = df.groupby('Product')['Price'].mean().sort_values(ascending=False)
avg_shipping_by_product = df.groupby('Product')['Shipping_Cost'].mean().sort_values(ascending=False)
product_counts = df['Product'].value_counts()


def _as_dollars(value):
    # Format a bar value as a dollar amount with two decimals
    return f'${value:.2f}'


# Panels 4-6: one bar chart per aggregate. Each row of the table holds
# (series, title, y-label, bar colour, label offset above the bar, label formatter).
bar_panels = [
    (avg_price_by_product, 'Average Price by Product Category', 'Average Price ($)',
     'steelblue', 10, _as_dollars),
    (avg_shipping_by_product, 'Average Shipping Cost by Product Category', 'Average Shipping Cost ($)',
     'orange', 1, _as_dollars),
    (product_counts, 'Product Count by Category', 'Count',
     'mediumpurple', 0.5, str),
]
for panel, (series, title, y_label, fill, gap, formatter) in zip(axes[1], bar_panels):
    panel.bar(series.index, series.values, color=fill, edgecolor='black')
    panel.set_title(title, fontweight='bold')
    panel.set_xlabel('Product Category')
    panel.set_ylabel(y_label)
    panel.tick_params(axis='x', rotation=45)
    # Add value labels on bars, nudged upward by `gap` so they clear the bar top
    for position, value in enumerate(series.values):
        panel.text(position, value + gap, formatter(value),
                   ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Additional visualization - Combined histogram with multiple price features.
# The three distributions share one axes; partial transparency keeps the
# overlapping regions readable.
fig, ax = plt.subplots(figsize=(12, 6))
overlay_series = (
    ('Price', 'Price', 'blue'),
    ('Original_Price', 'Original Price', 'green'),
    ('Total_Cost', 'Total Cost', 'red'),
)
for column, legend_label, shade in overlay_series:
    ax.hist(df[column], bins=20, alpha=0.5, label=legend_label,
            color=shade, edgecolor='black')

ax.set_title('Overlapping Histograms - Price Comparison', fontsize=14, fontweight='bold')
ax.set_xlabel('Amount ($)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary statistics: headline figures for the price-related columns
summary_rule = "=" * 60
price_column = df['Price']

summary_lines = [
    "\n" + summary_rule,
    "PRICE SUMMARY STATISTICS",
    summary_rule,
    f"\nAverage Price: ${price_column.mean():.2f}",
    f"Median Price: ${price_column.median():.2f}",
    f"Price Range: ${price_column.min():.2f} - ${price_column.max():.2f}",
    f"\nAverage Discount: ${df['Discount'].mean():.2f}",
    f"Average Shipping Cost: ${df['Shipping_Cost'].mean():.2f}",
    f"Average Total Cost: ${df['Total_Cost'].mean():.2f}",
]
for line in summary_lines:
    print(line)

## Question 27: Bar and Density Plots - Feature Distribution Comparison Across Categories

**Concepts:**
- **Category comparison**: Analyzing how features differ across different groups
- **Density plot (KDE)**: Kernel Density Estimation - smooth curve showing probability distribution
- **Grouped bar plots**: Comparing multiple categories side by side
- **Distribution analysis**: Understanding data spread, central tendency, and patterns
- **Seaborn**: Advanced statistical visualization library built on matplotlib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style: 'whitegrid' theme for all later figures, and seaborn's 'husl'
# palette as the default colour cycle (panels that pass explicit colours
# below are unaffected by the palette)
sns.set_style('whitegrid')
sns.set_palette('husl')

# Create a comprehensive dataset with multiple categories
np.random.seed(42)
n_samples = 200

# Every category gets the same number of rows but its own distribution
# parameters. Each profile is
# ((Feature1 mean, std), (Feature2 mean, std), Feature3 scale, (Age low, Age high)).
rows_per_category = 50
category_profiles = {
    'Category A': ((50, 10), (100, 15), 30, (20, 60)),
    'Category B': ((65, 12), (85, 20), 40, (25, 65)),
    'Category C': ((40, 8), (110, 18), 25, (18, 55)),
    'Category D': ((55, 15), (95, 12), 35, (22, 70)),
}

# Build one frame per category. The columns are sampled in the order they are
# listed, which fixes the sequence of draws from the seeded generator.
categories_list = []
for label, (feature1_params, feature2_params, feature3_scale, age_bounds) in category_profiles.items():
    category_frame = pd.DataFrame({
        'Category': [label] * rows_per_category,
        'Feature1': np.random.normal(feature1_params[0], feature1_params[1], rows_per_category),
        'Feature2': np.random.normal(feature2_params[0], feature2_params[1], rows_per_category),
        'Feature3': np.random.exponential(feature3_scale, rows_per_category),
        'Age': np.random.randint(age_bounds[0], age_bounds[1], rows_per_category),
    })
    categories_list.append(category_frame)

# Stack the per-category frames into a single table with a fresh 0..N-1 index
df = pd.concat(categories_list, ignore_index=True)

print("Sample Data:")
print(df.head(10))
print("\nData shape:", df.shape)
print("\nCategory counts:")
print(df['Category'].value_counts())

# Create comprehensive visualization: a 3x4 grid of panels on one figure
fig = plt.figure(figsize=(18, 12))
fig.suptitle('Feature Distribution Comparison Across Categories', fontsize=16, fontweight='bold')

# Define features to analyze
features = ['Feature1', 'Feature2', 'Feature3']

# Plot 1-3: one bar chart of per-category means for each feature
bar_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']
for slot, feature in enumerate(features, start=1):
    panel = plt.subplot(3, 4, slot)
    category_means = df.groupby('Category')[feature].mean()
    rects = panel.bar(category_means.index, category_means.values,
                      color=bar_palette, edgecolor='black', linewidth=1.5)
    panel.set_title(f'{feature} - Mean by Category', fontweight='bold')
    panel.set_xlabel('Category')
    panel.set_ylabel(f'Mean {feature}')
    panel.tick_params(axis='x', rotation=45)
    # Write each bar's value centred just above its top edge
    for rect in rects:
        top = rect.get_height()
        panel.text(rect.get_x() + rect.get_width() / 2., top,
                   f'{top:.2f}', ha='center', va='bottom', fontweight='bold')

# Plot 4: Grouped bar plot - for every category, the three feature means are
# drawn side by side
ax = plt.subplot(3, 4, 4)
category_labels = df['Category'].unique()
x = np.arange(len(category_labels))
width = 0.25
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# Each feature's bars are shifted right by one bar width per feature so the
# three bars of a category sit next to each other
for offset, (feature, shade) in enumerate(zip(features, colors)):
    means = [df.loc[df['Category'] == label, feature].mean() for label in category_labels]
    ax.bar(x + offset * width, means, width, label=feature, color=shade, edgecolor='black')

ax.set_title('All Features Comparison', fontweight='bold')
ax.set_xlabel('Category')
ax.set_ylabel('Mean Value')
# Ticks go under the middle bar of each three-bar group
ax.set_xticks(x + width)
ax.set_xticklabels(category_labels, rotation=45)
ax.legend()

# Plot 5-8: Density plots - panels 5-7 show one feature each, panel 8 shows
# Age; every panel overlays one curve per category
density_panels = [(feature, f'{feature} - Density Plot') for feature in features]
density_panels.append(('Age', 'Age Distribution by Category'))

for slot, (column, title) in enumerate(density_panels, start=5):
    ax = plt.subplot(3, 4, slot)
    # sort=False keeps the categories in order of first appearance in df
    for category, group in df.groupby('Category', sort=False):
        group[column].plot.density(ax=ax, label=category, linewidth=2.5)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(column)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Plot 9-11: Box plots for distribution comparison (one panel per feature,
# one box per category)
for idx, feature in enumerate(features):
    ax = plt.subplot(3, 4, idx + 9)
    df.boxplot(column=feature, by='Category', ax=ax, patch_artist=True)
    # pandas titles the panel itself when `by` is used; replace that with the
    # same bold "<feature> - ..." style the other panels use
    ax.set_title(f'{feature} - Box Plot', fontweight='bold')
    ax.set_xlabel('Category')
    ax.set_ylabel(feature)
    plt.sca(ax)
    plt.xticks(rotation=45)

# DataFrame.boxplot(by=...) also replaces the figure-level title with
# "Boxplot grouped by Category", so put the figure's own title back once the
# last box plot has been drawn.
fig.suptitle('Feature Distribution Comparison Across Categories', fontsize=16, fontweight='bold')

# Plot 12: Violin plot for Feature1 - one violin per category, in order of
# first appearance in df
ax = plt.subplot(3, 4, 12)
violin_labels = df['Category'].unique()
violin_positions = range(len(violin_labels))
feature1_by_category = [
    df.loc[df['Category'] == label, 'Feature1'].values
    for label in violin_labels
]
parts = ax.violinplot(feature1_by_category,
                      positions=violin_positions,
                      showmeans=True, showmedians=True)
ax.set_title('Feature1 - Violin Plot', fontweight='bold')
ax.set_xlabel('Category')
ax.set_ylabel('Feature1')
# Violins sit at integer positions, so label those positions by category name
ax.set_xticks(violin_positions)
ax.set_xticklabels(violin_labels, rotation=45)

# Finalise and display the whole 3x4 figure
plt.tight_layout()
plt.show()

# Additional: seaborn KDE plots for advanced comparison - a separate figure
# with one panel per feature and the categories overlaid as filled curves
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Seaborn Advanced Distribution Plots', fontsize=14, fontweight='bold')

for panel, feature in zip(axes, features):
    sns.kdeplot(data=df, x=feature, hue='Category', fill=True,
                ax=panel, alpha=0.5, linewidth=2)
    panel.set_title(f'{feature} Distribution', fontweight='bold')
    panel.set_xlabel(feature)
    panel.set_ylabel('Density')

plt.tight_layout()
plt.show()

# Print statistical summary: describe() of the three features, once per category
summary_rule = "=" * 80
print("\n" + summary_rule)
print("STATISTICAL SUMMARY BY CATEGORY")
print(summary_rule)

# sort=False keeps the categories in order of first appearance in df
for category, group in df.groupby('Category', sort=False):
    print(f"\n{category}:")
    print(group[features].describe().round(2))

## Question 28: Linear Regression - House Price Prediction with Residual Analysis

**Concepts:**
- **Linear Regression**: Supervised learning algorithm that models relationship between features and target
- **House Price Prediction**: Predicting house prices based on features like area, rooms, location, etc.
- **Residuals**: Difference between actual and predicted values (Error = Actual - Predicted)
- **Residual Plot**: Scatter plot showing residuals vs predicted values (checks model assumptions)
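- **R² Score**: Coefficient of determination - the proportion of target variance the model explains (at most 1; usually between 0 and 1, but it can be negative on unseen data when the model does worse than always predicting the mean)
- **MSE/RMSE**: Mean Squared Error / Root Mean Squared Error - measure prediction error (lower is better); RMSE is in the same units as the target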

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Set style: seaborn's 'whitegrid' theme is applied globally, so it affects
# every figure created after this call
sns.set_style('whitegrid')

# Create synthetic house price dataset
np.random.seed(42)
n_samples = 200

# Generate features. They are sampled in this fixed order so the seeded
# random stream always produces the same dataset.
area = np.random.uniform(500, 3500, n_samples)  # Square feet
bedrooms = np.random.randint(1, 6, n_samples)
bathrooms = np.random.randint(1, 4, n_samples)
age = np.random.randint(0, 50, n_samples)  # Years old
distance_to_city = np.random.uniform(1, 30, n_samples)  # Miles
garage = np.random.randint(0, 3, n_samples)

# Random noise so the target is not an exact linear function of the features
noise = np.random.normal(0, 30000, n_samples)

# Generate target (price) one term at a time:
# base + area + rooms + garage - age - distance + noise
price = 100000 + area * 150
price = price + bedrooms * 20000
price = price + bathrooms * 15000
price = price + garage * 10000
price = price - age * 1000
price = price - distance_to_city * 2000
price = price + noise

# Create DataFrame: six predictor columns followed by the target column
column_values = {
    'Area_sqft': area,
    'Bedrooms': bedrooms,
    'Bathrooms': bathrooms,
    'Age_years': age,
    'Distance_to_City_miles': distance_to_city,
    'Garage_spaces': garage,
    'Price': price,
}
df = pd.DataFrame(column_values)

print("House Price Dataset:")
print(df.head(10))
print("\nDataset shape:", df.shape)
print("\nDataset Statistics:")
print(df.describe().round(2))

# Prepare features (X) and target (y): every column except Price is a predictor
y = df['Price']
X = df.drop(columns='Price')

print("\nFeatures:", X.columns.tolist())
print("Target: Price")

# Split data into training and testing sets (80-20 split); the fixed
# random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Create and train Linear Regression model on the training portion only
model = LinearRegression()
model.fit(X_train, y_train)

heavy_rule = "=" * 80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("MODEL TRAINING COMPLETE")
print(heavy_rule)

# Make predictions for both splits so they can be compared side by side
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate residuals (errors) as actual minus predicted
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred

# Calculate metrics, pairing the training value with the testing value
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
)
train_mae, test_mae = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_test_pred),
)

# Print model performance as a fixed-width table
print("\nMODEL PERFORMANCE METRICS:")
print(light_rule)
print(f"{'Metric':<30} {'Training Set':<25} {'Testing Set':<25}")
print(light_rule)
print(f"{'R² Score:':<30} {train_r2:<25.4f} {test_r2:<25.4f}")
# RMSE and MAE are dollar amounts and share one row format
for metric_name, train_value, test_value in (
    ('RMSE:', train_rmse, test_rmse),
    ('MAE:', train_mae, test_mae),
):
    print(f"{metric_name:<30} ${train_value:<24,.2f} ${test_value:<24,.2f}")
print(light_rule)

# Print feature coefficients with the sign of each one spelled out
print("\nFEATURE COEFFICIENTS:")
print(light_rule)
print(f"{'Feature':<30} {'Coefficient':<25} {'Impact'}")
print(light_rule)
print(f"{'Intercept:':<30} ${model.intercept_:<24,.2f}")
for feature, coef in zip(X.columns, model.coef_):
    if coef > 0:
        impact = "Positive"
    else:
        impact = "Negative"
    print(f"{feature:<30} ${coef:<24,.2f} {impact}")
print(light_rule)

# Create comprehensive visualization: a 3x3 grid of diagnostic panels
fig = plt.figure(figsize=(18, 12))
fig.suptitle('Linear Regression - House Price Prediction Analysis', fontsize=16, fontweight='bold')

# Plot 1-2: Actual vs Predicted, training set then testing set. The dashed
# diagonal marks where predicted equals actual.
scatter_panels = (
    (1, 'Training', y_train, y_train_pred, train_r2, 'blue'),
    (2, 'Testing', y_test, y_test_pred, test_r2, 'green'),
)
for slot, split_name, actual, predicted, split_r2, shade in scatter_panels:
    panel = plt.subplot(3, 3, slot)
    panel.scatter(actual, predicted, alpha=0.6, color=shade, edgecolor='black')
    lowest, highest = actual.min(), actual.max()
    panel.plot([lowest, highest], [lowest, highest],
               'r--', lw=2, label='Perfect Prediction')
    panel.set_xlabel('Actual Price ($)', fontweight='bold')
    panel.set_ylabel('Predicted Price ($)', fontweight='bold')
    panel.set_title(f'{split_name} Set: Actual vs Predicted\nR² = {split_r2:.4f}', fontweight='bold')
    panel.legend()
    panel.grid(True, alpha=0.3)

# Plot 3: Feature Coefficients as horizontal bars, green when positive and
# red otherwise, with a vertical reference line at zero
ax3 = plt.subplot(3, 3, 3)
colors = []
for coefficient in model.coef_:
    colors.append('green' if coefficient > 0 else 'red')
bars = ax3.barh(X.columns, model.coef_, color=colors, edgecolor='black')
ax3.set_xlabel('Coefficient Value', fontweight='bold')
ax3.set_title('Feature Importance (Coefficients)', fontweight='bold')
ax3.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
ax3.grid(True, alpha=0.3, axis='x')

# Plot 4-5: Residuals vs Predicted, training set then testing set. The dashed
# horizontal line at zero marks a perfect prediction.
residual_panels = (
    (4, 'Training', y_train_pred, train_residuals, 'blue'),
    (5, 'Testing', y_test_pred, test_residuals, 'green'),
)
for slot, split_name, predicted, residuals, shade in residual_panels:
    panel = plt.subplot(3, 3, slot)
    panel.scatter(predicted, residuals, alpha=0.6, color=shade, edgecolor='black')
    panel.axhline(y=0, color='red', linestyle='--', linewidth=2)
    panel.set_xlabel('Predicted Price ($)', fontweight='bold')
    panel.set_ylabel('Residuals ($)', fontweight='bold')
    panel.set_title(f'{split_name} Set: Residual Plot', fontweight='bold')
    panel.grid(True, alpha=0.3)

# scipy is only needed for the Q-Q panel below
from scipy import stats

# Plot 6: Distribution of Residuals (Testing Set) with a marker at zero error
ax6 = plt.subplot(3, 3, 6)
ax6.hist(test_residuals, bins=30, color='orange', edgecolor='black', alpha=0.7)
ax6.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Zero Error')
ax6.set_xlabel('Residuals ($)', fontweight='bold')
ax6.set_ylabel('Frequency', fontweight='bold')
ax6.set_title('Distribution of Residuals (Testing Set)', fontweight='bold')
ax6.legend()
ax6.grid(True, alpha=0.3)

# Plot 7: Q-Q Plot for Residuals (Testing Set) - compares the residual
# quantiles against those of a normal distribution
ax7 = plt.subplot(3, 3, 7)
stats.probplot(test_residuals, dist="norm", plot=ax7)
ax7.set_title('Q-Q Plot: Residuals Normality Check', fontweight='bold')
ax7.grid(True, alpha=0.3)

# Plot 8: Prediction Error Distribution - each test residual expressed as a
# percentage of the actual price
error_percentage = 100 * (test_residuals / y_test)
ax8 = plt.subplot(3, 3, 8)
ax8.hist(error_percentage, bins=30, color='purple', edgecolor='black', alpha=0.7)
ax8.axvline(x=0, color='red', linestyle='--', linewidth=2)
ax8.set_xlabel('Error Percentage (%)', fontweight='bold')
ax8.set_ylabel('Frequency', fontweight='bold')
ax8.set_title('Prediction Error Distribution (%)', fontweight='bold')
ax8.grid(True, alpha=0.3)

# Plot 9: Model Performance Comparison - training vs testing bars for each
# metric, drawn side by side
ax9 = plt.subplot(3, 3, 9)
metrics = ['R² Score', 'RMSE', 'MAE']
# RMSE and MAE are divided by 1000 (shown in $1000s, as the panel title
# states); R² is plotted unscaled
train_metrics = [train_r2, train_rmse/1000, train_mae/1000]
test_metrics = [test_r2, test_rmse/1000, test_mae/1000]

x = np.arange(len(metrics))
width = 0.35
half_width = width/2

# Training bars sit left of each tick, testing bars right of it
bars1 = ax9.bar(x - half_width, train_metrics, width, label='Training', color='blue', edgecolor='black')
bars2 = ax9.bar(x + half_width, test_metrics, width, label='Testing', color='green', edgecolor='black')

ax9.set_ylabel('Value', fontweight='bold')
ax9.set_title('Model Performance Comparison\n(RMSE & MAE in $1000s)', fontweight='bold')
ax9.set_xticks(x)
ax9.set_xticklabels(metrics)
ax9.legend()
ax9.grid(True, alpha=0.3, axis='y')

# Add value labels on bars, centred just above each bar's top edge
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax9.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.3f}', ha='center', va='bottom', fontsize=8)

# Finalise and display the whole 3x3 figure
plt.tight_layout()
plt.show()

# Sample predictions: the first ten test rows, with the residual shown both
# in dollars and as a percentage of the actual price
table_rule = "=" * 80
print("\n" + table_rule)
print("SAMPLE PREDICTIONS (First 10 from Test Set)")
print(table_rule)

actual_head = y_test.values[:10]
predicted_head = y_test_pred[:10]
residual_head = test_residuals.values[:10]

comparison_df = pd.DataFrame({
    'Actual_Price': actual_head,
    'Predicted_Price': predicted_head,
    'Residual': residual_head,
    'Error_%': (residual_head / actual_head) * 100,
})
print(comparison_df.round(2))

# Closing notes: restate the test-set metrics in plain language and explain
# how to read the diagnostic plots.
print("\n" + "="*80)
print("MODEL INTERPRETATION")
print("="*80)
print("The Linear Regression model successfully predicts house prices based on features.")
print(f"\nR² Score: {test_r2:.4f} means the model explains {test_r2*100:.2f}% of price variance.")
# RMSE is a root-mean-square figure, so it weights large misses more heavily;
# the plain average error magnitude is the MAE, reported on its own line.
print(f"RMSE: ${test_rmse:,.2f} is the root-mean-square prediction error (large misses weigh more).")
print(f"MAE: ${test_mae:,.2f} is the average prediction error magnitude.")
# Worded as guidance: nothing here inspects the plots, so the text should not
# claim what they show.
print("\nResidual plots that show random scatter around zero indicate a good model fit.")
print("Q-Q plot helps verify that residuals follow normal distribution (good model assumption).")