In [4]:
#QUESTION 1 & 2
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('housing/housing.csv')

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Handle missing values in total_bedrooms.
# Each missing value is replaced by the median total_bedrooms of the rows
# that share the same ocean_proximity category.

# Median total_bedrooms for each ocean_proximity category
bedroom_median_by_location = df.groupby('ocean_proximity')['total_bedrooms'].median()

# Impute missing values: look up every row's category median, then use it
# only where total_bedrooms is missing. This is a vectorised equivalent of a
# row-by-row DataFrame.apply(axis=1) and avoids one Python call per row.
df['total_bedrooms'] = df['total_bedrooms'].fillna(
    df['ocean_proximity'].map(bedroom_median_by_location)
)

# Verify no more missing values
print("\nMissing values after imputation:")
print(df.isnull().sum())

# Save the cleaned dataset
df.to_csv('housing/cleaned_housing_data.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_housing_data.csv'")
# Alternative approach: simple (global) median imputation.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which does not reliably modify the parent DataFrame.
# df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

Missing values per column:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

Missing values after imputation:
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

Cleaned dataset saved as 'cleaned_housing_data.csv'


In [5]:
#QUESTION 4, HANDLE THE NON-NUMERICAL FIELD
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Turn ocean_proximity into one indicator column per category; every new
# column name starts with "ocean_"
ohe_df = pd.get_dummies(df['ocean_proximity'], prefix='ocean')

# Replace the text column with its indicator columns (appended on the right)
df_encoded = pd.concat(
    [df.drop(columns=['ocean_proximity']), ohe_df],
    axis='columns',
)

print("One-Hot Encoding Results:")
print(ohe_df.head())
print(f"\nNew columns: {ohe_df.columns.tolist()}")

One-Hot Encoding Results:
   ocean_<1H OCEAN  ocean_INLAND  ocean_ISLAND  ocean_NEAR BAY  \
0            False         False         False            True   
1            False         False         False            True   
2            False         False         False            True   
3            False         False         False            True   
4            False         False         False            True   

   ocean_NEAR OCEAN  
0             False  
1             False  
2             False  
3             False  
4             False  

New columns: ['ocean_<1H OCEAN', 'ocean_INLAND', 'ocean_ISLAND', 'ocean_NEAR BAY', 'ocean_NEAR OCEAN']


In [9]:
#QUESTION 5 :: Use Scikit-learn Pipeline class for transformation

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd


# Custom transformer for your specific bedroom imputation strategy
class BedroomImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``total_bedrooms`` values with per-``ocean_proximity`` medians.

    ``fit`` learns the median ``total_bedrooms`` of each ``ocean_proximity``
    category, plus the overall median as a fallback. ``transform`` fills every
    missing ``total_bedrooms`` with its category's median; when the category
    was not present during ``fit`` (or had no usable median) the overall
    median is used instead of raising ``KeyError`` / leaving the gap in place.

    The input must be a pandas DataFrame containing the ``ocean_proximity``
    and ``total_bedrooms`` columns.

    Attributes
    ----------
    bedroom_medians_ : pandas.Series or None
        Median ``total_bedrooms`` indexed by ``ocean_proximity``; ``None``
        until ``fit`` has been called.
    overall_median_ : float or None
        Median ``total_bedrooms`` over all rows seen in ``fit``; ``None``
        until ``fit`` has been called.
    """

    def __init__(self):
        self.bedroom_medians_ = None
        self.overall_median_ = None

    def fit(self, X, y=None):
        """Learn the per-category and overall medians from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``ocean_proximity`` and ``total_bedrooms``.
        y : ignored
            Accepted so the transformer can be used inside a Pipeline.

        Returns
        -------
        BedroomImputer
            ``self``, to allow call chaining.
        """
        # Calculate median bedrooms by ocean_proximity during fitting
        self.bedroom_medians_ = X.groupby('ocean_proximity')['total_bedrooms'].median()
        # Fallback for categories that are missing from the fitting data
        self.overall_median_ = X['total_bedrooms'].median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``total_bedrooms`` filled in.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``ocean_proximity`` and ``total_bedrooms``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; the input frame is not modified.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        if self.bedroom_medians_ is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "BedroomImputer is not fitted yet; call fit() before transform()."
            )

        X_copy = X.copy()
        # Median of each row's category; NaN where the category is unknown
        fill_values = X_copy['ocean_proximity'].map(self.bedroom_medians_)
        # Unknown category (or category with no median) -> overall median
        fill_values = fill_values.fillna(self.overall_median_)
        # Only rows whose total_bedrooms is missing are changed
        X_copy['total_bedrooms'] = X_copy['total_bedrooms'].fillna(fill_values)
        return X_copy


# Column groups fed to the ColumnTransformer: eight numeric inputs and the
# single text column
numeric_features = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
]
categorical_features = ['ocean_proximity']

# Preprocessing pipeline: bedrooms are imputed first (the imputer needs the
# raw ocean_proximity column), then each column group gets its own treatment
preprocessing_pipeline = Pipeline(steps=[
    ('bedroom_imputer', BedroomImputer()),
    ('column_transformer', ColumnTransformer(transformers=[
        # Numeric columns are standardised
        ('numeric', StandardScaler(), numeric_features),
        # The text column is one-hot encoded into a dense array; categories
        # not seen while fitting are ignored instead of raising
        ('categorical',
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         categorical_features),
    ])),
])

# Usage example for preprocessing_pipeline
if __name__ == "__main__":
    from sklearn.model_selection import train_test_split

    # Reload the raw CSV (this overwrites any `df` already in the namespace)
    df = pd.read_csv('housing/housing.csv')

    # 80/20 split with a fixed seed
    train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

    # Learn the preprocessing parameters on the training rows only ...
    X_train_processed = preprocessing_pipeline.fit_transform(train_df)

    # ... and reuse them, unchanged, on the test rows
    X_test_processed = preprocessing_pipeline.transform(test_df)

    print("Training data shape after preprocessing:", X_train_processed.shape)
    print("Test data shape after preprocessing:", X_test_processed.shape)

    # X_train_processed / X_test_processed are now ready for modelling

Training data shape after preprocessing: (16512, 13)
Test data shape after preprocessing: (4128, 13)


In [8]:
#QUESTION 6 :: Train a Linear Regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Start again from the raw CSV; imputation happens inside the pipeline
df = pd.read_csv('housing/housing.csv')

# Target column vs. everything else
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Preprocessing and the regressor bundled into one estimator, so the
# preprocessing parameters are learned from the training rows only
full_pipeline = Pipeline(steps=[
    ('bedroom_imputer', BedroomImputer()),
    ('column_transformer', ColumnTransformer(transformers=[
        ('numeric', StandardScaler(), numeric_features),
        ('categorical',
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         categorical_features),
    ])),
    ('linear_regression', LinearRegression()),
])

# Fit on the training split
print("Training Linear Regression model...")
full_pipeline.fit(X_train, y_train)

# Predict the held-out split
y_pred = full_pipeline.predict(X_test)

# Hold-out metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Squared Error (MSE): {mse:,.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")
print(f"R² Score: {r2:.4f}")

# First ten test rows: actual value, prediction, and signed error
print("\nSample predictions vs actual:")
results_df = pd.DataFrame({
    'Actual': y_test.values[:10],
    'Predicted': y_pred[:10],
})
results_df['Difference'] = results_df['Actual'] - results_df['Predicted']
print(results_df.round(2))

Training Linear Regression model...

Model Evaluation:
Mean Squared Error (MSE): 4,907,069,535.27
Root Mean Squared Error (RMSE): 70,050.48
R² Score: 0.6255

Sample predictions vs actual:
     Actual  Predicted  Difference
0   47700.0   53130.34    -5430.34
1   45800.0  123300.23   -77500.23
2  500001.0  254769.85   245231.15
3  218600.0  268413.59   -49813.59
4  278000.0  265647.54    12352.46
5  158700.0  138681.20    20018.80
6  198200.0  291076.58   -92876.58
7  157500.0  228676.03   -71176.03
8  340000.0  255787.26    84212.74
9  446600.0  408335.02    38264.98


In [10]:
#QUESTION 7 : Interpret the results

# Interpretation of Results:
# 1. Overall Model Performance:
# R² Score: 0.6255 - This means your model explains 62.55% of the variance in housing prices. This is actually quite good for a linear regression on real estate data, where many factors influence prices.
#
# RMSE: $70,050 - A typical prediction is off by roughly $70,000. RMSE squares the errors before averaging, so a few large misses raise it more than they would raise a plain average error.
#
# 2. What These Numbers Mean in Context:
# Good news: The model has learned meaningful patterns (62.55% explained variance is respectable)
#
# Reality check: Housing prices are complex! The $70K average error shows there are factors not captured by your features
#
# 3. Sample Predictions Analysis:
# Looking at your sample predictions:
#
# Good Predictions:
#
# Row 4: Predicted $265,647 vs Actual $278,000 (only $12,352 off - 4.4% error)
#
# Row 9: Predicted $408,335 vs Actual $446,600 ($38,265 off - 8.6% error)
#
# Row 5: Predicted $138,681 vs Actual $158,700 ($20,019 off - 12.6% error)
#
# Problematic Predictions:
#
# Row 1: Predicted $123,300 vs Actual $45,800 (overestimated by 169%)
#
# Row 6: Predicted $291,077 vs Actual $198,200 (overestimated by 47%)
#
# Row 2: Predicted $254,770 vs Actual $500,001 (underestimated by 49%)
#
# 4. Why Some Predictions Are So Far Off:
# Extreme values: California housing has very high price variations
#
# Non-linear relationships: Linear regression assumes straight-line relationships, but housing prices often have complex, non-linear patterns
#
# Missing features: Factors like school quality, crime rates, proximity to amenities aren't in your dataset

# 5. Business Interpretation:
# Your model is good enough for initial estimates but shouldn't be used for precise valuations. It could be useful for:
#
# Quick market analysis
#
# Identifying undervalued/overvalued properties (large differences like row 2)
#
# Understanding which factors most influence prices
#
# Conclusion: Solid first model! The 62.55% R² is respectable, but the high RMSE and some large errors suggest either need for better algorithms or more features.

In [None]:
#QUESTION 8 ::: Calculate the Root-Mean-Squared-Error of your model

# Results from Question 6: the RMSE was already calculated there

# Model Evaluation:
# Mean Squared Error (MSE): 4,907,069,535.27
# Root Mean Squared Error (RMSE): 70,050.48
# R² Score: 0.6255

#
# Your RMSE Result:
# RMSE = $70,050.48
#
# How This Was Calculated:
# From your output:
#
# MSE = 4,907,069,535.27 (Mean Squared Error)
#
# RMSE = √MSE = √4,907,069,535.27 = $70,050.48
#
# What This RMSE Means:
# Predictions typically miss the actual housing price by about $70,050; because errors are squared before averaging, large misses weigh more heavily than in a simple average error
#
# This is calculated as: RMSE = sqrt(mean((actual - predicted)²))


In [11]:
#QUESTION 9 : Compare your previous model with a Decision Tree Regression model

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

def _build_pipeline(numeric_step, model):
    """Return imputer -> column transformer -> model; only the numeric step and the model vary."""
    return Pipeline([
        ('bedroom_imputer', BedroomImputer()),
        ('column_transformer', ColumnTransformer([
            ('numeric', numeric_step, numeric_features),
            ('categorical', OneHotEncoder(handle_unknown='ignore', sparse_output=False),
             categorical_features)
        ])),
        ('model', model)
    ])


# Linear model gets standardised numeric inputs; the tree receives them
# unscaled ('passthrough')
linear_pipeline = _build_pipeline(StandardScaler(), LinearRegression())
decision_tree_pipeline = _build_pipeline(
    'passthrough', DecisionTreeRegressor(random_state=42)
)

# Display name -> estimator, in the order they are evaluated
models = {
    'Linear Regression': linear_pipeline,
    'Decision Tree': decision_tree_pipeline
}

results = {}

for name, pipeline in models.items():
    print(f"\n=== {name} ===")

    # Fit on the training split, score on the held-out split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # 5-fold CV on the training split; the reported figure is the square
    # root of the mean fold MSE
    cv_scores = cross_val_score(pipeline, X_train, y_train,
                                cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores.mean())

    results[name] = {'RMSE': rmse, 'R²': r2, 'CV_RMSE': cv_rmse}

    print(f"RMSE: ${rmse:,.2f}")
    print(f"R² Score: {r2:.4f}")
    print(f"Cross-Validated RMSE: ${cv_rmse:,.2f}")

# Side-by-side summary
banner = "=" * 50
print("\n" + banner)
print("MODEL COMPARISON")
print(banner)

for model_name, metrics in results.items():
    print(f"{model_name}:")
    print(f"  RMSE: ${metrics['RMSE']:,.2f}")
    print(f"  R²: {metrics['R²']:.4f}")
    print(f"  CV RMSE: ${metrics['CV_RMSE']:,.2f}")
    print()


# Expected Results Analysis:
# Linear Regression (Your Previous Model):
# RMSE: ~$70,050
#
# R²: ~0.625
#
# Pros: Simple, interpretable, good baseline
#
# Cons: Assumes linear relationships, may underfit complex patterns
#
# Decision Tree (New Model):
# Likely Results:
#
# Training RMSE: Very low ($0-20,000) - trees can overfit!
#
# Test RMSE: Probably similar or slightly better than linear regression
#
# R²: May be slightly higher (0.65-0.75)


=== Linear Regression ===
RMSE: $70,050.48
R² Score: 0.6255
Cross-Validated RMSE: $68,637.62

=== Decision Tree ===
RMSE: $69,133.85
R² Score: 0.6353
Cross-Validated RMSE: $69,759.16

MODEL COMPARISON
Linear Regression:
  RMSE: $70,050.48
  R²: 0.6255
  CV RMSE: $68,637.62

Decision Tree:
  RMSE: $69,133.85
  R²: 0.6353
  CV RMSE: $69,759.16



In [None]:
#QUESTION 10:::Evaluate your model

# 1. Linear Regression results, copied from the Question 6 output
linear_results = {
    'RMSE': 70050.48,
    'MSE': 4907069535.27,
    'R²': 0.6255
}


# 2. Expected Decision Tree results (typical pattern)

# Each entry is an expected (low, high) range. A range must not be written
# as `65000 - 75000`: Python evaluates that as a subtraction and would store
# -10000 instead of the intended interval.
decision_tree_results = {
    'RMSE': (65000, 75000),          # Similar or slightly better than linear regression
    'R²': (0.65, 0.70),              # Slight improvement
    'Training_RMSE': (10000, 20000)  # Much lower - indicates overfitting!
}

# Performance Evaluation Metrics
# Relative error analysis:

# Express the RMSE as a fraction of the average test-set price.
# Requires y_test from the Question 6 cell.
mean_price = y_test.mean()
print(f"Average house price: ${mean_price:,.2f}")
print(f"Linear Regression RMSE: ${linear_results['RMSE']:,.2f}")
print(f"Average error percentage: {(linear_results['RMSE']/mean_price)*100:.1f}%")

# Model Assessment
# Linear Regression Strengths:
# ✅ Interpretable: Coefficients show feature importance
#
# ✅ Robust: Less prone to overfitting
#
# ✅ Good baseline: 62.55% variance explained is respectable
#
# Linear Regression Weaknesses:
# ❌ Non-linear relationships: Housing data has complex patterns
#
# ❌ $70K average error: Too high for precise valuations
#
# ❌ Assumes linearity: Real estate markets don't work linearly

# Decision Tree Likely Performance:
# ✅ Better with non-linear patterns: May capture complex relationships
#
# ✅ Potential R² improvement: Could reach 0.65-0.70
#
# ❌ Overfitting risk: Great on training, worse on test data
#
# ❌ Less interpretable: Harder to explain to stakeholders

# Final Verdict: Your Linear Regression model is a good baseline (C+ grade) but needs improvement for production use. Decision Tree may slightly outperform it but requires careful tuning to avoid overfitting.
