In [4]:
#QUESTION 1 & 2
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('housing/housing.csv')

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Handle missing values in total_bedrooms.
# Each missing value is replaced by the median total_bedrooms of the rows that
# share its ocean_proximity category (a per-group median rather than a single
# global median).
bedroom_median_by_location = df.groupby('ocean_proximity')['total_bedrooms'].median()

# Vectorised imputation: look up every row's group median via map(), then use
# it only where total_bedrooms is missing. This yields the same values as a
# row-wise apply(axis=1) without running a Python lambda for each row.
df['total_bedrooms'] = df['total_bedrooms'].fillna(
    df['ocean_proximity'].map(bedroom_median_by_location)
)

# Verify no more missing values
print("\nMissing values after imputation:")
print(df.isnull().sum())

# Save the cleaned dataset
df.to_csv('housing/cleaned_housing_data.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_housing_data.csv'")

Missing values per column:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

Missing values after imputation:
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

Cleaned dataset saved as 'cleaned_housing_data.csv'


In [5]:
#QUESTION 4, HANDLE THE NON-NUMERICAL FIELD
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Expand ocean_proximity into one indicator column per category
ohe_df = pd.get_dummies(df['ocean_proximity'], prefix='ocean')

# Replace the categorical column with its indicator columns
df_encoded = pd.concat(
    [df.drop(columns='ocean_proximity'), ohe_df],
    axis='columns',
)

print("One-Hot Encoding Results:")
print(ohe_df.head(5))
print(f"\nNew columns: {ohe_df.columns.tolist()}")

One-Hot Encoding Results:
   ocean_<1H OCEAN  ocean_INLAND  ocean_ISLAND  ocean_NEAR BAY  \
0            False         False         False            True   
1            False         False         False            True   
2            False         False         False            True   
3            False         False         False            True   
4            False         False         False            True   

   ocean_NEAR OCEAN  
0             False  
1             False  
2             False  
3             False  
4             False  

New columns: ['ocean_<1H OCEAN', 'ocean_INLAND', 'ocean_ISLAND', 'ocean_NEAR BAY', 'ocean_NEAR OCEAN']


In [9]:
#QUESTION 5 :: Use Scikit-learn Pipeline class for transformation

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd


# Custom transformer: fill missing total_bedrooms from per-category medians
class BedroomImputer(BaseEstimator, TransformerMixin):
    """Impute missing ``total_bedrooms`` using the median of the row's
    ``ocean_proximity`` group.

    The transformer has no hyper-parameters, so no ``__init__`` is defined.
    Learned state is created only in ``fit`` (trailing-underscore attributes),
    so an unfitted instance carries no fitted attributes.

    Attributes set by ``fit``
    -------------------------
    bedroom_medians_ : pandas.Series
        Median ``total_bedrooms`` indexed by ``ocean_proximity`` value.
    overall_median_ : float
        Median ``total_bedrooms`` over all rows seen in ``fit``; used as a
        fallback when a group median is unavailable.
    """

    def fit(self, X, y=None):
        """Learn the per-category and overall medians from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``ocean_proximity`` and ``total_bedrooms``.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.bedroom_medians_ = X.groupby('ocean_proximity')['total_bedrooms'].median()
        self.overall_median_ = X['total_bedrooms'].median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``total_bedrooms`` filled in.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``ocean_proximity`` and ``total_bedrooms``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; the input frame is not modified.
        """
        X_copy = X.copy()
        # map() yields NaN for categories that were not present during fit
        # (or whose learned median is NaN) instead of raising KeyError.
        group_median = X_copy['ocean_proximity'].map(self.bedroom_medians_)
        X_copy['total_bedrooms'] = (
            X_copy['total_bedrooms']
            .fillna(group_median)
            # Fallback for rows whose group median is unavailable.
            .fillna(self.overall_median_)
        )
        return X_copy


# Column groups routed to the scaler and the encoder respectively
numeric_features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]
categorical_features = ['ocean_proximity']

# Preprocessing: impute bedrooms first, then scale the numeric columns and
# one-hot encode the categorical column
preprocessing_pipeline = Pipeline(steps=[
    ('bedroom_imputer', BedroomImputer()),
    ('column_transformer', ColumnTransformer(transformers=[
        # Standardise every numeric column
        ('numeric', StandardScaler(), numeric_features),
        # Dense one-hot columns; categories unseen at fit time are ignored
        ('categorical',
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         categorical_features),
    ])),
])

# Demonstration: fit the preprocessing on a training split and reuse it on
# the held-out split
if __name__ == "__main__":
    from sklearn.model_selection import train_test_split

    # Read the raw housing data
    df = pd.read_csv('housing/housing.csv')

    # 80/20 split with a fixed seed
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
    )

    # Parameters are learned from the training split only...
    X_train_processed = preprocessing_pipeline.fit_transform(train_df)

    # ...and then applied unchanged to the test split
    X_test_processed = preprocessing_pipeline.transform(test_df)

    print("Training data shape after preprocessing:", X_train_processed.shape)
    print("Test data shape after preprocessing:", X_test_processed.shape)

    # X_train_processed and X_test_processed are now ready for modeling

Training data shape after preprocessing: (16512, 13)
Test data shape after preprocessing: (4128, 13)


In [8]:
#QUESTION 6 :: Train a Linear Regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# clone() is only needed by this cell, so it is imported here
from sklearn.base import clone

# Load the original raw data
df = pd.read_csv('housing/housing.csv')

# Separate features and target
X = df.drop('median_house_value', axis=1)  # Features
y = df['median_house_value']  # Target

# Split the data (using original data)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create the full pipeline with Linear Regression.
# The preprocessing steps are taken from a clone of preprocessing_pipeline
# rather than re-typed here, so the two definitions cannot drift apart.
# clone() returns unfitted copies, so fitting full_pipeline leaves
# preprocessing_pipeline itself untouched. Step names are unchanged:
# 'bedroom_imputer', 'column_transformer', 'linear_regression'.
full_pipeline = Pipeline([
    *clone(preprocessing_pipeline).steps,

    # Add Linear Regression model
    ('linear_regression', LinearRegression())
])

# Train the model
print("Training Linear Regression model...")
full_pipeline.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = full_pipeline.predict(X_test)

# Evaluate the model on the held-out split
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Squared Error (MSE): {mse:,.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")
print(f"R² Score: {r2:.4f}")

# Optional: Show the first ten predictions next to the actual values
print("\nSample predictions vs actual:")
results_df = pd.DataFrame({
    'Actual': y_test.values[:10],
    'Predicted': y_pred[:10],
    'Difference': y_test.values[:10] - y_pred[:10]
})
print(results_df.round(2))

Training Linear Regression model...

Model Evaluation:
Mean Squared Error (MSE): 4,907,069,535.27
Root Mean Squared Error (RMSE): 70,050.48
R² Score: 0.6255

Sample predictions vs actual:
     Actual  Predicted  Difference
0   47700.0   53130.34    -5430.34
1   45800.0  123300.23   -77500.23
2  500001.0  254769.85   245231.15
3  218600.0  268413.59   -49813.59
4  278000.0  265647.54    12352.46
5  158700.0  138681.20    20018.80
6  198200.0  291076.58   -92876.58
7  157500.0  228676.03   -71176.03
8  340000.0  255787.26    84212.74
9  446600.0  408335.02    38264.98
