In [4]:
#QUESTION 1 & 2
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('housing/housing.csv')

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Handle missing values in total_bedrooms.
# Each gap is filled with the median total_bedrooms of the rows that share
# the same ocean_proximity category.

# Median total_bedrooms for each ocean_proximity category (index = category)
bedroom_median_by_location = df.groupby('ocean_proximity')['total_bedrooms'].median()

# Fallback for rows whose category is missing or has no observed bedrooms;
# computed before the column is overwritten below.
overall_bedroom_median = df['total_bedrooms'].median()

# Impute missing values (vectorized):
# - map() turns each row's category into that category's median,
# - the first fillna() only touches rows where total_bedrooms is NaN,
# - the second fillna() catches anything the per-category lookup left as NaN.
df['total_bedrooms'] = (
    df['total_bedrooms']
    .fillna(df['ocean_proximity'].map(bedroom_median_by_location))
    .fillna(overall_bedroom_median)
)

# Verify no more missing values
print("\nMissing values after imputation:")
print(df.isnull().sum())

# Save the cleaned dataset (index=False keeps the row index out of the CSV)
df.to_csv('housing/cleaned_housing_data.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_housing_data.csv'")

Missing values per column:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

Missing values after imputation:
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

Cleaned dataset saved as 'cleaned_housing_data.csv'


In [5]:
#QUESTION 4, HANDLE THE NON-NUMERICAL FIELD
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Build one indicator column per ocean_proximity category; prefix='ocean'
# names them "ocean_<category>".
ohe_df = pd.get_dummies(df['ocean_proximity'], prefix='ocean')

# Replace the categorical column with its indicator columns. Both frames
# share df's row index, so an index-aligned join appends them side by side.
df_encoded = df.drop(columns=['ocean_proximity']).join(ohe_df)

# Show a preview of the encoded columns and list their names
print("One-Hot Encoding Results:")
print(ohe_df.head())
print(f"\nNew columns: {list(ohe_df.columns)}")

One-Hot Encoding Results:
   ocean_<1H OCEAN  ocean_INLAND  ocean_ISLAND  ocean_NEAR BAY  \
0            False         False         False            True   
1            False         False         False            True   
2            False         False         False            True   
3            False         False         False            True   
4            False         False         False            True   

   ocean_NEAR OCEAN  
0             False  
1             False  
2             False  
3             False  
4             False  

New columns: ['ocean_<1H OCEAN', 'ocean_INLAND', 'ocean_ISLAND', 'ocean_NEAR BAY', 'ocean_NEAR OCEAN']
