In [5]:
import pandas as pd

In [6]:
import numpy as np

In [7]:
from sklearn.datasets import fetch_california_housing

In [8]:
# --- 1. FETCH DATA ---
# We use Scikit-Learn to fetch the data automatically.
# A status line is printed first, then fetch_california_housing() is called
# with its default arguments and the returned object is kept in `raw_data`
# for the following cells.
# NOTE(review): presumably the first call needs network access to download
# the dataset -- confirm before running this offline.
print("Fetching California Housing dataset...")
raw_data = fetch_california_housing()

Fetching California Housing dataset...


In [9]:
# Wrap the fetched arrays in a Pandas DataFrame for easier cleaning:
# one column per entry of `feature_names`, with the target appended as the
# last column under the name 'median_house_value'.
df = pd.DataFrame(
    data=raw_data.data,
    columns=raw_data.feature_names,
).assign(median_house_value=raw_data.target)

In [10]:
# --- 2. INSPECT DATA ---
# Show the frame's (rows, columns) shape, then a heading followed by the
# per-column count of missing entries.
print(f"Original shape: {df.shape}")
print("Checking for missing values...", df.isnull().sum(), sep="\n")

Original shape: (20640, 9)
Checking for missing values...
MedInc                0
HouseAge              0
AveRooms              0
AveBedrms             0
Population            0
AveOccup              0
Latitude              0
Longitude             0
median_house_value    0
dtype: int64


In [11]:
# --- 3. CLEANING ---
# Even though this specific dataset is usually clean,
# this is the standard code you would use for real-world data:

# A. Remove Duplicates
# Rows that are identical across every column are collapsed to their first
# occurrence. The result is re-bound to `df` instead of mutating the frame
# in place, so the cell stays a plain "input -> output" step and does not
# rely on `inplace=True`, which pandas discourages.
df = df.drop_duplicates()



In [12]:
# B. Handle Missing Values (Imputation)
# Fill missing numbers with the average (mean) of that column.
# `numeric_only=True` restricts the means to numeric columns, so this step
# keeps working if a text/categorical column is ever present (a plain
# `df.mean()` raises on such columns in recent pandas). Non-numeric columns
# are simply left untouched. The filled frame is re-bound to `df` rather
# than modified in place.
df = df.fillna(df.mean(numeric_only=True))

In [13]:
# C. Outlier Removal (Optional but recommended for Ridge)
# Drop the houses sitting at the $500,000 ceiling (often a data cap artifact).
# The target is expressed in units of 100k, so 5.0 corresponds to $500,000.
# The comparison is strict: rows whose target is exactly 5.0 are removed too.
df = df.loc[df['median_house_value'].lt(5.0)]

print(f"Shape after cleaning: {df.shape}")

Shape after cleaning: (19648, 9)


In [14]:
# --- 4. EXPORT ---
# Save to CSV so the next script can pick it up.
# The file name is a relative path, so it is resolved against the current
# working directory; index=False keeps the row index out of the file, so
# only the data columns are written.
df.to_csv('cleaned_data.csv', index=False)
print("✅ Data cleaned and saved to 'cleaned_data.csv'")

✅ Data cleaned and saved to 'cleaned_data.csv'
