In [None]:
# Install the third-party packages this notebook imports (numpy, pandas,
# matplotlib, scikit-learn). `-q` keeps pip's output quiet.
# NOTE(review): versions are unpinned, so results may vary across environments.
%pip install -q numpy pandas matplotlib scikit-learn

TASK TWO - Dataset Collection & Preparation

Load dataset into a DataFrame

In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the California housing bundle from scikit-learn.
housing = fetch_california_housing()

# Build the feature table and append the target as the final column
# "MedHouseVal" in a single expression.
df = pd.DataFrame(data=housing.data, columns=housing.feature_names).assign(
    MedHouseVal=housing.target
)

# Preview the first rows as the cell output.
df.head()

Save a local copy (counts as “dataset collection”)

In [None]:
import os

# Name the output location once so the directory creation, the write and
# the confirmation message all refer to the same path.
out_dir = "../data"
out_file = f"{out_dir}/housing.csv"

# Create the directory if it is missing (no error when it already exists),
# then write the frame without the row index.
os.makedirs(out_dir, exist_ok=True)
df.to_csv(out_file, index=False)
print(f"Saved to {out_file}")

Check shape + columns

In [None]:
# Report the (rows, columns) tuple, then the column index.
print(f"Shape: {df.shape}")
print(df.columns)

Check data types and non-null counts

In [None]:
# Print a concise summary of the frame: each column's dtype and non-null
# count, plus overall memory usage.
df.info()

Check missing values

In [None]:
# Count missing entries per column (isna is the method isnull aliases).
df.isna().sum()

Check duplicate rows

In [None]:
# Count fully duplicated rows: duplicated() flags each row that repeats an
# earlier one, and summing the boolean flags gives the number of repeats.
df.duplicated().sum()

Quick summary statistics

In [None]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().transpose()

Outlier scan (1st and 99th percentiles of each column)

In [None]:
# 1st and 99th percentiles of every column, one row per column, to show
# where the extreme tails of each variable sit.
tail_quantiles = [0.01, 0.99]
df.quantile(q=tail_quantiles).transpose()

Target distribution (MedHouseVal)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the target, drawn on an explicit Axes rather than through
# the implicit "current figure".
fig, ax = plt.subplots()
df["MedHouseVal"].hist(bins=30, ax=ax)
ax.set_title("Distribution of Median House Value")
ax.set_xlabel("MedHouseVal")
ax.set_ylabel("Count")
plt.show()

Correlation heatmap

In [None]:
import numpy as np

# Pairwise correlation between all numeric columns (features and target).
corr = df.corr(numeric_only=True)

# Correlations are signed values in [-1, 1]. Pin the colour scale to that
# range and use a diverging colormap so that zero sits at the neutral
# midpoint, positive and negative correlations get visibly different hues,
# and colours do not shift with whatever min/max the data happens to have.
fig, ax = plt.subplots(figsize=(10, 6))
im = ax.imshow(corr, aspect="auto", cmap="coolwarm", vmin=-1, vmax=1)
fig.colorbar(im, ax=ax, label="Correlation coefficient")

# Label both axes with the column names; rotate x labels so they don't overlap.
ticks = range(len(corr.columns))
ax.set_xticks(ticks)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(ticks)
ax.set_yticklabels(corr.columns)
ax.set_title("Feature Correlation Matrix")
fig.tight_layout()
plt.show()

Relationship checks (scatter plots of selected features vs. MedHouseVal)

In [None]:
# Features to compare against the target, one scatter plot each.
features_to_plot = ["MedInc", "HouseAge", "AveRooms", "Population", "Latitude", "Longitude"]
target_values = df["MedHouseVal"]  # looked up once, reused for every plot

for col in features_to_plot:
    # One figure per feature; small markers (s=5) keep dense regions readable.
    fig, ax = plt.subplots()
    ax.scatter(df[col], target_values, s=5)
    ax.set_title(f"{col} vs MedHouseVal")
    ax.set_xlabel(col)
    ax.set_ylabel("MedHouseVal")
    plt.show()

Identify strongest correlations with target

In [None]:
# Pull the target's column out of the correlation matrix, then order it from
# most positive to most negative. The ordering is signed, so strong negative
# correlations end up at the bottom, and the first entry is the target's
# correlation with itself.
target_corr_column = corr.loc[:, "MedHouseVal"]
corr_target = target_corr_column.sort_values(ascending=False)
corr_target