In [8]:
import pandas as pd

# Use a raw.githubusercontent.com link so that pandas receives the actual CSV
# file content instead of the HTML of a GitHub web page.
url = "https://raw.githubusercontent.com/ageron/handson-ml/refs/heads/master/datasets/housing/housing.csv"

# Read the dataset straight from the URL into a DataFrame.
df = pd.read_csv(url)
# Last expression of the cell: display the first three rows.
df.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY


In [7]:
import urllib.request
import tarfile
from pathlib import Path

def _extract_tgz_members(tar, dest):
    """Extract every member of the open archive ``tar`` into ``dest``, refusing paths that escape it."""
    if hasattr(tarfile, "data_filter"):
        # Extraction filters exist on Python 3.12+ and on patched older
        # releases. The "data" filter rejects absolute paths, "../"
        # traversal and links that point outside the destination.
        tar.extractall(dest, filter="data")
        return

    # Older interpreters: validate every member ourselves before extracting.
    root = Path(dest).resolve()

    def _inside(candidate):
        resolved = candidate.resolve()
        return resolved == root or root in resolved.parents

    for member in tar.getmembers():
        target = root / member.name
        if not _inside(target):
            raise tarfile.TarError(
                f"Refusing to extract {member.name!r}: path escapes {root}"
            )
        # Symlink targets are relative to the link's own folder,
        # hard-link targets are relative to the archive root.
        if member.issym() and not _inside(target.parent / member.linkname):
            raise tarfile.TarError(
                f"Refusing to extract {member.name!r}: link target escapes {root}"
            )
        if member.islnk() and not _inside(root / member.linkname):
            raise tarfile.TarError(
                f"Refusing to extract {member.name!r}: link target escapes {root}"
            )

    tar.extractall(dest)


def load_tgz(url, dest="data"):  # dest: destination
    """
    Download and extract a .tgz dataset from a URL.

    Parameters
    ----------
    url : str
        The direct download URL to the .tgz file.
        This should be a *raw* link when downloading from GitHub
        (i.e., using raw.githubusercontent.com instead of a blob link).

    dest : str or Path, optional (default="data")
        Destination folder where the downloaded .tgz file will be saved
        (as "dataset.tgz") and extracted. The folder will be created
        automatically if it does not exist.

    Returns
    -------
    pathlib.Path
        ``dest`` as a Path; the extracted files live underneath it.

    Raises
    ------
    tarfile.TarError
        If the download is not a readable tar archive, or if a member of
        the archive would be written outside ``dest``.

    Download errors raised by ``urllib.request.urlretrieve`` are not caught
    and propagate to the caller.

    Examples
    --------
    url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"
    path = load_tgz(url)
    df = pd.read_csv(path / "housing.csv")
    df.head()
    """

    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)

    # Download the .tgz file
    tgz_path = dest / "dataset.tgz"
    urllib.request.urlretrieve(url, tgz_path)

    # Extract. "r:*" lets tarfile detect the compression on its own. The
    # archive comes from the network, so it is never extracted unfiltered.
    with tarfile.open(tgz_path, "r:*") as tar:
        _extract_tgz_members(tar, dest)

    return dest

# Download and unpack the housing archive, then read the CSV found in the
# folder that load_tgz returns.
url = "https://raw.githubusercontent.com/ageron/handson-ml/refs/heads/master/datasets/housing/housing.tgz"
path = load_tgz(url)  # folder the archive was extracted into
df = pd.read_csv(path / "housing.csv")
# Last expression of the cell: display the first three rows.
df.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY


In [6]:
import tensorflow as tf
import zipfile, tarfile
from pathlib import Path
import pandas as pd

def _extract_tar_archive(archive_path, dest):
    """Unpack the tar archive at ``archive_path`` into ``dest``, refusing members that escape it."""
    with tarfile.open(archive_path, "r:*") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters exist on Python 3.12+ and on patched older
            # releases. The "data" filter rejects absolute paths, "../"
            # traversal and links that point outside the destination.
            tar.extractall(dest, filter="data")
            return

        # Older interpreters: check every member path ourselves first.
        root = Path(dest).resolve()
        for member in tar.getmembers():
            candidates = [root / member.name]
            if member.issym():
                # Symlink targets are relative to the link's own folder.
                candidates.append((root / member.name).parent / member.linkname)
            elif member.islnk():
                # Hard-link targets are relative to the archive root.
                candidates.append(root / member.linkname)
            for candidate in candidates:
                resolved = candidate.resolve()
                if resolved != root and root not in resolved.parents:
                    raise tarfile.TarError(
                        f"Refusing to extract {member.name!r}: path escapes {root}"
                    )
        tar.extractall(dest)


def load_file(url, dest="data"):
    """
    Download a file with Keras' ``get_file`` and unpack it if it is an archive.

    Parameters
    ----------
    url : str
        Direct download URL. Only the path part of the URL is used to name
        the local copy, so a query string such as "?raw=true" is ignored.

    dest : str or Path, optional (default="data")
        Folder the file is downloaded into and, for archives, extracted
        into. It is created automatically if it does not exist.

    Returns
    -------
    pathlib.Path
        ``dest`` as a Path. ZIP archives (".zip") and tar archives (".tgz",
        ".tar.gz", or a ".gz" file that really is a tarball) are extracted
        there; any other file is simply left in place.

    Raises
    ------
    tarfile.TarError
        If a ".tgz"/".tar.gz" download is not a readable tar archive, or if
        a member of the archive would be written outside ``dest``.
    """
    from urllib.parse import urlparse  # local import: only needed here

    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)

    # Take the file name from the URL path only; a query string or fragment
    # would otherwise become part of the name and hide the archive suffix
    # from the checks below. Fall back to the plain split if the path is empty.
    fname = urlparse(url).path.rsplit("/", 1)[-1] or url.split("/")[-1]

    # cache_dir/cache_subdir are set so that the download is stored directly
    # in dest; extraction is handled below rather than by Keras.
    local_path = tf.keras.utils.get_file(
        fname=fname,
        origin=url,
        cache_dir=str(dest),
        cache_subdir=".",
        extract=False
    )
    local_path = Path(local_path)

    # ZIP
    if local_path.suffix == ".zip":
        with zipfile.ZipFile(local_path, "r") as z:
            z.extractall(dest)

    # TGZ / TAR.GZ: always treated as tarballs, so a corrupt one still raises.
    elif local_path.name.endswith((".tgz", ".tar.gz")):
        _extract_tar_archive(local_path, dest)

    # Any other ".gz" may be a single gzip-compressed file (e.g. "data.csv.gz");
    # only unpack it when it actually contains a tar archive.
    elif local_path.suffix == ".gz" and tarfile.is_tarfile(local_path):
        _extract_tar_archive(local_path, dest)

    return dest

# Fetch and unpack the same housing archive through load_file, then read the
# CSV found in the folder it returns.
url = "https://raw.githubusercontent.com/ageron/handson-ml/refs/heads/master/datasets/housing/housing.tgz"
path2 = load_file(url)  # folder the archive was downloaded and extracted into
df = pd.read_csv(path2 / "housing.csv")
# Last expression of the cell: display the first three rows.
df.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
