# Read Dataset

## Author: Alvin Chong

## Date: 25-11-2025

### Learning Outcomes:

1. Understand how to load datasets directly from GitHub.

2. Learn how to use the `tarfile` module to read `.tgz` files and extract their contents.

3. Use the `keras.utils.get_file` function from TensorFlow to efficiently download and manage datasets.


#### 1. Load `.csv` Dataset from GitHub

In [2]:
import pandas as pd

# You must use a raw.githubusercontent.com link, so that pandas receives the actual CSV file content instead of a GitHub webpage.
url = "https://raw.githubusercontent.com/ageron/handson-ml/refs/heads/master/datasets/housing/housing.csv"

# Read the dataset straight from the URL (pandas downloads and parses it in one step)
df = pd.read_csv(url)

# Display the first row only (head(1) returns one row; use head(3) to see three)
df.head(1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY


#### 2. Load .tgz file

In [16]:
import urllib.request
import tarfile
from pathlib import Path
import pandas as pd

# Set download URL and destination folder
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"
dest = Path("data")                         # Local folder to store files
dest.mkdir(parents=True, exist_ok=True)     # Create the folder if it doesn't exist

# Download the .tgz file
tgz_path = dest / "housing.tgz"
urllib.request.urlretrieve(url, tgz_path)   # Download file from the website

# Extract the .tgz file
# Mode "r:*" lets tarfile detect the compression type automatically.
with tarfile.open(tgz_path, "r:*") as tar:
    if hasattr(tarfile, "data_filter"):
        # The archive comes from the network, so extract with the "data" filter:
        # it refuses members that would be written outside `dest`
        # (absolute paths, ".." components, links pointing elsewhere).
        tar.extractall(dest, filter="data")
    else:
        # Older Python without extraction filters: fall back to plain extraction.
        tar.extractall(dest)

# Read the CSV data
csv_path = dest / "housing.csv"
df = pd.read_csv(csv_path)                  # Load the extracted CSV file
df.head(1)                                  # Display the first row

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY


#### 3. Load Dataset using TensorFlow

This approach is commonly used in machine learning projects and large production codebases.

In [19]:
import tensorflow as tf
import zipfile, tarfile
from pathlib import Path
import pandas as pd


def load_file(url, dest="data"):
    """Download the archive at *url* into *dest*, extract it, and return the data folder.

    Args:
        url: Direct download link to an archive (e.g. a ``.tgz`` file). The
            last path segment of the URL is used as the local file name.
        dest: Folder (``str`` or ``Path``) used as the download location.
            It is created if it does not exist.

    Returns:
        A ``pathlib.Path`` pointing to the folder that holds the extracted
        files, so callers can do ``load_file(url) / "some_file.csv"``.
    """
    # Create destination folder if it doesn't exist
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)

    # Download the file using keras.utils.get_file
    local_path = Path(
        tf.keras.utils.get_file(
            fname=url.split("/")[-1],   # Use the file name from the URL
            origin=url,                 # Download from this URL
            cache_dir=str(dest),        # Save inside the chosen folder
            cache_subdir=".",           # Place directly in the folder
            extract=True,               # Auto-extract the archive after download
        )
    )

    # The value returned by get_file differs between Keras releases: some
    # return the path of the downloaded archive (contents extracted next to
    # it), others return the directory the archive was extracted into.
    # Handle both so the caller always receives the folder with the files.
    if local_path.is_dir():
        return local_path
    return local_path.parent


# URL of the compressed housing archive (.tgz)
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"

# Download and extract the archive; load_file returns the folder to read from
path2 = load_file(url)
# Load the extracted CSV from that folder and show the first row
df = pd.read_csv(path2 / "housing.csv")
df.head(1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
