# Download Data

In [20]:
import os
import tarfile
import requests

# Base URL of the repository that hosts the datasets; dataset-relative paths
# are appended to it verbatim, so it must keep its trailing slash.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"

# Local directory where downloads are stored. The default is the original
# machine-specific location; set the HOUSING_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
SAVE_DIR = os.environ.get(
    "HOUSING_DATA_DIR",
    "/home/mburu/Desktop/ML-30-abbas_ali/lessons_scikit/data",
)

def download_dataset(file_path, download_root=DOWNLOAD_ROOT, save_dir=SAVE_DIR, extract=False, timeout=30):
    """
    Downloads a file from a specified URL and optionally extracts it if it's a .tgz file.

    The body is streamed to disk in chunks (instead of being buffered in memory)
    and written to a temporary ``.part`` file that is moved into place only after
    the transfer succeeds, so a failed download never leaves a truncated file at
    the final path.

    Parameters:
    - file_path (str): The relative path to the file from DOWNLOAD_ROOT (e.g., 'datasets/housing/housing.tgz').
    - download_root (str): The root URL to download from.
    - save_dir (str): Directory to save the downloaded file.
    - extract (bool): If True, extracts the file if it's a .tgz.
    - timeout (float or tuple): Seconds to wait for the server (passed to ``requests.get``).

    Returns:
    - None. Download and extraction errors are reported with ``print`` rather than
      raised; other errors (e.g. ``OSError`` from the filesystem) still propagate.
    """
    url = download_root + file_path
    file_name = os.path.basename(file_path)
    save_path = os.path.join(save_dir, file_name)
    partial_path = save_path + ".part"

    try:
        # Ensure the save directory exists
        os.makedirs(save_dir, exist_ok=True)

        # Downloading the file
        print(f"Starting download from {url}...")
        try:
            # The context manager releases the connection even if streaming fails.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            file.write(chunk)
            # Only publish the file once it has been fully written.
            os.replace(partial_path, save_path)
        finally:
            # After a successful replace the partial file no longer exists;
            # after a failure this removes the incomplete download.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"Download completed: {save_path}")

        # Check file size
        file_size = os.path.getsize(save_path)
        print(f"Downloaded file size: {file_size} bytes")

        # Extract if needed and if the file is a .tgz
        if extract and file_name.endswith(".tgz"):
            print("Starting extraction...")
            with tarfile.open(save_path) as tar:
                if hasattr(tarfile, "data_filter"):
                    # The "data" filter rejects members that would escape save_dir
                    # and avoids the Python 3.12+ DeprecationWarning. Filter
                    # rejections are TarError subclasses, handled below.
                    tar.extractall(path=save_dir, filter="data")
                else:
                    # Older Python without extraction filters.
                    tar.extractall(path=save_dir)
            print("Extraction completed.")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
    except tarfile.TarError as e:
        print(f"Error extracting file: {e}")

# Example usage: fetch the housing archive and unpack it into the default save directory
download_dataset(file_path="datasets/housing/housing.tgz", extract=True)


Starting download from https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz...
Download completed: /home/mburu/Desktop/ML-30-abbas_ali/lessons_scikit/data/housing.tgz
Downloaded file size: 409488 bytes
Starting extraction...
Extraction completed.


  tar.extractall(path=save_dir)


## Load data

In [21]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """
    Reads ``housing.csv`` from the given directory with pandas.

    Parameters:
    - housing_path (str): Directory that contains ``housing.csv``. The default is
      bound to HOUSING_PATH when this cell is executed.
      NOTE(review): HOUSING_PATH is not defined in the cells visible here — confirm
      an earlier cell sets it, otherwise this cell fails on a fresh kernel.

    Returns:
    - The result of ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [22]:
# Load the dataset from the default location and preview its first five rows.
housing = load_housing_data()
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Data Visuals

SyntaxError: invalid syntax (2125240012.py, line 1)