In [1]:
from typing import List
import requests
import re
from bs4 import BeautifulSoup

def get_table_from_link(url: str, class_: str, timeout: float = 30.0) -> list:
    """
    Fetch a web page and return the table cells that carry a given CSS class.

    Parameters
    ----------
    url : str
        The URL of the web page to scrape.
    class_ : str
        The CSS class name to search for within table cells.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Defaults to 30.

    Returns
    -------
    list
        The matching ``<td>`` elements as BeautifulSoup Tag objects (not
        plain strings); empty when no cell has the requested class.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    ValueError
        If the page contains no element with ``id="list"``.

    Notes
    -----
    The lookup is restricted to the element whose id is "list"; only ``<td>``
    descendants of that element are searched.
    """
    page = requests.get(url, timeout=timeout)
    # Fail early on error pages instead of parsing them as if they were listings.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    listing = soup.find(id="list")
    if listing is None:
        # soup.find returns None when nothing matches; report that explicitly
        # rather than letting the next call fail with an AttributeError on None.
        raise ValueError(f'No element with id="list" found at {url}')

    return listing.find_all("td", class_=class_)

def find_data_storage(url: str, pattern: str) -> float:
    """
    Sum the file sizes listed on a web page and return the total in megabytes.

    Parameters
    ----------
    url : str
        The URL of the web page containing size information.
    pattern : str
        Regex applied with ``re.match`` (i.e. anchored at the start) to the
        text of every size cell; only matching cells are counted.

    Returns
    -------
    float
        Sum of the matched values multiplied by 0.001024.

    Raises
    ------
    ValueError
        If a matching cell's text does not start with a number followed by a
        space (the part before the first space is passed to ``float``).

    Notes
    -----
    Cells are the ``<td class="size">`` elements returned by
    ``get_table_from_link``. For each matching cell the text before the first
    space is parsed as a number. The factor 0.001024 equals 1024 / 1,000,000,
    i.e. KiB -> bytes -> MB.

    NOTE(review): the unit suffix after the number is never inspected, so the
    result is only correct if every matched size is expressed in KiB -
    confirm against the listing format.
    """
    size_cells = get_table_from_link(url, class_="size")

    # Compile once up front instead of on every iteration.
    size_regex = re.compile(pattern)

    total_storage = 0.0
    for cell in size_cells:
        cell_text = cell.text
        if size_regex.match(cell_text):
            total_storage += float(cell_text.split(" ")[0])

    return total_storage * 0.001024  # converting to MB


def find_tiff_url(url: str, pattern: str) -> List[str]:
    """
    Collect the links on a listing page whose href matches a regex.

    Parameters
    ----------
    url : str
        The base URL of the web page to scrape. Matching hrefs are appended
        to it verbatim, so it should end with "/" when hrefs are relative.
    pattern : str
        Regex applied with ``re.match`` (anchored at the start) to each href.

    Returns
    -------
    List[str]
        ``url + href`` for every matching href, in page order. Empty when
        nothing matches.

    Notes
    -----
    Only ``<td class="link">`` cells (as returned by ``get_table_from_link``)
    are inspected, and within each cell only the first descendant element
    that has an ``href`` attribute is considered. Cells without any such
    element are skipped.
    """
    link_cells = get_table_from_link(url, class_="link")

    # Compile once up front instead of on every iteration.
    href_regex = re.compile(pattern)

    all_url = []
    for cell in link_cells:
        # find() returns the first match or None, so a cell without a link
        # no longer raises IndexError.
        anchor = cell.find(href=True)
        if anchor is None:
            continue

        href = anchor["href"]
        if href_regex.match(href):
            all_url.append(url + href)

    return all_url

# Root directory listing to scrape; hrefs found on this page are appended to this URL.
url = "https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/"
# Keep only hrefs that start with four digits and a slash (e.g. "1981/"), i.e. the per-year sub-directories.
year_urls = find_tiff_url(url, pattern = r"\d{4}\/")

# get links to all TIFF files
# for year in year_urls:
#     data_urls = find_tiff_url(year, pattern = r"chirps-.*")
#     print(data_urls)


# # get storage requirements for all tiff files
# total_storage = 0
# for year in year_urls:
#     # the storage output from this function is already in MB
#     total_storage += find_data_storage(url = year, pattern = r"\d+\..+")

# total_storage = total_storage * 0.001 # converting to GB
# print(f"Total Storage required by the CHIRPS zip files: {total_storage:.2f} GB")

# print("The above storage is taken up by compressed files, for a better estimate, we use the conversion factor of 12.735(obtained from downloading one file)")
# print(f"Total true storage requirement for the CHIRPS dataset: {(total_storage * 12.735):.2f} GB")

In [2]:
# get links to all TIFF files
# data_urls is a list of lists: one inner list of file URLs per entry of year_urls,
# in the same order (45 years worth of data, 1981-2025, at the time of writing)
# where index 0 has all data for 1981 and index 1 has 1982 ... index 44 has 2025
data_urls = []
for year in year_urls:
    # one listing-page request per year; keeps hrefs that start with "chirps-"
    data_urls.append(find_tiff_url(year, pattern = r"chirps-.*"))

In [3]:
# Sanity check: show the first file URL of the first year.
data_urls[0][0]

'https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/1981/chirps-v2.0.1981.01.01.tif.gz'

In [4]:
# unzip file
import gzip

def unzip_file(url: str, timeout: float = 60.0) -> bytes:
    """
    Download a gzip-compressed object from a URL and return it decompressed.

    Parameters
    -----------
    url : str
        The url of the gzip-compressed source file
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Defaults to 60.

    Returns
    -------
    bytes
        Decompressed byte object

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    gzip.BadGzipFile
        If the downloaded payload is not valid gzip data.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP failures directly; previously a non-200 response fell
    # through to the return statement with the result variable never assigned.
    response.raise_for_status()

    return gzip.decompress(response.content)

# Try the download on a single file: the second URL of the first year's list.
url = data_urls[0][1]
# Year path segment; everything after it in the URL is the file name.
y = "1981/"

# getting file name from url
file_name = url.split(y)[1].replace(".gz", "")

# I don't need to save this, I could just as well pass a decompressed byte object to downstream tasks
decompressed_file = unzip_file(url)
# file_name is a bare name, so the file is written relative to the current working directory
with open(file_name, "wb") as f:
    f.write(decompressed_file)

In [None]:
# clipping tiff to Nigeria specific bounding box
import rasterio
from rasterio.windows import from_bounds
from rasterio.enums import Resampling
from rasterio.crs import CRS
from rasterio.warp import transform_bounds

def clip_to_cog(input_tiff: str, clipped_tiff: str, bbox: list, bbox_crs: str = "EPSG:4326"):
    """
    Clips a GeoTIFF to a specified bounding box, handling differing CRS,
    and writes the result as a tiled, deflate-compressed GeoTIFF with
    internal overviews.

    Any exception raised while reading or writing is caught and printed;
    nothing is re-raised and the function always returns None.

    Args:
        input_tiff: Path to the source GeoTIFF file.
        clipped_tiff: Path for the output clipped file. The parent directory
                      is not created here.
        bbox: A list representing the bounding box in the format
              [min_x, min_y, max_x, max_y].
        bbox_crs: The Coordinate Reference System of the provided bounding box,
                  defaulting to WGS84 ('EPSG:4326').
    """
    try:
        with rasterio.open(input_tiff) as src:

            # Parse the bounding-box CRS once and compare with the raster's CRS
            src_crs = src.crs
            aoi_crs = CRS.from_string(bbox_crs)

            # Reproject the bounding box if the CRS are different
            if aoi_crs != src_crs:
                reprojected_bbox = list(transform_bounds(aoi_crs, src_crs, *bbox))
            else:
                reprojected_bbox = bbox

            # Pixel window covering the bounding box, and the pixels inside it
            window = from_bounds(*reprojected_bbox, src.transform)
            data = src.read(window=window)
            window_transform = src.window_transform(window)

            profile = src.profile.copy()
            profile.update({
                # Take the size from the array actually read: the window built
                # from bounds is not guaranteed to have whole-pixel dimensions,
                # and the written dataset must match the data exactly.
                'height': data.shape[-2],
                'width': data.shape[-1],
                'transform': window_transform,
                'tiled': True,
                'blockxsize': 512,
                'blockysize': 512,
                'compress': 'deflate'
            })

            # write COG
            with rasterio.open(clipped_tiff, 'w', **profile) as dst:
                dst.write(data)

                # Reduced-resolution overviews, averaged from the full-resolution data
                factors = [2, 4, 8, 16]
                dst.build_overviews(factors, Resampling.average)
                dst.update_tags(ns='rio_overview', resampling='average')
                print(f"File written to {clipped_tiff}")
    except Exception as e:
        # Best-effort: report the failure and let a surrounding batch loop continue
        print(f"An error has occurred: {e}")

In [None]:
# iterate through all the years, and convert to COGS
import os

directory = "nigeria_tifs/"
cog_directory = os.path.join(directory, "cogs")

# Both output locations must exist before anything is written into them.
os.makedirs(cog_directory, exist_ok=True)

# Area of interest [min_x, min_y, max_x, max_y] and its CRS; constant for every file.
bbox_aoi = [2.316388, 3.837669, 15.126447, 14.153350]
bbox_crs = "EPSG:4326"

# data_urls is the list of all urls as described above
for data in data_urls:
    for url in data:
        # getting file name from url: last path segment without the ".gz" suffix
        # (no longer depends on the list index lining up with the year)
        file_name = url.rsplit("/", 1)[-1]
        if file_name.endswith(".gz"):
            file_name = file_name[:-len(".gz")]

        decompressed_file = unzip_file(url)

        # full path of the decompressed source tif; written inside `directory`
        # rather than the current working directory
        path = os.path.join(directory, file_name)
        with open(path, "wb") as f:
            f.write(decompressed_file)

        clipped_tiff = os.path.join(cog_directory, f"nigeria-cog-{file_name}")

        clip_to_cog(path, clipped_tiff, bbox_aoi, bbox_crs)

File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.01.tif
File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.02.tif
File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.03.tif
File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.04.tif
File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.05.tif
File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.06.tif
File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.07.tif
File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.08.tif
File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.09.tif
File written to nigeria_tifs//cogs/nigeria-cog-chirps-v2.0.1981.01.10.tif


KeyboardInterrupt: 