In [3]:
from typing import List
import requests
import re
from bs4 import BeautifulSoup

def get_table_from_link(url: str, class_: str) -> List[str]:
    """
    Extract table data from a web page by scraping elements with a specific CSS class.
    
    Parameters
    ----------
    url : str
        The URL of the web page to scrape.
    class_ : str
        The CSS class name to search for within table cells.
        
    Returns
    -------
    List[str]
        A list of BeautifulSoup Tag objects containing the matched table cells.
        
    Notes
    -----
    This function assumes the target table has an id="list" attribute.
    It searches for <td> elements within that table matching the specified class.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    table_ = soup.find(id = "list")
    list_ = table_.find_all("td", class_=class_)
    return list_

def find_data_storage(url: str, pattern: str) -> float:
    """
    Calculate total storage requirements from size data scraped from a web page.
    
    Parameters
    ----------
    url : str
        The URL of the web page containing size information.
    pattern : str
        Regex pattern parameter (currently unused - function uses hardcoded pattern).
        
    Returns
    -------
    float
        Total storage size converted to megabytes (MB).
        
    Notes
    -----
    The function searches for table cells with class="size", extracts numeric values
    from text matching the pattern numbers with decimal points, and
    sums them. The conversion factor 0.001024 is applied, suggesting conversion
    from KiB to MB using binary conversion (1024 bytes per KiB, then /1000).
    """
    storage_list = get_table_from_link(url, class_="size")
    
    total_storage = 0
    for itr in storage_list:
        pattern = re.compile(pattern)
        if pattern.match(itr.text):
            storage_per_file = float(itr.text.split(" ")[0])
            total_storage += storage_per_file

    return total_storage * 0.001024 # converting to MB


def find_tiff_url(url: str, pattern: str) -> List[str]:
    """
    Extract and construct URLs matching a specified pattern from a web page.
    
    Parameters
    ----------
    url : str
        The base URL of the web page to scrape.
    pattern : str
        Regex pattern to match against href attributes in links.
        
    Returns
    -------
    List[str]
        A list of complete URLs constructed by combining the base URL
        with matching href values.
        
    Notes
    -----
    The function searches for table cells with class="link", extracts href
    attributes from anchor tags within those cells, and filters them using
    the provided regex pattern. Complete URLs are formed by concatenating
    the base URL with the matching href values.
    
    Assumes each link cell contains at least one anchor tag with an href attribute.
    """
    links = get_table_from_link(url, class_ = "link")

    all_url = []
    for link in links:
        temp_url = link.find_all(href = True)[0]['href']
        pattern = re.compile(pattern)
        if pattern.match(temp_url):
            all_url.append(url + temp_url)

    return all_url

url = "https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/"
year_urls = find_tiff_url(url, pattern = r"\d{4}\/")

# get links to all TIFF files
# for year in year_urls:
#     data_urls = find_tiff_url(year, pattern = r"chirps-.*")
#     print(data_urls)


# # get storage requirements for all tiff files
# total_storage = 0
# for year in year_urls:
#     # the storage output from this function is already in MB
#     total_storage += find_data_storage(url = year, pattern = r"\d+\..+")

# total_storage = total_storage * 0.001 # converting to GB
# print(f"Total Storage required by the CHIRPS zip files: {total_storage:.2f} GB")

# print("The above storage is taken up by compressed files, for a better estimate, we use the conversion factor of 12.735(obtained from downloading one file)")
# print(f"Total true storage requirement for the CHIRPS dataset: {(total_storage * 12.735):.2f} GB")

In [4]:
# get links to all TIFF files
# data_urls is a list of a list with 45 years worth of data from 1981-2025
# where index 0 has all data for 1981 and index 1 has 1982 ... index 44 has 2025
# converting to a list of dicts to make it easy to work with downstream
data_urls = []
for i, year in enumerate(year_urls):
    # urls per year
    urls = find_tiff_url(year, pattern = r"chirps-.*")
    data_urls.append({"year": str(i + 1981), "urls": urls})
    

In [5]:
# unzip file
import gzip

def unzip_file(url: str) -> bytes:
    """
    Opens an object at a given url, and returns a decompressed byte object

    Parameters
    -----------
    url : str
        The base url to the source file
    
    Returns
    -------
    bytes
        Decompressed byte object
    """
    unzipped_file = requests.get(url) 
    if unzipped_file.status_code == 200:
        decompressed_file = gzip.decompress(unzipped_file.content)
    
    return decompressed_file

In [6]:
# clipping tiff to Nigeria specific bounding box
import rasterio
from rasterio.windows import from_bounds
from rasterio.enums import Resampling
from rasterio.crs import CRS
from rasterio.warp import transform_bounds

def clip_to_cog(input_tiff: str, clipped_tiff: str, bbox: list, bbox_crs: str):
    """
    Clips a GeoTIFF to a specified bounding box, handling differing CRS,
    and saves it as a Cloud-Optimized GeoTIFF (COG).

    Args:
        input_tiff: Path to the source GeoTIFF file.
        clipped_tiff: Path for the output clipped COG file.
        bbox: A list representing the bounding box in the format
              [min_x, min_y, max_x, max_y].
        bbox_crs: The Coordinate Reference System of the provided bounding box,
                  defaulting to WGS84 ('EPSG:4326').
    """
    try:
        with rasterio.open(input_tiff) as src:
        
            # Get the CRS of the source raster
            src_crs = src.crs
            
            # Reproject the bounding box if the CRS are different
            if CRS.from_string(bbox_crs) != src_crs:
                left, bottom, right, top = transform_bounds(
                    CRS.from_string(bbox_crs),
                    src_crs,
                    *bbox
                )
                reprojected_bbox = [left, bottom, right, top]
            else:
                reprojected_bbox = bbox
        
        
            window = from_bounds(*reprojected_bbox, src.transform)
            data = src.read(window=window)
            window_transform = src.window_transform(window)

            profile = src.profile.copy()
            profile.update({
                'height': window.height, 
                'width': window.width, 
                'transform': window_transform,
                'tiled': True, 
                'blockxsize': 512, 
                'blockysize': 512,
                'compress': 'deflate'
            })

            # write COG
            with rasterio.open(clipped_tiff, 'w', **profile) as dst:
                dst.write(data)

                factors =  [2, 4, 8, 16]
                dst.build_overviews(factors, Resampling.average)
                dst.update_tags(ns='rio_overview', resampling='average')
    except Exception as e:
        print(f"An error has occurred: {e}")

In [8]:
def decompress_convert_to_cog(work_item: dict, directory: str):
    """
    Download, decompress, and convert a single CHIRPS rainfall data file to Cloud Optimized GeoTIFF (COG) format.
    
    This function processes one rainfall data file by downloading it from a URL, decompressing the .gz file,
    writing it to disk, and then clipping it to Nigeria's bounding box before converting to COG format.
    
    Parameters
    ----------
    work_item : dict
        Dictionary containing file processing information with the following keys:
        - 'url' : str
            Full URL to the .tif.gz file to be downloaded and processed
        - 'year' : str
            Year string (e.g., '1981') used for filename extraction from URL path
    directory : str
        Base directory path where the processed files will be saved. Should end with '/'.
        The function will save the intermediate .tif file in this directory and the final
        COG file in the 'cogs/' subdirectory.
    
    Returns
    -------
    None
        This function does not return any value. It performs file I/O operations and
        creates processed files on disk.
    
    Note
    ----
    The Nigeria bounding box coordinates are hardcoded as:
    [2.316388, 3.837669, 15.126447, 14.153350] in EPSG:4326 CRS.
    """
    url = work_item['url']
    year = work_item['year']
    year_dir = str(year) + "/"
    
    # getting file name from url
    file_name = url.split(year_dir)[1].replace(".gz", "")
    decompressed_file = unzip_file(work_item['url'])
    
    # full path of the output tif files
    full_path_to_file = directory + file_name
    
    with open(full_path_to_file, "wb") as f:
        f.write(decompressed_file)
    
    clipped_tiff = f"{directory}cogs/" + f"nigeria-cog-{file_name}" 
    bbox_aoi = [2.316388, 3.837669, 15.126447, 14.153350]
    bbox_crs = "EPSG:4326"
    clip_to_cog(full_path_to_file, clipped_tiff, bbox_aoi, bbox_crs)    



# iterate through all the years, and convert to COGS
directory = "nigeria_tifs/"

# for parallel workflow, convert to a flat list from the nested data_urls list
work_items = []
for data in data_urls:
    for url in data['urls']:
        work_items.append({"year": data['year'], "url": url})


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm

failed_files = []

with ThreadPoolExecutor(max_workers=20) as executor:
    # Create futures and map them to work items
    future_to_item = {}
    for item in work_items:
        future = executor.submit(decompress_convert_to_cog, item, directory)
        future_to_item[future] = item
    
    for future in tqdm.tqdm(as_completed(future_to_item.keys()), total=len(future_to_item),  desc="Processing files"):
        work_item = future_to_item[future]
        try:
            future.result() 
        except Exception as e:
            failed_files.append(work_item)
            print(f"Failed: {work_item['url']} - Error: {str(e)}")

print(f"\nCompleted! {len(failed_files)} files failed out of {len(work_items)} total")

Processing files:  61%|██████    | 9980/16314 [24:27<116:15:43, 66.08s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2008/chirps-v2.0.2008.05.02.tif.gz - Error: [SYS] unknown error (_ssl.c:2559)
Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2008/chirps-v2.0.2008.05.11.tif.gz - Error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2008/chirps-v2.0.2008.05.12.tif.gz - Error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2008/chirps-v2.0.2008.05.10.tif.gz - Error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Processing files:  66%|██████▌   | 10686/16314 [27:14<21:19,  4.40it/s]   

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2008/chirps-v2.0.2008.05.14.tif.gz - Error: HTTPSConnectionPool(host='data.chc.ucsb.edu', port=443): Read timed out. (read timeout=None)


Processing files:  66%|██████▌   | 10706/16314 [27:18<15:40,  5.96it/s]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2008/chirps-v2.0.2008.05.13.tif.gz - Error: HTTPSConnectionPool(host='data.chc.ucsb.edu', port=443): Read timed out. (read timeout=None)
Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2008/chirps-v2.0.2008.05.15.tif.gz - Error: HTTPSConnectionPool(host='data.chc.ucsb.edu', port=443): Read timed out. (read timeout=None)


Processing files:  87%|████████▋ | 14222/16314 [48:32<39:56,  1.15s/it]   

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2019/chirps-v2.0.2019.08.27.tif.gz - Error: HTTPSConnectionPool(host='data.chc.ucsb.edu', port=443): Read timed out. (read timeout=None)


Processing files:  92%|█████████▏| 14928/16314 [56:22<15:09,  1.52it/s]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.01.tif - Error: Not a gzipped file (b'II')
Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.02.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14929/16314 [56:24<21:30,  1.07it/s]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.04.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14930/16314 [56:24<17:43,  1.30it/s]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.03.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14931/16314 [56:26<23:20,  1.01s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.05.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14932/16314 [56:27<19:10,  1.20it/s]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.06.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14933/16314 [56:28<25:45,  1.12s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.07.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14934/16314 [56:29<21:02,  1.09it/s]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.08.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14935/16314 [56:30<25:27,  1.11s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.09.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14936/16314 [56:31<23:53,  1.04s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.10.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14937/16314 [56:32<23:45,  1.04s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.11.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14938/16314 [56:34<30:27,  1.33s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.13.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14939/16314 [56:36<34:28,  1.50s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.14.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14940/16314 [56:38<37:16,  1.63s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.15.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14941/16314 [56:39<28:13,  1.23s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.12.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14942/16314 [56:40<30:48,  1.35s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.16.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14943/16314 [56:42<35:02,  1.53s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.17.tif - Error: Not a gzipped file (b'II')
Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.18.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14945/16314 [56:44<30:09,  1.32s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.20.tif - Error: Not a gzipped file (b'II')
Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.19.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14947/16314 [56:46<27:10,  1.19s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.21.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14948/16314 [56:47<24:08,  1.06s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.22.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14949/16314 [56:48<26:16,  1.15s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.23.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14950/16314 [56:50<30:55,  1.36s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.25.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14951/16314 [56:51<25:58,  1.14s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.24.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14952/16314 [56:52<27:19,  1.20s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.26.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14953/16314 [56:53<28:02,  1.24s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.27.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14954/16314 [56:54<23:52,  1.05s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.28.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14955/16314 [56:56<26:40,  1.18s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.29.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14956/16314 [56:57<31:33,  1.39s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.31.tif - Error: Not a gzipped file (b'II')


Processing files:  92%|█████████▏| 14957/16314 [56:59<31:19,  1.39s/it]

Failed: https://data.chc.ucsb.edu/products/CHIRPS-2.0/africa_daily/tifs/p05/2021/chirps-v2.0.2021.12.30.tif - Error: Not a gzipped file (b'II')


Processing files:  93%|█████████▎| 15141/16314 [2:52:26<13:21,  1.46it/s]
