---
Author: Mustapha Bouhsen <br>
Email : mustapha.bouhsen@habitat-nature.com <br>
[LinkedIn](https://www.linkedin.com/in/mustapha-bouhsen/)<br>
Date: July 15, 2024<br>
---

In [0]:
import pandas as pd

## Create the mount folder

In this section, we create a mount folder to access the data in the blob storage.

In [0]:
# storage_account_name = "datatest42"
# storage_key = "sv=2022-11-02&ss=bfqt&srt=sco&sp=rwdlacupyx&se=2024-08-31T03:31:55Z&st=2024-07-15T19:31:55Z&spr=https&sig=L3Ly%2Fos9foips1W5CkGiyg0hXZc%2BT2FzPN%2FuHOe%2BY58%3D"
# container_name = "database"

In [0]:
# mount_point = "/mnt/database"

# dbutils.fs.mount(
#   source = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/",
#   mount_point = mount_point,
#   extra_configs = {f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net":storage_key})

## Recursive File Path Retrieval in Nested Folders

In [0]:
def get_files_paths_from_folders(folder_path, endsWith=None):
    """
    Recursively retrieves the paths of all files within the specified folder and its subfolders.

    Parameters:
    - folder_path (str): The path to the folder for which file paths are to be retrieved.
    - endsWith (str | list[str] | tuple[str], optional): A single suffix, or a collection of
      suffixes, to filter files by (e.g. ".csv" or [".csv", ".parquet"]). A path is kept when
      it ends with at least one of them. When None (default), no filtering is applied.

    Returns:
    - List[str]: A list containing the paths of all files within the specified folder and its
      subfolders that end with one of the specified suffixes.
    """
    # Normalise the filter to a tuple so it can be handed directly to str.endswith.
    # A bare string is treated as one suffix rather than being iterated character by character.
    if endsWith is None:
        suffixes = None
    elif isinstance(endsWith, str):
        suffixes = (endsWith,)
    else:
        suffixes = tuple(endsWith)

    # Collect file paths from this folder and, recursively, from every subfolder
    my_paths = []
    for entry in dbutils.fs.ls(folder_path):
        # Index 0 of a listing entry is used as its path
        if entry.isFile():
            my_paths.append(entry[0])
        else:
            # The recursive call is left unfiltered; the filter is applied once below
            my_paths.extend(get_files_paths_from_folders(entry[0]))

    if suffixes is not None:
        # True suffix match: a substring test would also keep e.g. "data.csv.bak" for ".csv"
        my_paths = [file_path for file_path in my_paths if file_path.endswith(suffixes)]

    # Return the final list of file paths
    return my_paths


## Function to create a file path (URL) for the blob storage with the access key

In [0]:
def create_file_paths(path, storage_account_name, blob_key=''):
    """
    Constructs a URL for accessing a file in an Azure Blob Storage container.

    Parameters:
    - path (list of str): A list of strings representing the hierarchical path to the file in the storage container.
      The elements are joined with "/" and appended after the account host name.
    - storage_account_name (str): The name of the Azure Storage account.
    - blob_key (str, optional): The optional key or token for accessing the blob, appended as the
      query string. A leading "?" is tolerated. Default is an empty string, in which case no
      query string is added.

    Returns:
    - str: The complete URL for accessing the file in Azure Blob Storage.
    """
    blob_path = "/".join(path)
    url = f"https://{storage_account_name}.blob.core.windows.net/{blob_path}"

    # Strip any leading "?" so a token pasted with its separator does not yield "??"
    token = blob_key.lstrip("?") if blob_key else ""

    # Only add the query separator when there is a token, avoiding a dangling "?"
    if token:
        url = f"{url}?{token}"
    return url

## Function to create a lookup table (DataFrame) from a list of file paths, parsing specific metadata

In [0]:
def created_lookup_table(files_paths, storage_account_name, blob_key=''):
    """
    Creates a lookup table (DataFrame) from a list of file paths, parsing specific metadata.

    Each path is split on "/" and its first two components are dropped. In what remains, the
    third element is taken as the theme, the last element as the file itself, and every folder
    in between of the form "key=value" becomes a column. Repeated "subject=..." folders are
    numbered in order of appearance ('subject_1', 'subject_2', ...).

    Parameters:
    - files_paths (list of str): A list of file paths in Azure Blob Storage.
    - storage_account_name (str): The name of the Azure Storage account.
    - blob_key (str, optional): The optional key or token for accessing the blob. Default is an empty string.

    Returns:
    - pd.DataFrame: A pandas DataFrame containing parsed metadata columns ('theme', 'country', 'region', 'subject_*', 'year', 'file_path').
      Columns that no path provides are present and filled with missing values; an empty
      input yields an empty DataFrame with the fixed columns.

    Raises:
    - IndexError: If a path has fewer than five "/"-separated components, so that no theme
      element exists.
    """
    # Initialize an empty list to hold the parsed data
    parsed_data = []

    for full_path in files_paths:
        # Drop the first two "/"-separated components of the path
        path = full_path.split("/")[2:]

        # The third remaining element is the theme
        row_data = {"theme": path[2]}
        # Counter for subjects
        subject_counter = 1

        levels = path[3:]
        # Every level except the last is a folder; the last one is the file itself, so a
        # file name that happens to contain "=" is never mistaken for metadata
        for item in levels[:-1]:
            # partition splits on the first "=" only, so values may themselves contain "="
            key, separator, value = item.partition("=")
            if not separator:
                # Plain folder without metadata
                continue
            if key == 'subject':
                # Use a counter to create dynamic subject columns
                key = f"subject_{subject_counter}"
                subject_counter += 1
            row_data[key] = value

        if levels:
            row_data['file_path'] = create_file_paths(path, storage_account_name, blob_key)

        # Append the row data to the parsed_data list
        parsed_data.append(row_data)

    # Create a DataFrame from the parsed data
    df = pd.DataFrame(parsed_data)

    # Ensure the subjects are after the region column
    subject_columns = [col for col in df.columns if col.startswith('subject_')]
    column_order = ['theme', 'country', 'region'] + subject_columns + ["year", "file_path"]

    # reindex (rather than df[column_order]) creates any absent column instead of raising
    # KeyError, which also covers an empty list of paths
    return df.reindex(columns=column_order)