In [0]:
# =============================================================================
# DATA LOADING MODULE - Load Olympic Datasets from Azure Storage
# =============================================================================

# Import our config
import sys
sys.path.append('/Workspace/Users/pmanoj@depaul.edu')
from config import config

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import logging

# Setup logging
# Root logging is configured at INFO so this notebook's own messages are shown.
logging.basicConfig(level=logging.INFO)
# With the root level at INFO, py4j's client/server chatter
# ("Received command c on object id p0") is emitted into every cell's output.
# Raise only that logger's threshold so the notebook's own INFO lines stay readable.
logging.getLogger("py4j").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Startup banner: confirms the cell ran and shows the caching flag read from config.
print("Data loading module initialized")
print(f"Cache setting: {config.CACHE_DATAFRAMES}")

INFO:py4j.clientserver:Received command c on object id p0


Data loading module initialized
Cache setting: True


In [0]:

import os

# Storage configuration
storage_account_name = "tokyoolympicdatamegha"
container_name = "tokyo-olympic-data"

# The account key is a credential and must not live in the notebook source.
# It is read from the AZURE_STORAGE_ACCESS_KEY environment variable instead
# (on Databricks this can be backed by a secret scope - confirm the cluster setup).
# NOTE(review): the key that used to be hard-coded here has been exposed to
# everyone with access to this notebook and its history; rotate it.
access_key = os.environ.get("AZURE_STORAGE_ACCESS_KEY")
if not access_key:
    raise RuntimeError(
        "AZURE_STORAGE_ACCESS_KEY is not set; cannot configure access to "
        f"storage account '{storage_account_name}'"
    )

# Register the account key with Spark so wasbs:// paths on this account can be read.
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
    access_key,
)

logger.info("Spark config set successfully")

INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Spark config set successfully


In [0]:
def load_olympic_datasets(
    storage_account_name: str = "tokyoolympicdatamegha",
    container_name: str = "tokyo-olympic-data",
) -> tuple:
    """
    Load all Olympic datasets from Azure blob storage.

    Reads five CSV datasets (header row, inferred schema) from the ``raw/``
    folder of the given container, caches them when
    ``config.CACHE_DATAFRAMES`` is truthy, and checks that each one has at
    least one row.

    Args:
        storage_account_name: Azure storage account that hosts the container.
        container_name: Blob container that holds the ``raw/`` folder.

    Returns:
        Tuple of DataFrames in this order:
        (athletes_df, coaches_df, gender_df, medals_df, teams_df).

    Raises:
        AssertionError: If any of the loaded DataFrames is empty.
        Exception: Any other error raised while reading; it is logged with its
            traceback and re-raised unchanged.
    """
    # (label used in messages, folder name under raw/), in return order.
    # "Atheletes" is kept exactly as in the original read path; the recorded
    # run of this notebook loaded successfully with that spelling.
    sources = (
        ("Athletes", "Atheletes"),
        ("Coaches", "Coaches"),
        ("Gender", "GenderEntriesData"),
        ("Medals", "Medals"),
        ("Teams", "Teams"),
    )

    try:
        logger.info("Loading datasets from Azure blob storage...")

        base_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/raw/"
        logger.info(f"Base path: {base_path}")

        # Load datasets
        frames = [
            spark.read.csv(f"{base_path}{folder}", header=True, inferSchema=True)
            for _, folder in sources
        ]

        # Cache only when enabled, and only report caching when it actually happened.
        if config.CACHE_DATAFRAMES:
            for frame in frames:
                frame.cache()
            logger.info("Datasets Cached for Performance")

        # Verify DataFrames: count() runs a Spark job per dataset.
        for (label, _), frame in zip(sources, frames):
            assert frame.count() > 0, f"{label} DataFrame is empty"

        return tuple(frames)

    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Error loading datasets: %s", e)
        raise

# Run the loader once and bind each returned DataFrame to a notebook-level name
# so later cells can use them directly.
(
    athletes_df,
    coaches_df,
    gender_df,
    medals_df,
    teams_df,
) = load_olympic_datasets()

logger.info("Datasets loaded successfully")

INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Loading datasets from Azure blob storage...
INFO:__main__:Base path: wasbs://tokyo-olympic-data@tokyoolympicdatamegha.blob.core.windows.net/raw/
INFO:__main__:Datasets Cached for Performance
INFO:__main__:Datasets loaded successfully
