In [0]:
import sys
sys.path.append('/Workspace/Users/pmanoj@depaul.edu')
from config import config
from pyspark.sql.functions import *
from pyspark.sql.window import *
import logging
# Configure the root logger so INFO-level (and higher) messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the transformation code defined below.
logger = logging.getLogger(__name__)

In [0]:
%run "./02_data_loaders"

In [0]:
# =============================================================================
# MEDAL EFFICIENCY TRANSFORMATION FUNCTION
# =============================================================================

def calculate_medal_efficiency(athletes_df, medals_df, config):
    """
    Calculate per-country medal efficiency metrics.

    Athletes are counted per ``Country``, inner-joined to the medal table on
    ``Country == TeamCountry``, and every joined country receives a
    medals-per-athlete ratio (rounded to 3 decimals) plus a categorical
    efficiency label derived from the thresholds in ``config``.

    Args:
        athletes_df: DataFrame with at least ``Country`` and ``PersonName``
            columns.
        medals_df: DataFrame with at least ``TeamCountry``, ``Gold``,
            ``Silver``, ``Bronze`` and ``Total`` columns. It is wrapped in a
            broadcast hint for the join.
        config: Configuration object providing ``MEDAL_EFFICIENCY_HIGH``,
            ``MEDAL_EFFICIENCY_MEDIUM`` and ``CACHE_DATAFRAMES``.

    Returns:
        DataFrame with the columns ``Country``, ``TotalAtheletes``, ``Gold``,
        ``Silver``, ``Bronze``, ``Total``, ``MedalsPerAthlete`` and
        ``MedalEfficiencyScore`` (``"High"``, ``"Medium"`` or ``"low"``).
        The ``TotalAtheletes`` spelling is kept as-is because it is part of
        the output schema.

    Raises:
        Exception: Any error raised while building or evaluating the
            DataFrames is logged and then re-raised unchanged.
    """
    try:
        logger.info("Calculating medal efficiency metrics...")

        # Count the athletes per country. count() on a column ignores nulls,
        # so rows with a null PersonName do not contribute.
        athletes_count = athletes_df.groupBy('Country').agg(
            count('PersonName').alias('TotalAtheletes')
        )
        logger.info(f"Calculated athlete count for {athletes_count.count()} countries.")

        # Join the athlete counts with the medal table. The inner join drops
        # countries that appear on only one side.
        combined_data = athletes_count.join(
            broadcast(medals_df),
            col("Country") == col("TeamCountry"),
            "inner"
        )
        logger.info(f"Joined data for {combined_data.count()} countries.")

        # Build the column expressions first, then apply them to the
        # DataFrame. Previously the second withColumn() and the select() were
        # chained onto the Column returned by otherwise() (misplaced closing
        # parenthesis), which fails with "'Column' object is not callable".
        # NOTE(review): a country whose PersonName values are all null gets
        # TotalAtheletes == 0, so the division yields null (or an error under
        # ANSI mode) -- confirm whether that case needs explicit handling.
        medals_per_athlete = when(col("Total") == 0, lit(0.0)).otherwise(
            round(col("Total") / col("TotalAtheletes"), 3)
        )
        # NOTE(review): the "low" label is lower-case while the other labels
        # are capitalised; left unchanged in case consumers match on it.
        efficiency_score = (
            when(col("MedalsPerAthlete") >= config.MEDAL_EFFICIENCY_HIGH, "High")
            .when(col("MedalsPerAthlete") >= config.MEDAL_EFFICIENCY_MEDIUM, "Medium")
            .otherwise("low")
        )

        medal_efficiency = (
            combined_data
            .withColumn("MedalsPerAthlete", medals_per_athlete)
            .withColumn("MedalEfficiencyScore", efficiency_score)
            .select(
                "Country",
                "TotalAtheletes",
                "Gold",
                "Silver",
                "Bronze",
                "Total",
                "MedalsPerAthlete",
                "MedalEfficiencyScore",
            )
        )

        # Mark the result for caching when enabled; the count() below is the
        # first action that runs on the marked DataFrame.
        if config.CACHE_DATAFRAMES:
            medal_efficiency.cache()
            logger.info("Medal Efficiency DataFrame cached.")

        logger.info(f"Medal Efficiency calculated for {medal_efficiency.count()} countries")
        return medal_efficiency

    except Exception as e:
        # Log for notebook visibility, then propagate so callers can react.
        logger.error(f"Error calculating medal efficiency metrics: {str(e)}")
        raise
    
# Printed when this cell runs, confirming the function above has been defined.
print("Medal Efficiency Transformation Function Loaded")