In [0]:
# Add a workspace user folder to the module search path so imports can resolve from it.
# NOTE(review): hard-coded, user-specific absolute path — consider sourcing it from configuration.
import sys
sys.path.append('/Workspace/Users/pmanoj@depaul.edu')
# Wildcard import: supplies col, count, round and when used by the cells below.
# Be aware that it also rebinds Python built-in names such as round in this namespace.
from pyspark.sql.functions import *
# NOTE(review): Window is not referenced in the cells visible here — confirm it is still needed.
from pyspark.sql.window import Window
import logging

# Request INFO-level logging and create the module-level logger used by the cells below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [0]:
%run "./02_data_loaders"

In [0]:
def calculate_sport_dominance(athletes_df, config):
    """
    Calculates the sport dominance of each country.

    For every (Country, Discipline) pair this computes what percentage of the
    country's athletes compete in that discipline, flags pairs at or above the
    specialization threshold, and assigns a strategy label.

    Args:
        athletes_df: Spark DataFrame providing the columns ``Country``,
            ``Discipline`` and ``PersonName``.
        config: Object exposing a ``CACHE_DATAFRAMES`` attribute; when it is
            truthy the result DataFrame is cached.

    Returns:
        Spark DataFrame with one row per (Country, Discipline) pair and the
        columns ``Country``, ``Discipline``, ``AthletesCountInEachSport``,
        ``TotalAthletesCount``, ``SportDominanceEfficiencyPercent``
        (rounded to 2 decimals), ``IsSpecialized`` and ``StrategyType``.

    Raises:
        Exception: any failure is logged and then re-raised unchanged.
    """
    # Namespaced import so this function does not depend on the notebook's
    # wildcard import rebinding built-ins such as ``round``.
    from pyspark.sql import functions as F

    # Percentage thresholds (same values as before, now named).
    specialized_pct = 25
    balanced_pct = 25
    focused_pct = 30
    high_risk_pct = 50

    try:
        logger.info("Calculating Sports Dominance for all Countries")

        # count("PersonName") counts non-null PersonName values per group.
        athletes_per_sport = athletes_df.groupBy("Country", "Discipline").agg(
            F.count("PersonName").alias("AthletesCountInEachSport")
        )
        athletes_per_country = athletes_df.groupBy("Country").agg(
            F.count("PersonName").alias("TotalAthletesCount")
        )

        dominance_pct = F.col("SportDominanceEfficiencyPercent")

        # Inner equality join on Country: rows whose Country is null do not
        # match and are therefore not part of the result.
        sport_dominance = (
            athletes_per_sport.join(athletes_per_country, on="Country", how="inner")
            .withColumn(
                "SportDominanceEfficiencyPercent",
                F.round(
                    F.col("AthletesCountInEachSport") / F.col("TotalAthletesCount") * 100,
                    2,
                ),
            )
            .withColumn(
                "IsSpecialized",
                # when/otherwise (rather than the bare comparison) keeps a null
                # percentage mapped to False instead of null.
                F.when(dominance_pct >= specialized_pct, True).otherwise(False),
            )
            .withColumn(
                "StrategyType",
                # Branches are evaluated top-down, so the highest threshold comes first.
                F.when(dominance_pct >= high_risk_pct, "High Risk/High Reward")
                .when(dominance_pct >= focused_pct, "Focused Strategy")
                .when(dominance_pct >= balanced_pct, "Balanced Approach")
                .otherwise("Diversified Strategy"),
            )
            .select(
                "Country",
                "Discipline",
                "AthletesCountInEachSport",
                "TotalAthletesCount",
                "SportDominanceEfficiencyPercent",
                "IsSpecialized",
                "StrategyType",
            )
        )

        if config.CACHE_DATAFRAMES:
            sport_dominance.cache()
            logger.info("Sports Dominance Efficiency cached")

        # count() is a Spark action: it runs the computation (and fills the
        # cache when caching is enabled). The result has one row per
        # (Country, Discipline) pair, so report it as such rather than as a
        # number of countries.
        row_count = sport_dominance.count()
        logger.info(
            f"Sports dominance efficiency calculated for {row_count} country/discipline rows"
        )
        return sport_dominance

    except Exception as e:
        logger.error(f"Error calculating sport dominance: {e}")
        raise

In [0]:
# Driver cell: build the sport-dominance table and display two top-10 views.
# NOTE(review): athletes_df and config are not defined in the visible cells —
# presumably provided by the "./02_data_loaders" notebook run above; confirm.
try:
    sports_dominance_efficiency = calculate_sport_dominance(athletes_df, config)

    # Rows can tie on the ranking column; Country/Discipline tie-breakers make
    # the ten displayed rows the same on every run.
    print("Top 10 countries with Highest Number of Atheletes and their Respective sport")
    (
        sports_dominance_efficiency
        .select("Country", "Discipline", "AthletesCountInEachSport")
        .orderBy(col("AthletesCountInEachSport").desc(), col("Country"), col("Discipline"))
        .show(10)
    )

    print("Top 10 Sport Dominant Countries and their Strategy type")
    (
        sports_dominance_efficiency
        .select("Country", "Discipline", "SportDominanceEfficiencyPercent", "StrategyType")
        .orderBy(col("SportDominanceEfficiencyPercent").desc(), col("Country"), col("Discipline"))
        .show(10)
    )

    print("Sport Dominance efficiency pipelines sucessfully completed")

except Exception as e:
    # This handler also covers the display steps, so the message describes the
    # whole cell instead of repeating the calculation-specific error that
    # calculate_sport_dominance already logs for its own failures.
    logger.error(f"Sport dominance analysis cell failed: {e}")
    raise