In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:

# Make the workspace user folder importable; presumably this is where the
# `config` module imported on the next lines lives -- TODO confirm.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# find `config` when run from another user's workspace.
import sys
sys.path.append('/Workspace/Users/pmanoj@depaul.edu')
from config import config

# NOTE(review): the star import binds the pyspark.sql.functions names into the
# notebook namespace. Later cells call `col`, `when`, `abs` and `dense_rank`
# unqualified and depend on it; in particular `abs(...)` on Column expressions
# is meant to resolve to the pyspark function, not the Python built-in.
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import logging

# Request INFO-level logging and create the module logger used by later cells.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [0]:
%run "./02_data_loaders"

In [0]:
def analyze_gender_participation(gender_df, config):
    """
    Append gender-participation metrics to every row of ``gender_df``.

    Args:
        gender_df: Spark DataFrame providing numeric ``Female``, ``Male`` and
            ``Total`` columns (presumably one row per sport/discipline --
            TODO confirm against the loader notebook).
        config: Object exposing a ``CACHE_DATAFRAMES`` attribute; when truthy
            the result is marked for caching before it is returned.

    Returns:
        ``gender_df`` with the following columns added:

        * ``FemalePercentage`` / ``MalePercentage`` -- ``Female`` (resp.
          ``Male``) divided by ``Total``, times 100. Null when ``Total`` is
          null or zero.
        * ``GenderGap`` -- absolute difference between the two percentages.
        * ``genderCategory`` -- ``"Female Dominant"``, ``"Balanced"`` or
          ``"Male Dominant"``; ``"Unknown"`` when either percentage is null.
        * ``SportSizeRank`` -- dense rank of the row by ``Total``, descending.
        * ``SportSizeCategory`` -- ``"Large Sport"`` (rank <= 10),
          ``"Medium Sport"`` (rank <= 20) or ``"Small Sport"``.
        * ``OpportunityType`` -- ``"High Female/Male Opportunity"`` (large
          sport, share >= 55), ``"Medium Female/Male Opportunity"`` (medium
          sport, share >= 60), ``"Balanced Competition"`` (female share
          between 45 and 55 inclusive) or ``"Standard Competition"``.

    Raises:
        Exception: Any error raised while building the plan is logged and
            re-raised unchanged.
    """
    try:
        logger.info("Starting the Gender analysis Pipeline")

        # Thresholds used by the size and opportunity classifications below.
        large_rank_cutoff = 10
        medium_rank_cutoff = 20
        large_share_cutoff = 55
        medium_share_cutoff = 60
        balanced_low, balanced_high = 45, 55

        # A null or zero Total makes the ratios undefined: Spark yields null
        # by default and raises under ANSI mode. Guard the division so the
        # percentage is simply null for such rows.
        has_total = col("Total").isNotNull() & (col("Total") != 0)

        # Reusable references, spelled with the exact casing used when the
        # columns are created so the code does not depend on Spark's
        # case-insensitive column resolution.
        female_pct = col("FemalePercentage")
        male_pct = col("MalePercentage")
        is_large = col("SportSizeCategory") == "Large Sport"
        is_medium = col("SportSizeCategory") == "Medium Sport"

        gender_metrics = (
            gender_df
            .withColumn(
                "FemalePercentage",
                when(has_total, (col("Female") / col("Total")) * 100),
            )
            .withColumn(
                "MalePercentage",
                when(has_total, (col("Male") / col("Total")) * 100),
            )
            # `abs` here is the pyspark.sql.functions version (star import).
            .withColumn("GenderGap", abs(female_pct - male_pct))
            .withColumn(
                "genderCategory",
                # Without this first branch, rows with null percentages would
                # fail every comparison and be reported as "Male Dominant".
                when(female_pct.isNull() | male_pct.isNull(), "Unknown")
                .when(female_pct > male_pct, "Female Dominant")
                .when(male_pct == female_pct, "Balanced")
                .otherwise("Male Dominant"),
            )
            .withColumn(
                "SportSizeRank",
                # NOTE: an un-partitioned window ranks across the whole
                # DataFrame, so Spark moves all rows to a single partition.
                dense_rank().over(Window.orderBy(col("Total").desc())),
            )
            .withColumn(
                "SportSizeCategory",
                when(col("SportSizeRank") <= large_rank_cutoff, "Large Sport")
                .when(col("SportSizeRank") <= medium_rank_cutoff, "Medium Sport")
                .otherwise("Small Sport"),
            )
            .withColumn(
                "OpportunityType",
                # Branch order matters: the first matching rule wins.
                when((female_pct >= large_share_cutoff) & is_large, "High Female Opportunity")
                .when((male_pct >= large_share_cutoff) & is_large, "High Male Opportunity")
                .when((female_pct >= medium_share_cutoff) & is_medium, "Medium Female Opportunity")
                .when((male_pct >= medium_share_cutoff) & is_medium, "Medium Male Opportunity")
                .when(female_pct.between(balanced_low, balanced_high), "Balanced Competition")
                .otherwise("Standard Competition"),
            )
        )

        if config.CACHE_DATAFRAMES:
            gender_metrics.cache()

        return gender_metrics
    except Exception as e:
        logger.error(f"Error in analyze_gender_participation: {e}")
        raise


In [0]:

try:
    # Runs the gender analysis and prints a series of summary reports.
    # `gender_df` is not defined in this notebook's visible cells; presumably it
    # is provided by the `%run "./02_data_loaders"` cell -- TODO confirm.
    logger.info("EXECUTING GENDER ANALYSIS PIPELINE")

    gender_analysis_results = analyze_gender_participation(gender_df, config)

    print("=== GENDER ANALYSIS RESULTS ===")
    print(f"Total sports analyzed: {gender_analysis_results.count()}")

    print("\n=== TOP 10 MOST IMBALANCED SPORTS ===")
    gender_analysis_results.orderBy(col("GenderGap").desc()).show(10)

    print("\n=== GENDER CATEGORY BREAKDOWN ===")
    gender_analysis_results.groupBy("genderCategory").count().orderBy(col("count").desc()).show()

    print("\n=== OPPORTUNITY SPORTS ===")
    print("High opportunity sports for strategic planning:")
    # Matches every OpportunityType label containing the word "Opportunity"
    # (the High/Medium Female/Male variants), excluding the "Competition" labels.
    gender_analysis_results.filter(
        col("OpportunityType").contains("Opportunity")
    ).orderBy(col("Total").desc()).show()

    # The category labels produced by analyze_gender_participation are
    # "Female Dominant" / "Male Dominant"; filtering on "... Dominated" never
    # matches and silently yields empty tables.
    print("\n=== FEMALE-DOMINATED LARGE SPORTS ===")
    print("Sports where women dominate in major competitions:")
    gender_analysis_results.filter(
        (col("genderCategory") == "Female Dominant") &
        (col("SportSizeCategory") == "Large Sport")
    ).show()

    print("\n=== MALE-DOMINATED LARGE SPORTS ===")
    print("Sports where men dominate in major competitions:")
    gender_analysis_results.filter(
        (col("genderCategory") == "Male Dominant") &
        (col("SportSizeCategory") == "Large Sport")
    ).show()

    print("\n=== BALANCED SPORTS (EQUAL OPPORTUNITY) ===")
    print("Sports with gender balance - equal competition:")
    gender_analysis_results.filter(
        col("genderCategory") == "Balanced"
    ).orderBy(col("Total").desc()).show()

    logger.info("Gender analysis pipeline completed successfully")

except Exception as e:
    # Log for the notebook run, then re-raise so the cell still fails visibly.
    logger.error(f"Pipeline execution failed: {e}")
    raise

In [0]:
# Follow-up reports on `gender_analysis_results`, which is assigned by the
# pipeline cell above; that cell must have run first.

# Rows categorised as "Large Sport", listed by their size rank.
print(" LARGE SPORTS BY GENDER ")
gender_analysis_results.filter(col("SportSizeCategory") == "Large Sport").select(
    "Discipline", "FemalePercentage", "MalePercentage", "genderCategory", "SportSizeRank"
).orderBy("SportSizeRank").show()

# Rows labelled "Male Dominant", highest male share first.
print("ALL MALE DOMINATED SPORTS ")
gender_analysis_results.filter(col("genderCategory") == "Male Dominant").select(
    "Discipline", "SportSizeCategory", "MalePercentage"
).orderBy(col("MalePercentage").desc()).show()

# Rows labelled "Female Dominant", highest female share first. Ordered to
# mirror the male report; without an orderBy the row order is arbitrary.
print("ALL FEMALE DOMINATED SPORTS ")
gender_analysis_results.filter(col("genderCategory") == "Female Dominant").select(
    "Discipline", "SportSizeCategory", "FemalePercentage"
).orderBy(col("FemalePercentage").desc()).show()