In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

def get_top_clubs_with_contracts_ending(spark, X, Y, Z):
    """Return the Y clubs with the most matching players in the `players` table.

    A player matches when the year of `club_joined` equals X and
    `club_contract_valid_until` is greater than or equal to Z.

    Args:
        spark: Active SparkSession used to read the table over JDBC.
        X: Year the player joined the club (compared with year(club_joined)).
        Y: Maximum number of clubs to return. Values <= 0 yield an empty
            list without contacting the database.
        Z: Inclusive lower bound for `club_contract_valid_until`.

    Returns:
        A list of Rows with fields `club_name` and `count`, sorted by `count`
        descending; clubs with equal counts are sorted by `club_name`.
    """
    # Nothing to rank: return before opening a JDBC connection. This mirrors
    # the X <= 0 guard in get_clubs_with_average_age and keeps a negative
    # value from ever reaching DataFrame.limit().
    if Y <= 0:
        return []

    # Placeholder credentials: replace before running and keep real secrets
    # out of the notebook.
    connection_properties = {
        'user': 'your_username',
        'password': 'your_password',
        'driver': 'org.postgresql.Driver'
    }

    df = spark.read.jdbc(
        url='jdbc:postgresql://localhost:5432/your_database',
        table='players',
        properties=connection_properties
    )

    # NOTE(review): the function name says "contracts ending", but this keeps
    # every contract valid until Z *or later* -- confirm that `>=` (rather
    # than `==`) is the intended comparison.
    filtered_df = df.filter(
        (F.col('club_contract_valid_until') >= Z) &
        (F.year(F.col('club_joined')) == X)
    )

    clubs_count = filtered_df.groupBy('club_name').count()

    # The secondary sort on club_name makes the result deterministic when
    # several clubs share the same count at the cut-off.
    top_clubs = clubs_count.orderBy(F.desc('count'), F.asc('club_name')).limit(Y)

    return top_clubs.collect()

In [2]:
def get_clubs_with_average_age(spark, X, Y, is_highest=True):
    """Return the X clubs ranked by average player age, including ties.

    Players are restricted to those whose `dob` falls in year Y. Age is
    computed as the current year minus the birth year, then averaged per
    `club_name`.

    Args:
        spark: Active SparkSession used to read the table over JDBC.
        X: Number of clubs requested. Values <= 0 yield an empty list
            without contacting the database.
        Y: Birth year used to filter players (compared with year(dob)).
        is_highest: When True, rank from highest average age downward;
            when False, from lowest upward.

    Returns:
        A list of Rows with fields `club_name` and `avg_age`. If X rows are
        found, every additional club whose `avg_age` equals that of the
        X-th row is appended, so the list may be longer than X.
    """
    # Validate before any I/O: the JDBC read below contacts the database,
    # which is pointless when no rows were requested.
    if X <= 0:
        return []

    # Placeholder credentials: replace before running and keep real secrets
    # out of the notebook.
    connection_properties = {
        'user': 'your_username',
        'password': 'your_password',
        'driver': 'org.postgresql.Driver'
    }

    df = spark.read.jdbc(
        url='jdbc:postgresql://localhost:5432/your_database',
        table='players',
        properties=connection_properties
    )

    # NOTE(review): after this filter every remaining player has birth year
    # Y, and age below is a pure year difference, so every club ends up with
    # the same avg_age and the tie handling returns all clubs. Confirm that
    # Y is really meant to be matched against `dob`.
    filtered_df = df.filter(F.year(F.col('dob')) == Y)

    now = F.current_date()
    aged_df = filtered_df.withColumn('age', F.year(now) - F.year(F.col('dob')))

    clubs_avg_age = aged_df.groupBy('club_name').agg(F.avg('age').alias('avg_age'))

    ordered_clubs = clubs_avg_age.orderBy('avg_age', ascending=not is_highest)
    top_X = ordered_clubs.limit(X).collect()

    # Only a full page can have been cut off in the middle of a tie group.
    if len(top_X) == X:
        last_avg_age = top_X[-1]['avg_age']
        additional = ordered_clubs.filter(F.col('avg_age') == last_avg_age).collect()
        # groupBy('club_name') yields one row per club, so the club name
        # identifies a row; a set avoids repeated list scans.
        seen_clubs = {row['club_name'] for row in top_X}
        top_X.extend(row for row in additional if row['club_name'] not in seen_clubs)

    return top_X

In [3]:
def get_most_popular_nationality(spark, start_year=2015, end_year=2022):
    """Map each birth year to the most frequent `nationality_name`.

    Args:
        spark: Active SparkSession used to read the table over JDBC.
        start_year: First birth year to report (inclusive). Defaults to 2015.
        end_year: Last birth year to report (inclusive). Defaults to 2022.

    Returns:
        A dict with one key per year from start_year to end_year. Each value
        is the `nationality_name` with the highest player count among rows
        whose `dob` falls in that year, or None when no row matches. Equal
        counts are resolved alphabetically, with a null name last. An empty
        range (start_year > end_year) returns {} without touching the
        database.
    """
    if start_year > end_year:
        return {}

    # Placeholder credentials: replace before running and keep real secrets
    # out of the notebook.
    connection_properties = {
        'user': 'your_username',
        'password': 'your_password',
        'driver': 'org.postgresql.Driver'
    }

    df = spark.read.jdbc(
        url='jdbc:postgresql://localhost:5432/your_database',
        table='players',
        properties=connection_properties
    )

    # NOTE(review): years are matched against the birth year taken from
    # `dob`; confirm this is the intended year column.
    birth_year = F.year(F.col('dob'))

    # A single grouped aggregation covers the whole range, instead of one
    # Spark job (and one scan of the JDBC table) per year. The collected
    # result has at most one row per (year, nationality) pair.
    counts = (
        df.filter(birth_year.between(start_year, end_year))
        .groupBy(birth_year.alias('birth_year'), F.col('nationality_name'))
        .count()
        .collect()
    )

    # Highest count first; ties resolved by name so the result is stable.
    counts.sort(
        key=lambda row: (
            -row['count'],
            row['nationality_name'] is None,
            row['nationality_name'] or '',
        )
    )

    winners = {}
    for row in counts:
        # The first row seen for a year is that year's winner.
        winners.setdefault(row['birth_year'], row['nationality_name'])

    return {year: winners.get(year) for year in range(start_year, end_year + 1)}

In [4]:
appName = "Big Data Analytics"
master = "local[*]"

# Create Configuration object for Spark.
# The driver memory key is `spark.driver.memory`; a misspelled key is simply
# stored as an unknown setting and the driver keeps its default memory.
conf = (
    pyspark.SparkConf()
    .set('spark.driver.host', '127.0.0.1')
    .set("spark.driver.memory", "10g")
    .set("spark.executor.memory", "10g")
    .setAppName(appName)
    .setMaster(master)
)

# getOrCreate() reuses a session that already exists in this kernel; in that
# case only runtime SQL configurations take effect, so restart the kernel
# for the memory settings above to apply.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

24/10/06 19:16:51 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
# Top 5 clubs for players who joined in 2020 with a contract valid until 2023 or later.
top_clubs = get_top_clubs_with_contracts_ending(spark, X=2020, Y=5, Z=2023)

# 10 clubs (plus ties) with the highest average age among players born in 2022.
clubs_by_age = get_clubs_with_average_age(
    spark,
    X=10,
    Y=2022,
    is_highest=True,
)

# Most common nationality for each birth year covered by the function.
popular_nationalities = get_most_popular_nationality(spark)