### Spark Aggregate Functions

### Simple aggregate functions

In [0]:
%run "../includes/configuration"

In [0]:
# Load the movies dataset (Parquet) from the folder given by silver_folder_path.
movies_parquet_path = f'{silver_folder_path}/movies'
movies_df = spark.read.parquet(movies_parquet_path)

In [0]:
from pyspark.sql.functions import count, countDistinct, sum

In [0]:
# Total number of rows in the movies DataFrame.
(
    movies_df
    .select(count('*'))
    .show()
)

In [0]:
# Number of rows whose year_release_date is not null
# (count on a column skips null values).
(
    movies_df
    .select(count('year_release_date'))
    .show()
)

In [0]:
# Number of distinct release years present in the data.
(
    movies_df
    .select(countDistinct('year_release_date'))
    .show()
)

In [0]:
# Sum of the budget column across all movies.
# NOTE: `sum` is pyspark.sql.functions.sum (imported above), not the builtin.
(
    movies_df
    .select(sum('budget'))
    .display()
)

In [0]:
# Total budget and number of movies for the 2016 release year.
# The budget must be aggregated with sum() (not count()) so that
# 'total_budget' really holds the budget total; aliasing the aggregates
# directly avoids renames that silently do nothing when the generated
# column name does not match.
# NOTE: `sum` is pyspark.sql.functions.sum (imported above), not the builtin.
(
    movies_df
    .filter('year_release_date = 2016')
    .select(
        sum('budget').alias('total_budget'),
        count('movie_id').alias('count_movies'),
    )
    .display()
)

### Group By

In [0]:
from pyspark.sql.functions import max, min, avg

In [0]:
# Budget statistics and movie count per release year.
# NOTE: sum / max / min are the pyspark.sql.functions versions imported
# above, which shadow the Python builtins of the same name.
movie_grup_by_df = (
    movies_df
    .groupBy('year_release_date')
    .agg(
        sum('budget').alias('total_budget'),      # summed budget for the year
        avg('budget').alias('avg_budget'),        # mean budget for the year
        max('budget').alias('max_budget'),        # largest single budget
        min('budget').alias('min_budget'),        # smallest single budget
        count('movie_id').alias('count_movies'),  # non-null movie_id values
    )
)

In [0]:
# Render the per-year aggregation in the notebook output.
movie_grup_by_df.display()

### Window Functions

In [0]:
from pyspark.sql.functions import rank, desc, dense_rank
from pyspark.sql.window import Window

In [0]:
# Rank movies within each release year by budget, highest budget first.
# rank() and dense_rank() use the same window specification, so it is built
# once; both names stay bound so any other cell referring to either of them
# keeps working.
movie_rank = Window.partitionBy('year_release_date').orderBy(desc('budget'))
movie_dense_rank = movie_rank

(
    movies_df
    .select('title', 'budget', 'year_release_date')
    .filter('year_release_date is not null')  # skip movies without a release year
    .withColumn('rank', rank().over(movie_rank))  # ties share a rank, then gaps
    .withColumn('dense_rank', dense_rank().over(movie_dense_rank))  # ties share a rank, no gaps
    .display()
)