In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
df_imdb = spark.read.parquet('imdb_parquet/')

In [4]:
(
    df_imdb
    .withColumn('genre',f.split(f.col('genres'),',').getItem(0))
    .groupby('genre')
    .count()
    .orderBy(f.col('count').desc())
    .toPandas()
)

Unnamed: 0,genre,count
0,Comedy,10991
1,Drama,10761
2,Action,7019
3,Documentary,5886
4,Crime,2325
5,Adventure,2125
6,Animation,1759
7,Short,1451
8,Music,1422
9,Reality-TV,1314


In [14]:
(
    df_imdb
    .withColumn('genre',f.split(f.col('genres'),',').getItem(0))
    .groupBy('genre','startYear')
    .agg(f.mean('runtimeMinutes').alias('mean_runtimeMinutes'),)
    .orderBy('startYear', f.col('mean_runtimeMinutes').desc())
    .filter(f.col('startYear') == 2021)
    .toPandas()
)

Unnamed: 0,genre,startYear,mean_runtimeMinutes
0,Sport,2021,128.0
1,Musical,2021,94.0
2,Game-Show,2021,68.428571
3,Talk-Show,2021,64.2
4,Adventure,2021,61.507937
5,\N,2021,61.142857
6,Documentary,2021,58.54902
7,Crime,2021,56.375
8,Biography,2021,55.357143
9,Comedy,2021,51.335329


In [16]:
df_imdb = (
    df_imdb
    .withColumn('genre',f.split(f.col('genres'),',').getItem(0))
)

In [18]:
(
    df_imdb
    .groupby('genre')
    .agg(f.collect_set(f.col('titleType')).alias('lista_tipos_titulo'))
    .orderBy('genre')
    .toPandas()
)

Unnamed: 0,genre,lista_tipos_titulo
0,Action,"[tvSpecial, video, tvEpisode, tvMovie, short, ..."
1,Adult,"[video, tvEpisode, tvMovie, short, videoGame, ..."
2,Adventure,"[tvSpecial, video, tvEpisode, tvMovie, short, ..."
3,Animation,"[tvSpecial, video, tvEpisode, tvMovie, short, ..."
4,Biography,"[tvSpecial, video, tvEpisode, tvMovie, short, ..."
5,Comedy,"[tvSpecial, video, tvEpisode, tvMovie, short, ..."
6,Crime,"[video, tvEpisode, tvMovie, short, videoGame, ..."
7,Documentary,"[tvSpecial, video, tvEpisode, tvMovie, short, ..."
8,Drama,"[tvSpecial, video, tvEpisode, tvMovie, short, ..."
9,Family,"[tvSpecial, video, tvEpisode, tvMovie, short, ..."


In [22]:
(
    df_imdb
    .drop('genre')
    .withColumn('genres', f.explode(f.split(f.col('genres'),',')))
    .groupby('tconst','primaryTitle')
    .pivot('genres')
    .count()
    .na.fill(0)
    .orderBy('tconst')
    .limit(10)
    .toPandas()
)

Unnamed: 0,tconst,primaryTitle,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,\N
0,tt0002309,Lincoln's Gettysburg Address,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,tt0002434,The Pony Express Girl,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,tt0002437,A Prize Package,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,tt0002582,The Widow Casey's Return,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,tt0006371,Arms and the Woman,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,tt0007254,The Red Widow,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
6,tt0007326,Seventeen,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,tt0008528,A Régiséggyüjtö,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8,tt0009851,Masked Ball,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,tt0012297,Hunger... Hunger... Hunger,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
