In [1]:
# Instala o PySpark
!pip install pyspark



In [29]:
# Importações
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [3]:
# Monta o Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Cria a sessão Spark
spark = SparkSession.builder.getOrCreate()

In [7]:
# Leitura do arquivo Parquet
df_video = spark.read.parquet("/content/drive/MyDrive/Colab Notebooks/projeto/videos-comments-tratados-parquet/videos-preparados.snappy.parquet")

# Exibir as primeiras linhas do DataFrame
df_video.show(5)

+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|Keyword| Likes|Comments|   Views|Interaction|Year|Month|Keyword Index|        Features PCA|     Features Normal|            Features|
+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|mukbang|378858|   18860|17975269|   18372987|2020|    4|         30.0|[0.6985786560867407]|[0.02303716158264...|[378858.0,1.79752...|
|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|   news|  6379|    4853|  808787|     820019|2022|    8|         37.0|[0.8936407990235931]|[3.87946679100418...|[6379.0,808787.0,...|
|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|   news|  1029|    2347|   97434|     100810|202

In [9]:
# Agrupar por 'Keyword' e contar a quantidade de registros
df_keyword_count = df_video.groupBy("Keyword").agg(count("*").alias("quantidade"))

# Exibir o resultado ordenado pela quantidade
df_keyword_count.orderBy("quantidade", ascending=False).show()

+----------------+----------+
|         Keyword|quantidade|
+----------------+----------+
|             cnn|        50|
|       interview|        50|
|          crypto|        50|
|    data science|        50|
|        trolling|        50|
|        tutorial|        50|
|          marvel|        50|
|game development|        50|
|         mrbeast|        50|
|         physics|        50|
|             sat|        49|
|         history|        49|
|           cubes|        49|
|        reaction|        49|
|          sports|        49|
|            asmr|        49|
|computer science|        48|
|            food|        48|
|          how-to|        48|
|machine learning|        48|
+----------------+----------+
only showing top 20 rows



In [12]:
# Média da coluna "Interaction" para cada valor único da coluna 'Keyword'
df_keyword_media = df_video.groupBy("Keyword") \
    .agg(
        count("*").alias("quantidade"),
        format_number(avg("Interaction"), 2).alias("media_interaction")
    )

df_keyword_media.orderBy("quantidade", ascending=False).show()

+----------------+----------+-----------------+
|         Keyword|quantidade|media_interaction|
+----------------+----------+-----------------+
|             cnn|        50|       570,650.86|
|       interview|        50|     3,044,867.04|
|          crypto|        50|       413,676.20|
|    data science|        50|       562,465.28|
|        trolling|        50|     1,484,584.88|
|        tutorial|        50|     6,936,688.30|
|          marvel|        50|     6,834,159.44|
|game development|        50|       752,243.56|
|         mrbeast|        50|    68,965,862.82|
|         physics|        50|     3,795,529.38|
|             sat|        49|     1,098,927.00|
|         history|        49|    15,652,692.57|
|           cubes|        49|    15,043,961.22|
|        reaction|        49|       164,723.57|
|          sports|        49|     8,695,551.63|
|            asmr|        49|     1,779,749.69|
|computer science|        48|     1,226,793.02|
|            food|        48|     5,352,

In [20]:
# Máximo da coluna "Interaction" para cada valor único da coluna 'Keyword'
df_keyword_maximo = df_video.groupBy("Keyword") \
    .agg(
        count("*").alias("quantidade"),
        max("Interaction").alias("Rank interaction")
    )

df_keyword_maximo = df_keyword_maximo.orderBy("Rank interaction", ascending=False).show()

+--------+----------+----------------+
| Keyword|quantidade|Rank interaction|
+--------+----------+----------------+
| animals|        38|      1593623628|
|   music|        46|       922551152|
|     bed|        44|       532691631|
| history|        49|       440187490|
|   apple|        42|       429916936|
| mrbeast|        50|       300397699|
|  google|        45|       239385460|
|business|        48|       210025196|
|   cubes|        49|       170925917|
|  sports|        49|       106924567|
| mukbang|        45|        87433858|
|    lofi|        40|        86445177|
|tutorial|        50|        69616442|
|  movies|        44|        65253870|
|  marvel|        50|        56247330|
|  how-to|        48|        53053975|
|    food|        48|        48754479|
| physics|        50|        43463298|
|    asmr|        49|        34411125|
|nintendo|        48|        32268486|
+--------+----------+----------------+
only showing top 20 rows



In [21]:
# Média e a variância da coluna 'Views' para cada valor único da coluna 'Keyword'
df_keyword_med_var = df_video.groupBy("Keyword") \
    .agg(
        count("*").alias("quantidade"),
        format_number(avg("Views"), 2).alias("media_views"),
        format_number(var_samp("Views"), 2).alias("variancia_views")
    )
df_keyword_med_var = df_keyword_med_var.orderBy("quantidade", ascending=False).show()

+----------------+----------+-------------+--------------------+
|         Keyword|quantidade|  media_views|     variancia_views|
+----------------+----------+-------------+--------------------+
|             cnn|        50|   554,240.38|  156,342,361,846.81|
|       interview|        50| 2,966,111.86|18,192,209,960,34...|
|          crypto|        50|   404,608.22|3,513,691,634,369.07|
|    data science|        50|   544,771.98|  547,933,652,535.00|
|        trolling|        50| 1,420,141.02|6,932,651,793,973.29|
|        tutorial|        50| 6,761,032.02|136,962,659,686,4...|
|          marvel|        50| 6,614,079.56|144,026,680,526,2...|
|game development|        50|   724,688.54|1,276,998,225,613.15|
|         mrbeast|        50|66,764,003.98|3,824,123,679,605...|
|         physics|        50| 3,692,387.28|47,414,096,705,62...|
|             sat|        49| 1,065,868.71|8,285,094,966,049.21|
|         history|        49|15,353,155.53|4,253,204,661,918...|
|           cubes|       

In [22]:
# Média, o valor mínimo e o valor máximo de 'Views' para cada valor único da coluna 'Keyword', sem casas decimais
df_keyword_views_casas_decimais = df_video.groupBy("Keyword") \
    .agg(
        count("*").alias("quantidade"),
        round(avg("Views")).alias("media_views"),
        round(min("Views")).alias("min_views"),
        round(max("Views")).alias("max_views")
    )

df_keyword_views_casas_decimais.orderBy("quantidade", ascending=False).show()

+----------------+----------+-----------+---------+---------+
|         Keyword|quantidade|media_views|min_views|max_views|
+----------------+----------+-----------+---------+---------+
|             cnn|        50|   554240.0|    51269|  1889320|
|       interview|        50|  2966112.0|     2587| 22529756|
|          crypto|        50|   404608.0|     1599| 11805668|
|    data science|        50|   544772.0|      911|  3069097|
|        trolling|        50|  1420141.0|     5388| 14286302|
|        tutorial|        50|  6761032.0|    19323| 68512549|
|          marvel|        50|  6614080.0|     2813| 54583132|
|game development|        50|   724689.0|     1352|  6478696|
|         mrbeast|        50|6.6764004E7|   889300|285526909|
|         physics|        50|  3692387.0|    30388| 42252029|
|             sat|        49|  1065869.0|     7163| 18116954|
|         history|        49|1.5353156E7|     6640|434352213|
|           cubes|        49|1.4735344E7|    10146|168546247|
|       

In [24]:
# Mostre o primeiro e o último 'Published At' para cada valor único da coluna 'Keyword'
df_keyword_published = df_video.groupBy("Keyword") \
    .agg(
        min("Published At").alias("primeira_publicacao"),
        max("Published At").alias("ultima_publicacao")
    )

df_keyword_published.show(truncate=False)

+----------------+-------------------+-----------------+
|Keyword         |primeira_publicacao|ultima_publicacao|
+----------------+-------------------+-----------------+
|computer science|2009-08-20         |2022-08-12       |
|lofi            |2019-12-08         |2022-08-24       |
|finance         |2012-11-27         |2022-08-24       |
|cnn             |2022-07-14         |2022-08-24       |
|apple           |2016-11-02         |2022-08-24       |
|news            |2022-08-18         |2022-08-24       |
|mukbang         |2020-02-29         |2022-08-24       |
|education       |2008-07-25         |2022-08-24       |
|interview       |2016-01-05         |2022-08-24       |
|crypto          |2022-03-11         |2022-08-24       |
|mathchemistry   |2013-04-15         |2022-05-03       |
|food            |2017-05-31         |2022-08-24       |
|data science    |2018-06-23         |2022-08-24       |
|trolling        |2020-06-14         |2022-08-24       |
|tutorial        |2017-02-01   

In [25]:
# Conte todos os 'title' de forma normal e todos os únicos e verifique se há diferença
df_video.select(
    count("title").alias("total_titles"),
    countDistinct("title").alias("titulos_unicos")
).show()

+------------+--------------+
|total_titles|titulos_unicos|
+------------+--------------+
|        1869|          1854|
+------------+--------------+



In [26]:
# Mostre a quantidade de registros ordenados por ano em ordem ascendente
df_por_ano = df_video.withColumn("Ano", year("Published At")) \
    .groupBy("Ano") \
    .agg(count("*").alias("quantidade")) \
    .orderBy("Ano", ascending=True)

df_por_ano.show()

+----+----------+
| Ano|quantidade|
+----+----------+
|2007|         2|
|2008|         1|
|2009|         9|
|2010|         6|
|2011|         4|
|2012|        12|
|2013|         6|
|2014|        10|
|2015|        15|
|2016|        34|
|2017|        47|
|2018|        57|
|2019|        86|
|2020|       158|
|2021|       229|
|2022|      1193|
+----+----------+



In [27]:
# Mostre a quantidade de registros ordenados por ano e mês em ordem ascendente
df_por_ano_mes = df_video.withColumn("Ano", year("Published At")) \
    .withColumn("Mes", month("Published At")) \
    .groupBy("Ano", "Mes") \
    .agg(count("*").alias("quantidade")) \
    .orderBy("Ano", "Mes")

df_por_ano_mes.show()

+----+---+----------+
| Ano|Mes|quantidade|
+----+---+----------+
|2007|  7|         1|
|2007| 12|         1|
|2008|  7|         1|
|2009|  2|         2|
|2009|  6|         2|
|2009|  7|         1|
|2009|  8|         1|
|2009| 10|         1|
|2009| 12|         2|
|2010|  3|         1|
|2010|  5|         2|
|2010|  6|         1|
|2010|  9|         1|
|2010| 10|         1|
|2011|  2|         1|
|2011|  5|         1|
|2011|  9|         1|
|2011| 10|         1|
|2012|  1|         1|
|2012|  2|         3|
+----+---+----------+
only showing top 20 rows



In [30]:
# Calcule a média acumulativa de ‘Likes’ para cada ‘Keyword’ ao longo dos anos

# Criar uma coluna com o ano
df_com_ano = df_video.withColumn("Ano", year("Published At"))

# Definir a janela de tempo por Keyword e ordem crescente do ano
janela = Window.partitionBy("Keyword").orderBy("Ano") \
               .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Calcular a média acumulativa
df_media_acumulativa = df_com_ano.groupBy("Keyword", "Ano") \
    .agg(avg("Likes").alias("media_likes_ano")) \
    .withColumn("media_likes_acumulada", avg("media_likes_ano").over(janela)) \
    .orderBy("Keyword", "Ano")

df_media_acumulativa.show()

+-------+----+------------------+---------------------+
|Keyword| Ano|   media_likes_ano|media_likes_acumulada|
+-------+----+------------------+---------------------+
|animals|2009|         1357197.0|            1357197.0|
|animals|2010|          203367.0|             780282.0|
|animals|2013|       1.1025176E7|    4195246.666666667|
|animals|2014|         3381630.0|            3991842.5|
|animals|2019|         1103713.0|            3414216.6|
|animals|2020| 769652.1111111111|    2973455.851851852|
|animals|2021|         112729.75|   2564780.6944444445|
|animals|2022|30335.214285714286|    2247975.009424603|
|  apple|2016|         4144389.0|            4144389.0|
|  apple|2021|           38261.0|            2091325.0|
|  apple|2022|           19416.6|   1400688.8666666665|
|   asmr|2020|          148120.0|             148120.0|
|   asmr|2021| 363124.3333333333|   255622.16666666666|
|   asmr|2022| 13171.31111111111|   174805.21481481483|
|    bed|2007|          317160.5|             31