In [1]:
# Instalar o PySpark
!pip install pyspark



In [26]:
# Importações
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, regexp_extract, to_date, year

In [3]:
# Montar o Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Criação da SparkSession
spark = SparkSession.builder.getOrCreate()

In [10]:
# Leitura do arquivo CSV
df_video = spark.read.csv("drive/MyDrive/Colab Notebooks/projeto/videos-stats.csv", header=True, inferSchema=True)

df_video.show(5)  # Mostra as 5 primeiras linhas
df_video.printSchema()  # Mostra o esquema inferido das colunas

+---+--------------------+-----------+------------+-------+-------+--------+---------+
|_c0|               Title|   Video ID|Published At|Keyword|  Likes|Comments|    Views|
+---+--------------------+-----------+------------+-------+-------+--------+---------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech| 3407.0|   672.0| 135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech|76779.0|  4306.0|1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech|63825.0|  3338.0|1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech|71566.0|  1426.0| 922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech|96513.0|  5155.0|1855644.0|
+---+--------------------+-----------+------------+-------+-------+--------+---------+
only showing top 5 rows

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable =

In [9]:
# Altere os valores nulos dos campos 'Likes', 'Comments' e 'Views' para o valor 0
videos_stats_categ_nulos = videos_stats.na.fill({'Likes': 0, 'Comments': 0, 'Views': 0})

videos_stats_categ_nulos.show()

+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|      Views|
+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0|   135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|  1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|  1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0|   922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|  1855644.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0|   943119.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|  5937790.0|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935.0| 12605.0|  4782514.0|
|  8|15 Em

In [11]:
# Leitura do arquivo CSV
df_comentario = spark.read.csv("drive/MyDrive/Colab Notebooks/projeto/comments.csv", header=True, inferSchema=True)

df_comentario.show(5)  # Mostra as 5 primeiras linhas
df_comentario.printSchema()  # Mostra o esquema inferido das colunas

+---+-----------+--------------------+-----+---------+
|_c0|   Video ID|             Comment|Likes|Sentiment|
+---+-----------+--------------------+-----+---------+
|  0|wAZZ-UWGVHI|Let's not forget ...| 95.0|      1.0|
|  1|wAZZ-UWGVHI|Here in NZ 50% of...| 19.0|      0.0|
|  2|wAZZ-UWGVHI|I will forever ac...|161.0|      2.0|
|  3|wAZZ-UWGVHI|Whenever I go to ...|  8.0|      0.0|
|  4|wAZZ-UWGVHI|Apple Pay is so c...| 34.0|      2.0|
+---+-----------+--------------------+-----+---------+
only showing top 5 rows

root
 |-- _c0: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [12]:
# Contagem de registros no DataFrame videos e comentarios
print('Total de registros videos: ', df_video.count())
print('Total de registros videos: ', df_comentario.count())

Total de registros videos:  1881
Total de registros videos:  30036


In [14]:
# Remover registros com valores nulos videos e comentarios
videos_remocao_nulos = df_video.na.drop(subset=['Video ID'])
print('Total de registros após limpeza de valores nulos: ', df_video.count())

comentario_remocao_nulos = df_comentario.na.drop(subset=['Video ID'])
print('Total de registros após limpeza de valores nulos: ', df_comentario.count())

Total de registros após limpeza de valores nulos:  1881
Total de registros após limpeza de valores nulos:  30036


In [16]:
# Remover duplicados videos
videos_unicos = df_video.dropDuplicates(subset=['Video ID'])
print('Total de registros únicos: ', df_video.count())

Total de registros únicos:  1881


In [17]:
# Conversão de colunas no DataFrame de video
registros_tratados_video_df = df_video \
    .withColumn("Likes", col("Likes").cast("int")) \
    .withColumn("Comments", col("Comments").cast("int")) \
    .withColumn("Views", col("Views").cast("int"))

# Exibir os dados transformados
registros_tratados_video_df.show()

# Exibir o schema do DataFrame transformado
registros_tratados_video_df.printSchema()

+---+--------------------+-----------+------------+-------+------+--------+--------+
|_c0|               Title|   Video ID|Published At|Keyword| Likes|Comments|   Views|
+---+--------------------+-----------+------------+-------+------+--------+--------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407|     672|  135612|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779|    4306| 1758063|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825|    3338| 1564007|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566|    1426|  922918|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513|    5155| 1855644|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570|    1643|  943119|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047|    9367| 5937790|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935|   12605| 4782514|
|  8|15 Emerging Techn...|wLlL46pYcg4|  2021-12-08|   tech| 45565

In [19]:
# Conversão de colunas no DataFrame de comentario
registros_tratados_comentario_df = df_comentario \
    .withColumn("Likes", col("Likes").cast("int")) \
    .withColumn("Sentiment", col("Sentiment").cast("int")) \
    .withColumnRenamed("Likes", "Likes Comment")

# Exibir os dados transformados
registros_tratados_comentario_df.show()

# Exibir o schema do DataFrame transformado
registros_tratados_comentario_df.printSchema()

+--------------+-----------+--------------------+-------------+---------+
|           _c0|   Video ID|             Comment|Likes Comment|Sentiment|
+--------------+-----------+--------------------+-------------+---------+
|             0|wAZZ-UWGVHI|Let's not forget ...|           95|        1|
|             1|wAZZ-UWGVHI|Here in NZ 50% of...|           19|        0|
|             2|wAZZ-UWGVHI|I will forever ac...|          161|        2|
|             3|wAZZ-UWGVHI|Whenever I go to ...|            8|        0|
|             4|wAZZ-UWGVHI|Apple Pay is so c...|           34|        2|
|             5|wAZZ-UWGVHI|We’ve been houndi...|            8|        1|
|             6|wAZZ-UWGVHI|We only got Apple...|           29|        2|
|             7|wAZZ-UWGVHI|For now, I need b...|            7|        1|
|             8|wAZZ-UWGVHI|In the United Sta...|            2|        2|
|             9|wAZZ-UWGVHI|In Cambodia, we h...|           28|        1|
|            10|b3x28s61q3c|Wow, you r

In [21]:
# Criação de colunas no DataFrame de video
criacao_tratados_video_df = df_video \
    .withColumn("Interaction", col("Likes") + col("Comments") + col("Views"))

# Exibir os dados transformados
criacao_tratados_video_df.show()

# Exibir o schema do DataFrame transformado
criacao_tratados_video_df.printSchema()

+---+--------------------+-----------+------------+-------+--------+--------+-----------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|      Views|Interaction|
+---+--------------------+-----------+------------+-------+--------+--------+-----------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0|   135612.0|   139691.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|  1758063.0|  1839148.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|  1564007.0|  1631170.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0|   922918.0|   995910.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|  1855644.0|  1957312.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0|   943119.0|   978332.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|  593

In [24]:
# Converter a coluna 'Published At' para o tipo date
df_video = df_video.withColumn("Published At", to_date(col("Published At")))

# Verificar o resultado
df_video.select("Published At").show(5)
df_video.printSchema()

+------------+
|Published At|
+------------+
|  2022-08-23|
|  2022-08-24|
|  2022-08-23|
|  2022-08-23|
|  2022-08-08|
+------------+
only showing top 5 rows

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: double (nullable = true)
 |-- Comments: double (nullable = true)
 |-- Views: double (nullable = true)



In [27]:
# Criar a coluna 'Year' extraindo o ano de 'Published At'
df_video = df_video.withColumn("Year", year(col("Published At")))

# Visualizar as colunas para conferir o resultado
df_video.select("Published At", "Year").show(5)

+------------+----+
|Published At|Year|
+------------+----+
|  2022-08-23|2022|
|  2022-08-24|2022|
|  2022-08-23|2022|
|  2022-08-23|2022|
|  2022-08-08|2022|
+------------+----+
only showing top 5 rows



In [31]:
# Realizar o join entre df_video e df_comentario usando o campo 'Video ID'
df_join_video_comments = df_video.join(df_comentario, on="Video ID", how="inner")

# Visualizar algumas colunas para verificar o resultado
df_join_video_comments.show(5, truncate=False)

+-----------+---+--------------------------------------------------------------------------------------------------+------------+-------+------+--------+--------+----+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+---------+
|Video ID   |_c0|Title                                                                                             |Published At|Keyword|Likes |Comments|Views   |Year|_c0|Comment                                                                                                                                                                              

In [32]:
# Leitura do arquivo USvideos.csv com cabeçalho e esquema inferido
df_us_videos = spark.read.csv("drive/MyDrive/Colab Notebooks/projeto/USvideos.csv", header=True, inferSchema=True)

# Exibir as primeiras 5 linhas para verificar
df_us_videos.show(5)

# Verificar o esquema inferido
df_us_videos.printSchema()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           Fal

In [33]:
# Realizar o join usando o campo 'Title'
df_join_video_usvideos = df_video.join(df_us_videos, on="Title", how="inner")

# Visualizar as 5 primeiras linhas de todas as colunas
df_join_video_usvideos.show(5, truncate=False)

print("df_video columns:", df_video.columns)
print("df_us_videos columns:", df_us_videos.columns)

+---------------------------------------+---+-----------+------------+-------+--------+--------+------------+----+-----------+-------------+-------------+-----------+------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+------+--------+-------------+----------------------------------------------+-----------------+----------------+----------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [34]:
# Contar nulos em todas as colunas do df_video
nulos_df_video = df_video.select([
    count(when(col(c).isNull(), c)).alias(c) for c in df_video.columns
])

# Mostrar o resultado
nulos_df_video.show()

+---+-----+--------+------------+-------+-----+--------+-----+----+
|_c0|Title|Video ID|Published At|Keyword|Likes|Comments|Views|Year|
+---+-----+--------+------------+-------+-----+--------+-----+----+
|  0|    0|       0|           0|      0|    2|       2|    2|   0|
+---+-----+--------+------------+-------+-----+--------+-----+----+



In [35]:
# Removendo coluna
df_video = df_video.drop("_c0")

# Salvando DF parquet
df_video.write.mode("overwrite").option("header", True).parquet("videos-tratados-parquet")

In [36]:
# Remover a coluna '_c0' (se existir)
if "_c0" in df_join_video_comments.columns:
    df_join_video_comments = df_join_video_comments.drop("_c0")

# Salvar em formato Parquet com cabeçalho
df_join_video_comments.write.mode("overwrite") \
    .option("header", True) \
    .parquet("drive/MyDrive/Colab Notebooks/projeto/videos-comments-tratados-parquet")

AnalysisException: [COLUMN_ALREADY_EXISTS] The column `likes` already exists. Consider to choose another name or rename the existing column.