In [40]:
# Instalação do PySpark
!pip install pyspark



In [41]:
# Importações necessárias
from pyspark.sql import SparkSession
from pyspark.sql.functions import month, year, col
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler, PCA, Normalizer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [42]:
# Start a Spark session, or reuse the one already active in this kernel
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [44]:
# Mount Google Drive so that files under it can be read by later cells
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
# Read the Parquet file from the mounted Drive into a Spark DataFrame
parquet_path = "/content/drive/MyDrive/Colab Notebooks/projeto/videos-comments-tratados-parquet/videos-comments-tratados.snappy.parquet"
df_video = spark.read.parquet(parquet_path)

In [46]:
# Derive the 'Month' column from the "Published At" column
published_at = col("Published At")
df_video = df_video.withColumn("Month", month(published_at))

In [47]:
# Build the indexer that maps each 'Keyword' value to a numeric 'Keyword Index'
indexer = StringIndexer(inputCol="Keyword", outputCol="Keyword Index")

# Fit on the full DataFrame, apply the mapping, then store the index as an int
indexer_model = indexer.fit(df_video)
df_video = (
    indexer_model
    .transform(df_video)
    .withColumn("Keyword Index", col("Keyword Index").cast("int"))
)

# Preview each keyword next to the index it was assigned
df_video.select("Keyword", "Keyword Index").show()

+-------+-------------+
|Keyword|Keyword Index|
+-------+-------------+
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
|   tech|           17|
+-------+-------------+
only showing top 20 rows



In [48]:
# Target Spark type for every column that goes into the feature vector.
# The dict order is also the order of the entries in the assembled vector.
feature_types = {
    "Likes": "double",
    "Views": "double",
    "Year": "int",
    "Month": "int",
    "Keyword Index": "int",
}
feature_cols = list(feature_types)

# Make sure every feature column is numeric
for feature_name, spark_type in feature_types.items():
    df_video = df_video.withColumn(feature_name, col(feature_name).cast(spark_type))

# Drop rows with a null feature BEFORE assembling: a cast that fails yields
# null, and VectorAssembler (default handleInvalid="error") raises on null
# inputs as soon as an action runs, so removing them afterwards is too late.
df_video = df_video.dropna(subset=feature_cols)

# Build the feature vector
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="Features"
)

# Apply the transformation to the DataFrame
df_video = assembler.transform(df_video)

# Show the input columns next to the assembled vector
df_video.select(*feature_cols, "Features").show(truncate=False)

+-------+---------+----+-----+-------------+-----------------------------------+
|Likes  |Views    |Year|Month|Keyword Index|Features                           |
+-------+---------+----+-----+-------------+-----------------------------------+
|3407.0 |135612.0 |2022|8    |17           |[3407.0,135612.0,2022.0,8.0,17.0]  |
|3407.0 |135612.0 |2022|8    |17           |[3407.0,135612.0,2022.0,8.0,17.0]  |
|3407.0 |135612.0 |2022|8    |17           |[3407.0,135612.0,2022.0,8.0,17.0]  |
|3407.0 |135612.0 |2022|8    |17           |[3407.0,135612.0,2022.0,8.0,17.0]  |
|3407.0 |135612.0 |2022|8    |17           |[3407.0,135612.0,2022.0,8.0,17.0]  |
|3407.0 |135612.0 |2022|8    |17           |[3407.0,135612.0,2022.0,8.0,17.0]  |
|3407.0 |135612.0 |2022|8    |17           |[3407.0,135612.0,2022.0,8.0,17.0]  |
|3407.0 |135612.0 |2022|8    |17           |[3407.0,135612.0,2022.0,8.0,17.0]  |
|3407.0 |135612.0 |2022|8    |17           |[3407.0,135612.0,2022.0,8.0,17.0]  |
|3407.0 |135612.0 |2022|8   

In [49]:
# 1. Discard rows that have a null in any feature column or in the vector itself
colunas_features = ["Likes", "Views", "Year", "Month", "Keyword Index", "Features"]
df_video = df_video.na.drop(subset=colunas_features)

# 2. Configure the normalizer (p=2.0 selects the L2 norm)
normalizer = Normalizer(
    inputCol="Features",
    outputCol="Features Normal",
    p=2.0,
)

# 3. Append the normalized vector as a new column
df_video = normalizer.transform(df_video)

# 4. Show the raw vector next to its normalized counterpart
df_video.select("Features", "Features Normal").show(truncate=False)

+-----------------------------------+--------------------------------------------------------------------------------------------------------+
|Features                           |Features Normal                                                                                         |
+-----------------------------------+--------------------------------------------------------------------------------------------------------+
|[3407.0,135612.0,2022.0,8.0,17.0]  |[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|

In [50]:
# 1. Discard rows whose 'Features' vector is null
df_video = df_video.na.drop(subset=["Features"])

# 2. Configure PCA to project 'Features' onto a single component (k=1)
pca = PCA(
    k=1,
    inputCol="Features",
    outputCol="Features PCA",
)

# 3. Fit on the DataFrame, then append the projected column
pca_model = pca.fit(df_video)
df_video = pca_model.transform(df_video)

# 4. Show each input vector next to its projection
df_video.select("Features", "Features PCA").show(truncate=False)

+-----------------------------------+---------------------+
|Features                           |Features PCA         |
+-----------------------------------+---------------------+
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]  |[-135636.63188203107]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|[-1758667.8498040342]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|[-1758667.8498040342]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|[-1758667.8498040342]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|[-1

In [51]:
# Split the DataFrame with weights 0.8 (train) / 0.2 (test); the fixed seed
# keeps the split repeatable
split_weights = [0.8, 0.2]
df_treino, df_teste = df_video.randomSplit(split_weights, seed=42)

# Report the row count of the full set and of each partition
for rotulo, parte in (("Total:", df_video), ("Treino:", df_treino), ("Teste:", df_teste)):
    print(rotulo, parte.count())

Total: 18409
Treino: 14789
Teste: 3620


In [52]:
# 1. Make sure the label 'Comments' is numeric, and drop rows where it is null:
#    a cast that fails yields null, and a null label is not a usable
#    regression target for either training or scoring.
df_treino = (
    df_treino
    .withColumn("Comments", col("Comments").cast("double"))
    .dropna(subset=["Comments"])
)
df_teste = (
    df_teste
    .withColumn("Comments", col("Comments").cast("double"))
    .dropna(subset=["Comments"])
)

# 2. Create the linear regression model (trained on the normalized feature vector)
lr = LinearRegression(featuresCol="Features Normal", labelCol="Comments", predictionCol="Prediction")

# 3. Train the model on the training partition
modelo_lr = lr.fit(df_treino)

# 4. Apply the model to the test partition
previsoes = modelo_lr.transform(df_teste)

# 5. Evaluate the predictions; the same evaluator is reused, switching metric each time
avaliador = RegressionEvaluator(labelCol="Comments", predictionCol="Prediction")

rmse = avaliador.setMetricName("rmse").evaluate(previsoes)
mae = avaliador.setMetricName("mae").evaluate(previsoes)
r2 = avaliador.setMetricName("r2").evaluate(previsoes)

# 6. Print the metrics.
#    RMSE is the ROOT of the mean squared error, so it is labelled
#    "Raiz do Erro Quadrático Médio" ("Erro Quadrático Médio" alone would be MSE).
print(f"RMSE (Raiz do Erro Quadrático Médio): {rmse:.2f}")
print(f"MAE (Erro Absoluto Médio): {mae:.2f}")
print(f"R² (Coeficiente de Determinação): {r2:.2f}")

RMSE (Erro Quadrático Médio): 43345.23
MAE (Erro Absoluto Médio): 11982.36
R² (Coeficiente de Determinação): 0.01


In [53]:
# Persist the prepared DataFrame as Parquet, replacing any previous output.
# NOTE(review): the path is relative, so it resolves against the current
# working directory rather than the mounted Drive — confirm this is intended.
df_video.write.parquet("videos-preparados-parquet", mode="overwrite")

In [56]:
# Shut down the Spark session now that the prepared data has been written
spark.stop()