In [0]:
!pip install python-dotenv

In [0]:
import logging
import os
import threading
import time
from datetime import datetime

import pandas as pd
import requests
from dotenv import load_dotenv
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, IntegerType, StringType

In [0]:
# Load variables from a .env file (if any) into the process environment, then
# read the Twitch credentials from it. Either value is None when unset.
load_dotenv()
CLIENT_ID = os.environ.get("CLIENT_ID")
ACCESS_TOKEN = os.environ.get("ACCESS_TOKEN")

In [0]:
# Fetch the current top games from the Twitch Helix API.
def get_twitch_data(timeout=10):
    """Return the Twitch "top games" listing as a pandas DataFrame.

    Sends ``GET https://api.twitch.tv/helix/games/top`` authenticated with the
    module-level ``CLIENT_ID`` and ``ACCESS_TOKEN`` and loads the ``data``
    array of the JSON reply into a DataFrame.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely, which would
            stall a polling loop forever.

    Returns:
        pandas.DataFrame: One row per element of ``data``. The frame is empty
        when the reply has no ``data`` key or when the API answers with an
        HTTP error status (the status is logged as a warning).

    Raises:
        requests.RequestException: On connection failures or timeouts.
        ValueError: If a successful reply's body is not valid JSON.
    """
    headers = {
        "Client-ID": CLIENT_ID,
        "Authorization": f"Bearer {ACCESS_TOKEN}"
    }
    url = 'https://api.twitch.tv/helix/games/top'
    response = requests.get(url, headers=headers, timeout=timeout)

    # Error replies (bad credentials, rate limiting, server errors) carry no
    # usable "data"; report them instead of silently returning nothing or
    # failing while parsing a non-JSON error page.
    if not response.ok:
        logging.warning(
            "Twitch API request to %s failed with HTTP %s; returning no rows",
            url,
            response.status_code,
        )
        return pd.DataFrame()

    data = response.json().get('data', [])
    return pd.DataFrame(data)

# Spark schema applied to the API payload; every field is kept as a string.
schema = (
    StructType()
    .add("id", StringType())
    .add("name", StringType())
    .add("box_art_url", StringType())
    .add("igdb_id", StringType())
)

# Create the raw and the trusted Delta tables up front, with identical columns.
# The trusted path is otherwise only created by the first successful ingestion
# write, so reading it before that moment fails on a fresh run.
for _delta_path in ("/mnt/raw/twitch/streams", "/mnt/trusted/twitch/streams"):
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS delta.`{_delta_path}` (
            id STRING,
            name STRING,
            box_art_url STRING,
            igdb_id STRING,
            ingestion_time STRING
        ) USING DELTA
    """)

# Continuous (streaming-like) ingestion by polling the API.
def stream_twitch_data(stop_event=None, interval_seconds=60):
    """Poll Twitch repeatedly and append each snapshot to the Delta tables.

    Every cycle calls ``get_twitch_data()``. When rows come back they are
    converted to a Spark DataFrame using the module-level ``schema``, stamped
    with an ``ingestion_time`` column (``datetime.now().isoformat()``) and
    appended to ``/mnt/raw/twitch/streams``. The same rows, minus the
    categories listed in ``not_games``, are appended to
    ``/mnt/trusted/twitch/streams``.

    A failing cycle (network error, Spark write error, ...) is logged with its
    traceback and the loop carries on with the next cycle instead of ending
    the thread.

    Args:
        stop_event: Optional ``threading.Event``-like object. When given, the
            loop exits once it is set, and the pause between cycles is
            interrupted as soon as it is set. When ``None`` the loop runs
            until the process ends.
        interval_seconds: Pause between two cycles, in seconds.
    """
    raw_path = "/mnt/raw/twitch/streams"
    trusted_path = "/mnt/trusted/twitch/streams"
    # Twitch categories that are not games; excluded from the trusted table.
    not_games = ["IRL", "Just Chatting", "Music", "Kings League", "Special Events", "Art"]

    while stop_event is None or not stop_event.is_set():
        try:
            pdf = get_twitch_data()
            if not pdf.empty:
                # The schema is applied to the pandas frame by position, so
                # select the expected columns by name first; an extra or
                # reordered API field must not break or scramble the load.
                pdf = pdf[schema.fieldNames()]
                spark_df = spark.createDataFrame(pdf, schema=schema)
                spark_df = spark_df.withColumn("ingestion_time", lit(datetime.now().isoformat()))
                spark_df.write.mode("append").format("delta").save(raw_path)

                filtered_spark_df = spark_df.filter(~spark_df["name"].isin(not_games))
                filtered_spark_df.write.mode("append").format("delta").save(trusted_path)
        except Exception:
            # Keep the poller alive: one bad cycle should not stop ingestion.
            logging.exception(
                "Twitch ingestion cycle failed; retrying in %s seconds", interval_seconds
            )

        if stop_event is None:
            time.sleep(interval_seconds)
        else:
            # Returns early when the event is set, so shutdown is prompt.
            stop_event.wait(interval_seconds)

# Start the ingestion loop in a background thread, unless one is already
# running. threading.enumerate() lists only live threads, so re-running this
# cell does not start a second poller that would append duplicate rows.
_INGEST_THREAD_NAME = "twitch-ingest"
if not any(t.name == _INGEST_THREAD_NAME for t in threading.enumerate()):
    threading.Thread(target=stream_twitch_data, name=_INGEST_THREAD_NAME).start()

# Streaming view over the raw Delta table.
df_stream = spark.readStream.format("delta").load("/mnt/raw/twitch/streams")

# Echo newly appended raw rows to the console sink. The query is named so a
# re-run of the cell can detect it and avoid starting a duplicate query.
_CONSOLE_QUERY_NAME = "twitch_raw_console"
if not any(q.name == _CONSOLE_QUERY_NAME for q in spark.streams.active):
    df_stream.writeStream \
        .format("console") \
        .outputMode("append") \
        .queryName(_CONSOLE_QUERY_NAME) \
        .start()

In [0]:
# Show the current contents of the trusted (games-only) Delta table.
trusted_reader = spark.read.format("delta")
display(trusted_reader.load("/mnt/trusted/twitch/streams"))