In [None]:
#Imports and Setup
from rawg.api import RAWG
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
from datetime import datetime
import os
from time import sleep

In [None]:
# Create the RAWG API client; the key is read from the RAWG_API_KEY environment variable.
rawg_client = RAWG(os.getenv('RAWG_API_KEY'))

# Build (or reuse) a Spark session with an Iceberg catalog named "demo":
# REST catalog at http://rest:8181, S3FileIO pointed at the endpoint
# http://minio:9000, warehouse rooted at s3://rawg-video-game-sales/.
spark = (
    SparkSession.builder
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.demo.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.demo.uri", "http://rest:8181")
    .config("spark.sql.catalog.demo.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.demo.warehouse", "s3://rawg-video-game-sales/")
    .config("spark.sql.catalog.demo.s3.endpoint", "http://minio:9000")
    .getOrCreate()
)

In [None]:
# Extract data
def fetch_games_with_rawg(start_date, end_date, client=None, page_delay=0.5):
    """Collect games released within a date window into a pandas DataFrame.

    Requests pages of 40 games ordered by ``-released`` and keeps going until
    a page has no results or the response explicitly reports that there is
    no further page.

    Args:
        start_date: Start of the release-date window, e.g. ``'2024-01-01'``.
        end_date: End of the release-date window, e.g. ``'2024-06-30'``.
        client: Object exposing ``get_games(dates=, page=, page_size=,
            ordering=)`` that returns a mapping with a ``'results'`` list.
            Defaults to the notebook-level ``rawg_client``.
        page_delay: Seconds to pause between page requests (0 disables the
            pause).

    Returns:
        pandas.DataFrame with one row per game and the columns ``id``,
        ``name``, ``released``, ``rating``, ``ratings_count``, ``metacritic``,
        ``playtime``, ``platforms``, ``genres``, ``tags`` (at most five),
        ``esrb_rating`` and ``background_image``. The list-valued columns hold
        lists of names. The columns are present even when no game was found.

    Raises:
        KeyError: If a response has no ``'results'`` entry.
    """
    columns = [
        'id', 'name', 'released', 'rating', 'ratings_count', 'metacritic',
        'playtime', 'platforms', 'genres', 'tags', 'esrb_rating',
        'background_image',
    ]
    if client is None:
        client = rawg_client

    games_list = []
    page = 1

    while True:
        games = client.get_games(
            dates=f'{start_date},{end_date}',
            page=page,
            page_size=40,
            ordering='-released'
        )

        results = games['results']
        if not results:
            break

        for game in results:
            # `dict.get(key, default)` only falls back when the key is missing;
            # a key that is present with a null value comes back as None, so
            # use `or` to normalise both cases before iterating/dereferencing.
            platforms = game.get('platforms') or []
            genres = game.get('genres') or []
            tags = (game.get('tags') or [])[:5]
            esrb = game.get('esrb_rating') or {}

            games_list.append({
                'id': game.get('id'),
                'name': game.get('name'),
                'released': game.get('released'),
                'rating': game.get('rating'),
                'ratings_count': game.get('ratings_count'),
                'metacritic': game.get('metacritic'),
                'playtime': game.get('playtime'),
                'platforms': [p['platform']['name'] for p in platforms],
                'genres': [g['name'] for g in genres],
                'tags': [t['name'] for t in tags],
                'esrb_rating': esrb.get('name'),
                'background_image': game.get('background_image')
            })

        print(f"Processed page {page}, found {len(results)} games")

        # Stop without issuing an extra request when the response explicitly
        # carries an empty `next` link. Responses without a `next` key keep
        # the original "loop until an empty page" behaviour.
        if isinstance(games, dict) and 'next' in games and not games['next']:
            break

        page += 1
        if page_delay:
            sleep(page_delay)

    # Passing `columns` keeps the schema stable for an empty result, so
    # downstream column access does not fail with KeyError.
    return pd.DataFrame(games_list, columns=columns)

# Fetch the games for the window 2024-01-01 .. 2024-06-30
df_games = fetch_games_with_rawg('2024-01-01', '2024-06-30')

# Flatten each list-valued column into a single comma-separated string
for list_column in ('platforms', 'genres', 'tags'):
    df_games[list_column] = df_games[list_column].apply(', '.join)

# Preview the first rows of the pandas DataFrame
print("Sample of raw data:")
display(df_games.head())