In [1]:
import asyncio
import rawg
import pandas as pd
import boto3
import os

In [2]:
# Build the S3 client from AWS settings read out of the environment
# (presumably populated from the project's .env file -- confirm how it is loaded).
# os.getenv returns None for any variable that is not set.
aws_settings = {
    'aws_access_key_id': os.getenv('AWS_ACCESS_KEY_ID'),
    'aws_secret_access_key': os.getenv('AWS_SECRET_ACCESS_KEY'),
    'region_name': os.getenv('AWS_REGION'),
}
s3 = boto3.client('s3', **aws_settings)

In [3]:
# Create the project bucket and its folder prefixes
bucket_name = 'rawg-pyspark'
aws_region = os.getenv('AWS_REGION')

# Only send a LocationConstraint for a configured, non-default region:
# S3 rejects an explicit 'us-east-1' constraint, and an unset AWS_REGION
# would otherwise put None into the request.
create_bucket_kwargs = {'Bucket': bucket_name}
if aws_region and aws_region != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {
        'LocationConstraint': aws_region
    }

try:
    s3.create_bucket(**create_bucket_kwargs)
    print(f"Successfully created bucket: {bucket_name}")

    # Set up bucket folders/prefixes (zero-byte objects whose keys end in '/')
    folders = [
        'raw/games/',      # Raw data from RAWG API
        'processed/games/' # Transformed data
    ]

    for folder in folders:
        s3.put_object(Bucket=bucket_name, Key=folder)
        print(f"Created folder: {folder}")

except s3.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the cell against an existing bucket is expected, not an error
    print(f"Bucket {bucket_name} already exists")
except Exception as e:
    # Best-effort setup: report the problem instead of stopping the notebook
    print(f"Error: {e}")

Bucket rawg-pyspark already exists


In [8]:
# Per the rawg PyPI documentation, async is one way to make multiple API calls efficiently.

# We grab the top 200 rated games per time period to manage data size / storage;
# in PROD, we would consider all data for analysis.
async def fetch_games_for_date_range(api, start_date, end_date, page_size=40, max_pages=5):
    """Fetch up to ``max_pages`` pages of games for one release-date window.

    Args:
        api: Object with an awaitable ``games_list(dates=, ordering=, page_size=, page=)``
            method whose result exposes a ``results`` sequence of game objects.
        start_date: First date of the window, formatted into the ``dates`` filter.
        end_date: Last date of the window, formatted into the ``dates`` filter.
        page_size: Number of games requested per page.
        max_pages: Maximum number of pages to request.

    Returns:
        A list with one flat dict per game. Paging stops early when a page has
        no results. If any request or conversion raises, the error is printed
        and an empty list is returned.
    """
    def _number_or_none(value, cast):
        # Only a missing value becomes None; a genuine 0 (e.g. zero ratings)
        # must survive instead of being mistaken for "no data".
        if value is None or value == '':
            return None
        return cast(value)

    all_games = []

    try:
        for page in range(1, max_pages + 1):
            games = await api.games_list(
                dates=f"{start_date},{end_date}",
                ordering='-ratings',
                page_size=page_size,
                page=page
            )

            if not games.results:
                break

            for game in games.results:
                if game.platforms:
                    platforms = [p['platform']['name'] for p in game.to_dict()['platforms']]
                else:
                    platforms = []

                all_games.append({
                    'id': game.id,
                    'name': game.name,
                    'released': game.released,
                    'rating': _number_or_none(game.rating, float),
                    'ratings_count': _number_or_none(game.ratings_count, int),
                    'metacritic': _number_or_none(game.metacritic, int),
                    'playtime': _number_or_none(game.playtime, int),
                    # Flattened to one comma-separated string so the row stays CSV-friendly
                    'platforms': ', '.join(platforms),
                    'slug': game.slug,
                    'background_image': game.background_image,
                    'rating_top': _number_or_none(game.rating_top, int)
                })

        return all_games

    except Exception as e:
        # Best-effort: one failing window must not abort the other concurrent fetches
        print(f"Error fetching data for {start_date} to {end_date}: {e}")
        return []

In [9]:
# Define the function to fetch all games 
# passing date ranges for multiple API requests we are planning to make
async def fetch_all_games(date_ranges=None):
    """Fetch games for several date ranges concurrently and combine them.

    Args:
        date_ranges: Optional iterable of ``(start_date, end_date)`` string
            pairs. Defaults to the four calendar quarters of 2024.

    Returns:
        A pandas DataFrame with one row per game dict returned by
        ``fetch_games_for_date_range`` across all ranges, in range order.
    """
    if date_ranges is None:
        # Default windows of interest: the four quarters of 2024
        date_ranges = [
            ('2024-01-01', '2024-03-31'),
            ('2024-04-01', '2024-06-30'),
            ('2024-07-01', '2024-09-30'),
            ('2024-10-01', '2024-12-31')
        ]

    api_config = rawg.Configuration(
        api_key={'key': os.getenv('RAWG_API_KEY')}
    )

    async with rawg.ApiClient(api_config) as api_client:
        api = rawg.GamesApi(api_client)

        # One coroutine per date range, all sharing the same API client
        tasks = [fetch_games_for_date_range(api, start, end)
                 for start, end in date_ranges]

        # Run the requests concurrently; gather keeps results in task order
        results = await asyncio.gather(*tasks)

    # Flatten the per-range lists into a single list of game dicts
    all_games = [game for games in results for game in games]
    return pd.DataFrame(all_games)

In [10]:
# actually fetch the data now
df_games = await fetch_all_games()
print(f"Total games fetched: {len(df_games)}")

Total games fetched: 800


In [13]:
# Serialize the games to CSV in memory and upload it to S3
csv_buffer = df_games.to_csv(index=False)

# Reuse bucket_name from the bucket-setup cell so the target bucket is defined
# in one place, and send explicit UTF-8 bytes as the object body.
response = s3.put_object(
    Bucket=bucket_name,
    Key='raw/games/games_2024.csv',
    Body=csv_buffer.encode('utf-8')
)

# Print only necessary information
print(f"Upload status: {response['ResponseMetadata']['HTTPStatusCode']}")

Upload status: 200
