In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os

In [12]:
# Initialize Spark
def _create_spark_session():
    """Build (or reuse) the SparkSession used by this notebook.

    The S3A endpoint and filesystem implementation are always configured.
    The access key and secret key are read from the AWS_ACCESS_KEY_ID and
    AWS_SECRET_ACCESS_KEY environment variables and are only forwarded to
    Spark when they are set to a non-empty value.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    builder = (
        SparkSession.builder
        .appName("RAWG Data Processing")
        .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    )

    # os.getenv returns None for an unset variable. Passing that through would
    # hand Spark a meaningless credential value, so missing variables are
    # skipped and credential resolution is left to the S3A connector's defaults.
    credential_sources = (
        ("spark.hadoop.fs.s3a.access.key", "AWS_ACCESS_KEY_ID"),
        ("spark.hadoop.fs.s3a.secret.key", "AWS_SECRET_ACCESS_KEY"),
    )
    for conf_key, env_name in credential_sources:
        env_value = os.getenv(env_name)
        if env_value:
            builder = builder.config(conf_key, env_value)

    return builder.getOrCreate()


spark = _create_spark_session()

spark

In [13]:
# Read data from S3
# The CSV's first row is treated as the header and Spark infers the column types.
df = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("s3a://rawg-pyspark/raw/games/games_2024_08.csv")
)

df.show()

+------+------------------------------------+----------+------+-------------+----------+--------+----------------+--------------------+--------------------+----------+
|    id|                                name|  released|rating|ratings_count|metacritic|playtime|       platforms|                slug|    background_image|rating_top|
+------+------------------------------------+----------+------+-------------+----------+--------+----------------+--------------------+--------------------+----------+
|988742|                3D PUZZLE - Winte...|2024-09-11|  NULL|         NULL|      NULL|    NULL|PC, macOS, Linux|3d-puzzle-winter-...|https://media.raw...|      NULL|
|988756|                Doner Master Simu...|2024-09-11|  NULL|         NULL|      NULL|    NULL|              PC|doner-master-simu...|https://media.raw...|      NULL|
|988761|                3D PUZZLE - Pizza...|2024-09-11|  NULL|         NULL|      NULL|    NULL|PC, macOS, Linux|3d-puzzle-pizza-s...|https://media.raw...|    

In [14]:
# Apply transformations
transformed_df = (
    df
    # Keep a game when it has a Metacritic score, any recorded playtime,
    # or more than 10 ratings.
    .filter(
        col("metacritic").isNotNull()
        | (col("playtime") > 0)
        | (col("ratings_count") > 10)
    )
    # Calendar year and month taken from the release date.
    .withColumn("year", year("released"))
    .withColumn("month", month("released"))
    # Number of comma-separated entries in `platforms`. A NULL `platforms`
    # is counted as 0 explicitly: size() of a NULL array otherwise yields
    # -1 or NULL depending on the Spark SQL settings in effect.
    .withColumn(
        "platform_count",
        when(col("platforms").isNull(), 0)
        .otherwise(size(split("platforms", ","))),
    )
)

transformed_df.show()

+------+--------------------+----------+------+-------------+----------+--------+---------+--------------------+--------------------+----------+----+-----+--------------+
|    id|                name|  released|rating|ratings_count|metacritic|playtime|platforms|                slug|    background_image|rating_top|year|month|platform_count|
+------+--------------------+----------+------+-------------+----------+--------+---------+--------------------+--------------------+----------+----+-----+--------------+
|988776|MARVEL vs. CAPCOM...|2024-09-11|  NULL|         NULL|      NULL|     5.0|       PC|marvel-vs-capcom-...|https://media.raw...|      NULL|2024|    9|             1|
| 58806|        Satisfactory|2024-09-11|  4.29|        289.0|      NULL|    12.0|       PC|        satisfactory|https://media.raw...|       5.0|2024|    9|             1|
+------+--------------------+----------+------+-------------+----------+--------+---------+--------------------+--------------------+----------+-

In [None]:
 Write transformed data back to S3
transformed_df.write.mode("overwrite").parquet("s3a://your-bucket-name/transformed_data/games_2024_08/")