In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os

In [16]:
# Initialize Spark with S3A access configured from the environment.
#
# The credentials are read with os.environ[...] instead of os.getenv(...):
# os.getenv returns None when a variable is unset, and that None would be
# handed straight to the Spark config. Indexing os.environ raises a KeyError
# naming the missing variable right here, in the cell that needs it.
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]

spark = (
    SparkSession.builder
    .appName("RAWG Data Processing")
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key_id)
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

# Bare expression so the notebook displays the session summary.
spark

In [17]:
# Load the raw games CSV from S3. The first line of the file is treated as
# the header row and column types are inferred from the data.
games_csv_path = "s3a://rawg-pyspark/raw/games/games_2024_06_12.csv"
csv_reader = spark.read.options(header="true", inferSchema="true")
df = csv_reader.csv(games_csv_path)

# Print a preview of the loaded rows.
df.show()

+------+--------------------+----------+------+-------------+----------+--------+--------------------+--------------------+--------------------+----------+
|    id|                name|  released|rating|ratings_count|metacritic|playtime|           platforms|                slug|    background_image|rating_top|
+------+--------------------+----------+------+-------------+----------+--------+--------------------+--------------------+--------------------+----------+
| 58806|        Satisfactory|2024-09-11|  4.29|          289|      NULL|    12.0|                  PC|        satisfactory|https://media.raw...|       5.0|
|303576|Vampire: The Masq...|2024-11-30|  3.89|          255|      NULL|   329.0|PC, PlayStation 5...|vampire-the-masqu...|https://media.raw...|       5.0|
| 58386|S.T.A.L.K.E.R. 2:...|2024-11-20|  3.81|          197|      NULL|     3.0| PC, Xbox Series S/X|           stalker-2|https://media.raw...|       5.0|
|616688|         Core Keeper|2024-08-26|   4.0|           63|   

In [18]:
# Apply transformations
#
# Row filter: keep a game when at least one of these holds -- it has a
# metacritic score, a positive playtime, or more than 10 ratings.
has_metacritic = col("metacritic").isNotNull()
has_playtime = col("playtime") > 0
has_enough_ratings = col("ratings_count") > 10

# Number of comma-separated entries in `platforms`.
# The NULL case is guarded explicitly: with Spark's default (non-ANSI)
# settings size() returns -1 for a NULL input, which would be written out as
# if it were a real count. With the guard, a missing `platforms` value gives
# a NULL platform_count instead.
platform_count_expr = when(
    col("platforms").isNotNull(),
    size(split(col("platforms"), ",")),
)

transformed_df = (
    df
    .filter(has_metacritic | has_playtime | has_enough_ratings)
    # Calendar year and month taken from the `released` column.
    .withColumn("year", year("released"))
    .withColumn("month", month("released"))
    .withColumn("platform_count", platform_count_expr)
)

transformed_df.show()

+------+--------------------+----------+------+-------------+----------+--------+--------------------+--------------------+--------------------+----------+----+-----+--------------+
|    id|                name|  released|rating|ratings_count|metacritic|playtime|           platforms|                slug|    background_image|rating_top|year|month|platform_count|
+------+--------------------+----------+------+-------------+----------+--------+--------------------+--------------------+--------------------+----------+----+-----+--------------+
| 58806|        Satisfactory|2024-09-11|  4.29|          289|      NULL|    12.0|                  PC|        satisfactory|https://media.raw...|       5.0|2024|    9|             1|
|303576|Vampire: The Masq...|2024-11-30|  3.89|          255|      NULL|   329.0|PC, PlayStation 5...|vampire-the-masqu...|https://media.raw...|       5.0|2024|   11|             5|
| 58386|S.T.A.L.K.E.R. 2:...|2024-11-20|  3.81|          197|      NULL|     3.0| PC, Xbox

In [20]:
# Write the transformed data back to S3 as Parquet, overwriting whatever
# already exists at the target path.
transformed_output_path = "s3a://rawg-pyspark/transformed_data/games_2024_06_12/"
transformed_df.write.parquet(transformed_output_path, mode="overwrite")