In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os

In [2]:
# Initialize Spark
# The S3A connector is given static AWS credentials read from the environment.
# os.getenv returns None for an unset variable, which is not a usable
# credential, so stop here with a clear message instead of letting the problem
# surface later as an S3 authentication failure.
if not os.getenv("AWS_ACCESS_KEY_ID") or not os.getenv("AWS_SECRET_ACCESS_KEY"):
    raise RuntimeError(
        "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY must both be set in the "
        "environment before the Spark session is created."
    )

spark = (
    SparkSession.builder
    .appName("RAWG Data Processing")
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [8]:
# Load the games_2024_08 CSV extract from S3.
# The first row supplies the column names and Spark infers each column's type.
df = spark.read.csv(
    "s3a://rawg-pyspark/raw/games/games_2024_08.csv",
    header=True,
    inferSchema=True,
)

df.show()

+------+-------------------------------------+----------+------+-------------+----------+--------+----------------+--------------------+--------------------+----------+
|    id|                                 name|  released|rating|ratings_count|metacritic|playtime|       platforms|                slug|    background_image|rating_top|
+------+-------------------------------------+----------+------+-------------+----------+--------+----------------+--------------------+--------------------+----------+
|988108|                 World Without Reason|2024-08-31|  NULL|         NULL|      NULL|    NULL|PC, macOS, Linux|world-without-reason|https://media.raw...|      NULL|
|988238|                             Slutlike|2024-08-31|  NULL|         NULL|      NULL|    NULL|              PC|            slutlike|https://media.raw...|      NULL|
|988228|                 Canfield Solitair...|2024-08-31|  NULL|         NULL|      NULL|    NULL|              PC|canfield-solitair...|https://media.raw..

In [10]:
# Apply transformations
# Keep a game when at least one signal is present: a metacritic score, recorded
# playtime, or more than 10 ratings. Then derive the release year/month and the
# number of comma-separated platform entries.
# NOTE(review): the recorded run of this cell returned zero rows, and the sample
# rows shown for the input have NULL in all three filter columns — confirm the
# filter is as intended for this extract.
transformed_df = (
    df
    .filter(
        col("metacritic").isNotNull()
        | (col("playtime") > 0)
        | (col("ratings_count") > 10)
    )
    .withColumn("year", year("released"))
    .withColumn("month", month("released"))
    .withColumn(
        "platform_count",
        # size() of a NULL array yields -1 under the legacy sizeOfNull setting
        # (NULL otherwise); report 0 platforms explicitly when the column is NULL.
        when(col("platforms").isNull(), lit(0))
        .otherwise(size(split("platforms", ","))),
    )
)

transformed_df.show()

+---+----+--------+------+-------------+----------+--------+---------+----+----------------+----------+----+-----+--------------+
| id|name|released|rating|ratings_count|metacritic|playtime|platforms|slug|background_image|rating_top|year|month|platform_count|
+---+----+--------+------+-------------+----------+--------+---------+----+----------------+----------+----+-----+--------------+
+---+----+--------+------+-------------+----------+--------+---------+----+----------------+----------+----+-----+--------------+



In [None]:
 Write transformed data back to S3
transformed_df.write.mode("overwrite").parquet("s3a://your-bucket-name/transformed_data/games_2024_08/")