In [9]:
from google.cloud import storage
import pyarrow.parquet as pq
import pyarrow as pa

# Location of the raw Steam games dataset inside the project's data-lake bucket.
GCP_PROJECT_ID = 'data-bootcamp-efreitas'
GCP_GCS_BUCKET = "dtc_data_lake_data-bootcamp-efreitas"
BLOB_NAME = "raw/steam_games_dataset/games.parquet"

# Bind the client to the declared project explicitly rather than relying on
# whichever default project the ambient credentials resolve to.
storage_client = storage.Client(project=GCP_PROJECT_ID)
bucket = storage_client.bucket(GCP_GCS_BUCKET)
blob = bucket.blob(BLOB_NAME)

# Copy the object into the working directory as games.parquet.
blob.download_to_filename("games.parquet")


In [2]:
import pyspark
from pyspark.sql import SparkSession

# Build the Spark session against the local[*] master, or pick up the session
# that already exists in this kernel.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [46]:
from pyspark.sql import types
# Schema for the first ten columns of the games dataset. Field names and types
# mirror what printSchema() reports for games.parquet, so the schema can be
# applied to that file without a type or column-name mismatch.
games_schema = types.StructType([
    types.StructField("AppID", types.LongType(), True),
    types.StructField("Name", types.StringType(), True),
    # Stored as text such as "Oct 21, 2008"; parse it with
    # to_date(..., "MMM d, yyyy") rather than declaring it as a date here.
    types.StructField("Release date", types.StringType(), True),
    types.StructField("Estimated owners", types.StringType(), True),
    types.StructField("Peak CCU", types.LongType(), True),
    types.StructField("Required age", types.LongType(), True),
    types.StructField("Price", types.DoubleType(), True),
    types.StructField("DLC count", types.LongType(), True),
    types.StructField("About the game", types.StringType(), True),
    types.StructField("Supported languages", types.StringType(), True),
])

In [54]:
# Parquet files carry their own column names and types, so no reader options
# are needed (the "header" flag only applies to CSV sources).
games = spark.read.parquet('./games.parquet')

In [95]:
from pyspark.sql.functions import *

In [96]:
# Expose the DataFrame to Spark SQL under the name `steam_data`.
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable, and re-running the cell simply replaces the view.
games.createOrReplaceTempView('steam_data')

In [97]:
# Display the name of every column in the loaded games DataFrame.
games.columns

['AppID',
 'Name',
 'Release date',
 'Estimated owners',
 'Peak CCU',
 'Required age',
 'Price',
 'DLC count',
 'About the game',
 'Supported languages',
 'Full audio languages',
 'Reviews',
 'Header image',
 'Website',
 'Support url',
 'Support email',
 'Windows',
 'Mac',
 'Linux',
 'Metacritic score',
 'Metacritic url',
 'User score',
 'Positive',
 'Negative',
 'Score rank',
 'Achievements',
 'Recommendations',
 'Notes',
 'Average playtime forever',
 'Average playtime two weeks',
 'Median playtime forever',
 'Median playtime two weeks',
 'Developers',
 'Publishers',
 'Categories',
 'Genres',
 'Tags',
 'Screenshots',
 'Movies']

In [98]:
# Print each column's data type and nullability for the games DataFrame.
games.printSchema()

root
 |-- AppID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Release date: string (nullable = true)
 |-- Estimated owners: string (nullable = true)
 |-- Peak CCU: long (nullable = true)
 |-- Required age: long (nullable = true)
 |-- Price: double (nullable = true)
 |-- DLC count: long (nullable = true)
 |-- About the game: string (nullable = true)
 |-- Supported languages: string (nullable = true)
 |-- Full audio languages: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Header image: string (nullable = true)
 |-- Website: string (nullable = true)
 |-- Support url: string (nullable = true)
 |-- Support email: string (nullable = true)
 |-- Windows: boolean (nullable = true)
 |-- Mac: boolean (nullable = true)
 |-- Linux: boolean (nullable = true)
 |-- Metacritic score: long (nullable = true)
 |-- Metacritic url: string (nullable = true)
 |-- User score: long (nullable = true)
 |-- Positive: long (nullable = true)
 |-- Negative: long (nullable = t

In [124]:
# Check the date pattern by showing the raw text beside its parsed value.
raw_release_date = col("Release date")
parsed_release_date = to_date(raw_release_date, "MMM d, yyyy").alias("to_date")
games.select(raw_release_date, parsed_release_date).show()

+------------+----------+
|Release date|   to_date|
+------------+----------+
|Oct 21, 2008|2008-10-21|
|Oct 12, 2017|2017-10-12|
|Nov 17, 2021|2021-11-17|
|Jul 23, 2020|2020-07-23|
| Feb 3, 2020|2020-02-03|
|Feb 26, 2021|2021-02-26|
| Jan 9, 2022|2022-01-09|
| May 5, 2022|2022-05-05|
| Apr 2, 2020|2020-04-02|
|Nov 11, 2014|2014-11-11|
| Oct 2, 2019|2019-10-02|
| Jul 1, 2016|2016-07-01|
|Aug 27, 2021|2021-08-27|
| Apr 6, 2018|2018-04-06|
|Nov 25, 2020|2020-11-25|
|Nov 17, 2010|2010-11-17|
| Nov 6, 2020|2020-11-06|
|Feb 20, 2015|2015-02-20|
|Jun 20, 2019|2019-06-20|
|Jul 24, 2020|2020-07-24|
+------------+----------+
only showing top 20 rows



In [125]:
# Preview a few columns through Spark SQL, converting the release-date text
# with the 'MMM d, yyyy' pattern along the way.
preview_query = """
SELECT
    Name,
    to_date(`Release date`, 'MMM d, yyyy'),
    `Estimated owners`,
    `Peak CCU`,
    Price
FROM
    steam_data
"""
spark.sql(preview_query).show()

+--------------------+----------------------------------+----------------+--------+-----+
|                Name|to_date(Release date, MMM d, yyyy)|Estimated owners|Peak CCU|Price|
+--------------------+----------------------------------+----------------+--------+-----+
|    Galactic Bowling|                        2008-10-21|       0 - 20000|       0|19.99|
|        Train Bandit|                        2017-10-12|       0 - 20000|       0| 0.99|
|        Jolt Project|                        2021-11-17|       0 - 20000|       0| 4.99|
|            Henosis™|                        2020-07-23|       0 - 20000|       0| 5.99|
|Two Weeks in Pain...|                        2020-02-03|       0 - 20000|       0|  0.0|
|      Wartune Reborn|                        2021-02-26|  50000 - 100000|      68|  0.0|
|           TD Worlds|                        2022-01-09|       0 - 20000|       3|10.99|
|Legend of Rome - ...|                        2022-05-05|       0 - 20000|       2| 9.99|
|MazM: Jek

In [155]:
# Yearly summary of paid games (Price > 0): number of releases, total /
# average / max / min price and average Metacritic score, newest year first.
yearly_aggregation_query = """
SELECT
    year(to_date(`Release date`, "MMM d, yyyy")) as year,
    COUNT(NAME) as releases,
    ROUND(SUM(Price),2) as total_price,
    ROUND(AVG(Price),2) as avg_price,
    ROUND(MAX(Price), 2) as max_price,
    ROUND(MIN(Price), 2) as min_price,
    ROUND(AVG(`Metacritic score`), 2) as avg_score
FROM
    steam_data
WHERE Price > 0
GROUP BY year(to_date(`Release date`, "MMM d, yyyy"))
ORDER BY year(to_date(`Release date`, "MMM d, yyyy")) DESC;
"""
games_aggregation = spark.sql(yearly_aggregation_query)

In [166]:
# Spark writes a directory of part files, one per partition. The aggregation is
# tiny and is later uploaded as one file taken from this directory, so collapse
# it to a single partition to guarantee exactly one part file holds every row.
games_aggregation.coalesce(1).write.parquet('games_aggregation.parquet', mode='overwrite')

In [174]:
import glob
# The Spark output path is a directory; locate the part file inside it.
# Sorting makes the result deterministic, and insisting on exactly one match
# prevents silently picking an arbitrary fragment (or failing with a bare
# IndexError) when the directory holds several part files or none.
part_files = sorted(glob.glob('./games_aggregation.parquet/' + '*.parquet'))
if len(part_files) != 1:
    raise RuntimeError(
        "Expected exactly one parquet part file in ./games_aggregation.parquet/, "
        f"found {len(part_files)}: {part_files}"
    )
source_file_name = part_files[0]
print(source_file_name)

./games_aggregation.parquet/part-00000-9d0b6f1c-f1b8-42ba-8714-0c42f8dd98c9-c000.snappy.parquet


In [176]:
from google.cloud import storage
import pyarrow.parquet as pq
import pyarrow as pa

# Destination of the aggregated result inside the project's data-lake bucket.
GCP_PROJECT_ID = 'data-bootcamp-efreitas'
GCP_GCS_BUCKET = "dtc_data_lake_data-bootcamp-efreitas"
DESTINATION_BLOB_NAME = "processed/steam_games_dataset/games_aggregation.parquet"

# Bind the client to the declared project explicitly rather than relying on
# whichever default project the ambient credentials resolve to.
storage_client = storage.Client(project=GCP_PROJECT_ID)
bucket = storage_client.bucket(GCP_GCS_BUCKET)
blob = bucket.blob(DESTINATION_BLOB_NAME)

# Upload the local part file located earlier (source_file_name) as one object.
blob.upload_from_filename(source_file_name)


In [4]:
from google.cloud import bigquery

client = bigquery.Client()
STEAM_ANALYSIS = 'steam_analysis'
TABLE_AGGREGATION = 'games_aggregation'
# Full gs:// URI of the aggregated parquet object the external table reads.
BUCKET = 'gs://dtc_data_lake_data-bootcamp-efreitas/processed/steam_games_dataset/games_aggregation.parquet'

# Build the reference directly; Client.dataset() is deprecated. Using the
# client's own project keeps the same target the deprecated helper resolved to.
dataset_ref = bigquery.DatasetReference(client.project, STEAM_ANALYSIS)
table_ref = bigquery.TableReference(dataset_ref, TABLE_AGGREGATION)
table = bigquery.Table(table_ref)

# Describe the table as an external table backed by the parquet object in GCS.
external_config = bigquery.ExternalConfig('PARQUET')
source_uris = [BUCKET]
external_config.source_uris = source_uris
external_config.autodetect = True
table.external_data_configuration = external_config

# exists_ok=True makes the cell re-runnable: an already existing table is
# returned as-is instead of raising Conflict. Note that an existing table's
# definition is NOT updated by this call.
client.create_table(table, exists_ok=True)

Table(TableReference(DatasetReference('data-bootcamp-efreitas', 'steam_analysis'), 'games_aggregation'))