In [None]:
import pandas as pd
from pyspark.sql import functions as F

# Load the Steam game dump from S3 into a Spark DataFrame.
# NOTE(review): `spark` is not defined in this file; presumably the
# SparkSession provided by the notebook environment -- confirm.
steam = spark.read.json('s3://full-stack-bigdata-datasets/Big_Data/Project_Steam/steam_game_output.json')

In [None]:
# Number of rows in the loaded dataset.
steam.count()

In [None]:
# Print the schema of the DataFrame as a tree, including nested fields.
steam.printSchema()

In [None]:
# Preview the first rows (show() defaults to 20 rows, long values truncated).
steam.show()

In [None]:
# Fetch a single row of the `data` column to inspect its raw content.
steam.select('data').take(1)

In [None]:
from pyspark.sql.types import StructType, StructField
from typing import List, Dict, Generator, Union, Callable

def walkSchema(schema: "Union[StructType, StructField]") -> Generator[str, None, None]:
    """Yield the dotted name of every leaf column in a PySpark schema.

    Parameters
    ----------
    schema : StructType | StructField
        Any object whose ``jsonValue()`` returns the JSON-style dictionary
        representation of a Spark schema.

    Yields
    ------
    str
        One dotted name per leaf column, in schema order (for example
        ``"data.appid"``).  Structs are expanded recursively.  An array of
        structs is expanded through its element type (``"items.name"``).
        Arrays of non-struct elements, maps and any other nested non-struct
        type are reported once, under the name of the field that holds them.
    """
    # The signature annotation is quoted so that defining the function does
    # not require the pyspark type names to be resolvable at runtime.

    def _walk(schema_dct: Dict[str, Union[str, list, dict]],
              prefix: str = "") -> Generator[str, None, None]:
        # Internal invariant: prefixes are only ever built by this helper.
        assert isinstance(prefix, str), "prefix should be a string"

        def full_name(name: str) -> str:
            # "name" at the top level, "prefix.name" once we are nested.
            return f"{prefix}.{name}" if prefix else name

        # Field nodes carry a 'name'; bare type nodes (struct/array/map) do not.
        name = schema_dct.get('name', '')
        node_type = schema_dct['type']

        if node_type == 'struct':
            # A struct type node: visit each of its fields under the same prefix.
            assert 'fields' in schema_dct, (
                "It's a StructType, we should have some fields")
            for field in schema_dct['fields']:
                yield from _walk(field, prefix=prefix)
        elif isinstance(node_type, dict):
            # A field whose type is itself nested: descend into that type,
            # extending the prefix with this field's name.
            assert 'fields' not in schema_dct, (
                "We're missing some keys here")
            yield from _walk(node_type, prefix=full_name(name))
        elif name:
            # A field with a simple (string) type: this is a leaf column.
            yield full_name(name)
        elif node_type == 'array' and isinstance(schema_dct.get('elementType'), dict):
            # Array of a nested type: look through the array at its elements so
            # that struct members are still listed.
            yield from _walk(schema_dct['elementType'], prefix=prefix)
        elif prefix:
            # Any other unnamed type node (array of simple values, map, ...):
            # the enclosing field is the leaf.  Previously these were dropped.
            yield prefix

    yield from _walk(schema.jsonValue())

In [None]:
# Walk the schema once and keep the names in a list: a generator would be
# exhausted after the first loop, leaving `col_names` unusable afterwards.
col_names = list(walkSchema(steam.schema))

# Print every leaf column name, one per line.
for col_name in col_names:
    print(col_name)

In [None]:
# Number of distinct values in the top-level `id` column.
steam.select('id').distinct().count()

In [None]:
# Number of distinct values in the nested `data.appid` field.
steam.select('data.appid').distinct().count()

In [None]:
# Count rows where the nested `data.appid` differs from the top-level `id`.
# Rows where either value is null do not pass the filter (the comparison is null).
steam.filter(F.col('data.appid') != F.col('id')).count()

In [None]:
# Keep only the nested `data` struct column, then preview five rows of it.
steam_2 = steam.select(F.col('data'))
steam_2.show(n=5)

In [None]:
# Print the schema of the `data`-only DataFrame.
steam_2.printSchema()

In [None]:
#1 Publishers with the most games - OK
# Count the games of each publisher and show the ten largest counts first.

(
    steam_2
    .groupBy('data.publisher')
    .count()
    .orderBy(F.col('count').desc())
    .show(10)
)

In [None]:
#2 Best rated games
 
# Find the rating in the schema

In [None]:
#3 Which years have the most releases?
#  Release statistics around COVID (2020)

# Count games per raw release date, most frequent dates first.
(
    steam_2
    .groupBy('data.release_date')
    .count()
    .orderBy(F.col('count').desc())
    .show()
)

# TODO: convert "release_date" to a year; it is currently a day/month/year value.


In [None]:
#4 Price distribution.
#  Ratio of discounted vs. non-discounted games

#attempt at converting price to double [seems to work]
#steam_2.withColumn('data.price', col('data.price').cast('double'))

#view the different prices
#steam_2.groupBy('data.price').count().orderBy('count', ascending = False).show(50)


In [None]:
#5 Most represented languages

In [None]:
#6 How many games prohibited 16-18yrs