In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *

In [0]:
df= spark.read.json("/FileStore/export/steam_clean")


# Analyse des genres

## 7. Quels sont les genres les plus représentés ?

In [0]:
df.select('genre').show(truncate=False)

+--------------------------------------------------------------+
|genre                                                         |
+--------------------------------------------------------------+
|Action                                                        |
|Action, Adventure, Indie                                      |
|Adventure, Indie, RPG, Strategy                               |
|Action, Indie, Simulation, Strategy                           |
|Action, Casual, Indie, Simulation                             |
|Action, Adventure, Indie, RPG                                 |
|Adventure, Indie, RPG, Strategy                               |
|Action, Adventure, Casual, Free to Play, Massively Multiplayer|
|Casual, Indie                                                 |
|Indie, RPG                                                    |
|Action, RPG, Strategy                                         |
|Action, Early Access                                          |
|Indie                   

In [0]:
# Il faut 'explosed' la colonne car les genres sont stockées en chaine de caractére séparé par une virgule
df_exploded = df.withColumn("genre_exploded", explode(split(col("genre"), ",\\s*")))

In [0]:
df_genre = df_exploded\
    .groupBy("genre_exploded") \
    .count() \
    .orderBy(col("count").desc())

df_genre.show(5)

+--------------+-----+
|genre_exploded|count|
+--------------+-----+
|         Indie|39681|
|        Action|23759|
|        Casual|22086|
|     Adventure|21431|
|      Strategy|10895|
+--------------+-----+
only showing top 5 rows



### Les genres les plus représentés sont Indie puis Action et Casual.

##  8. Existe-t-il des genres qui ont un meilleur ratio de critiques positives/négatives ?

In [0]:
df.select('genre', 'positive', 'negative').show(5)

+--------------------+--------+--------+
|               genre|positive|negative|
+--------------------+--------+--------+
|              Action|  201215|    5199|
|Action, Adventure...|      27|       5|
|Adventure, Indie,...|    4032|     646|
|Action, Indie, Si...|    1575|     115|
|Action, Casual, I...|       0|       1|
+--------------------+--------+--------+
only showing top 5 rows



In [0]:
# df_exploded correspond au genre du dessus exploded
# calcul du ratio

df_ratio = df_exploded.groupBy("genre_exploded").agg(
    F.sum("positive").alias("sum_positive"),
    F.sum("negative").alias("sum_negative")
).withColumn(
    "genre_ratio",
    col("sum_positive") / (col("sum_positive") + col("sum_negative"))
)


In [0]:
df_ratio.orderBy("genre_ratio", ascending=False).show(5)


+--------------------+------------+------------+------------------+
|      genre_exploded|sum_positive|sum_negative|       genre_ratio|
+--------------------+------------+------------+------------------+
|       Photo Editing|      577751|       13745|0.9767623111567957|
|Animation & Modeling|      690765|       26392|0.9631991321286691|
|Design & Illustra...|      674057|       27007|0.9614771261967524|
|           Utilities|      739335|       43503|0.9444291156024618|
|    Game Development|       27461|        3274|0.8934764925980153|
+--------------------+------------+------------+------------------+
only showing top 5 rows



### Les genres avec un meilleur ratio de critique sont Photo Editing puis Animation & Modeling suivi de Design & Illustration.

## 9. Certains éditeurs ont-ils des genres favoris ?

In [0]:
df.select('publisher', 'genre').show(5)

+--------------------+--------------------+
|           publisher|               genre|
+--------------------+--------------------+
|               Valve|              Action|
|PsychoFlux Entert...|Action, Adventure...|
|Team17, NEXT Studios|Adventure, Indie,...|
| Vertigo Gaming Inc.|Action, Indie, Si...|
|       DoubleC Games|Action, Casual, I...|
+--------------------+--------------------+
only showing top 5 rows



In [0]:
df_editeur = df_exploded.groupBy("publisher","genre_exploded").count().orderBy(desc("count"))
df_editeur.show(10)


+--------------------+--------------+-----+
|           publisher|genre_exploded|count|
+--------------------+--------------+-----+
|      Big Fish Games|        Casual|  418|
|      Big Fish Games|     Adventure|  392|
|              8floor|        Casual|  202|
|     Choice of Games|           RPG|  139|
|     Choice of Games|         Indie|  136|
|            HH-Games|        Casual|  132|
|        Laush Studio|         Indie|  124|
|     Choice of Games|     Adventure|  112|
|                    |         Indie|  106|
|Alawar Entertainment|        Casual|  105|
+--------------------+--------------+-----+
only showing top 10 rows



### L'éditeur Big Fish Games a principalement des jeux de genre Casual et Adventure. 

## 10. Quels sont les genres les plus lucratifs ?

In [0]:
df.select("genre", "price_int").show(5)

+--------------------+---------+
|               genre|price_int|
+--------------------+---------+
|              Action|      999|
|Action, Adventure...|      999|
|Adventure, Indie,...|      599|
|Action, Indie, Si...|     1999|
|Action, Casual, I...|      199|
+--------------------+---------+
only showing top 5 rows



In [0]:
df_price = df_exploded.groupBy('genre_exploded').agg(
    F.sum('price_int').alias('sum_price')   
)
df_price.orderBy(col("sum_price").desc()).show(10, truncate=False)

+--------------+---------+
|genre_exploded|sum_price|
+--------------+---------+
|Indie         |26063036 |
|Action        |18358769 |
|Adventure     |17158179 |
|Casual        |12383583 |
|Simulation    |9851654  |
|Strategy      |9157201  |
|RPG           |8621295  |
|Early Access  |5375795  |
|Sports        |2385529  |
|Racing        |1771645  |
+--------------+---------+
only showing top 10 rows



### Le genre le plus lucratif semble être Indie.