In [10]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import *
from pyspark.ml.recommendation import ALS

In [18]:
# Explicit schema for the review CSV: two string identifiers, the review text,
# and two integer columns. Every field is declared nullable.
schema = (
    StructType()
    .add('app_id', StringType(), True)
    .add('app_name', StringType(), True)
    .add('review_text', StringType(), True)
    .add('review_score', IntegerType(), True)
    .add('review_votes', IntegerType(), True)
)
# The first line of the file is a header row; types come from `schema`, not inference.
df = spark.read.csv('dataset.csv', schema=schema, header=True)

In [19]:
# Preview the first five rows of the loaded reviews.
df.show(n=5)

+------+--------------+--------------------+------------+------------+
|app_id|      app_name|         review_text|review_score|review_votes|
+------+--------------+--------------------+------------+------------+
|    10|Counter-Strike|     Ruined my life.|           1|           0|
|    10|Counter-Strike|This will be more...|           1|           1|
|    10|Counter-Strike|This game saved m...|           1|           0|
|    10|Counter-Strike|• Do you like ori...|           1|           0|
|    10|Counter-Strike|        Easy to l...|           1|           1|
+------+--------------+--------------------+------------+------------+
only showing top 5 rows

23/05/14 04:47:38 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 972722 ms exceeds timeout 120000 ms
23/05/14 04:47:38 WARN SparkContext: Killing executors is not supported by current scheduler.


In [19]:
# Review count per app name (rows with a null app_name are excluded),
# sorted so the most-reviewed apps come first.
popular_apps = (
    df.filter(col("app_name").isNotNull())
    .groupBy("app_name")
    .count()
    .sort(col("count").desc())
)

In [20]:
# Ten most-reviewed apps, with app names printed in full.
popular_apps.show(10, truncate=False)



+------------------+-----+
|app_name          |count|
+------------------+-----+
|PAYDAY 2          |88973|
|DayZ              |88850|
|Terraria          |84828|
|Rust              |77037|
|Dota 2            |73541|
|Rocket League     |54227|
|Undertale         |51918|
|Left 4 Dead 2     |50980|
|Warframe          |48229|
|Grand Theft Auto V|42374|
+------------------+-----+
only showing top 10 rows



                                                                                

In [27]:
# Persist the per-app review counts as JSON, replacing any previous export.
popular_apps.write.mode("overwrite").json("../../data/apps_review_count")

                                                                                

In [21]:
# Total review_score per app name (rows with a null app_name are excluded),
# sorted so the highest totals come first. total_score is a plain sum, so it
# grows with the number of reviews as well as with how positive they are.
highrated_apps = (
    df.filter(col("app_name").isNotNull())
    .groupBy("app_name")
    .agg({"review_score": "sum"})
    .withColumnRenamed('sum(review_score)', 'total_score')
    .sort(col("total_score").desc())
)

In [22]:
# Ten apps with the highest summed review score, names printed in full.
highrated_apps.show(10, truncate=False)



+-------------+-----------+
|app_name     |total_score|
+-------------+-----------+
|Terraria     |79872      |
|Dota 2       |52302      |
|Rocket League|48012      |
|Undertale    |47782      |
|Rust         |45027      |
|Left 4 Dead 2|43384      |
|Warframe     |39265      |
|Portal 2     |37972      |
|PAYDAY 2     |34561      |
|Starbound    |31126      |
+-------------+-----------+
only showing top 10 rows



                                                                                

In [26]:
# Persist the per-app score totals as JSON, replacing any previous export.
highrated_apps.write.mode("overwrite").json("../../data/highrated_apps")

                                                                                

In [14]:
import re
from pyspark.sql.functions import udf

def removePunctuation(text):
    """Normalise a review string for word counting.

    Lower-cases the text, deletes every character that is not an ASCII
    letter, digit or whitespace, collapses each whitespace run to a single
    space and strips leading/trailing whitespace.

    Args:
        text: The raw string, or None.

    Returns:
        The cleaned string (possibly empty), or None when `text` is None so
        that a null input stays null instead of raising AttributeError.
    """
    if text is None:
        return None
    text = text.lower()
    # Anything outside [a-zA-Z0-9] and whitespace is dropped, which also
    # removes all non-ASCII letters.
    text = re.sub(r'[^a-zA-Z0-9\s]+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Wrap the cleaner as a Spark UDF; the return type is spelled out (string is the default).
removePunctuation_udf = udf(removePunctuation, StringType())

In [15]:
# For every review with both an app name and text, clean the text and split it
# on single spaces into a list of tokens. Only the app identifiers and the
# token list are kept.
review_texts = (
    df.filter(col("app_name").isNotNull() & col("review_text").isNotNull())
    .select(
        "app_id",
        "app_name",
        split(removePunctuation_udf("review_text"), ' ').alias("words_list"),
    )
)

In [16]:
from pyspark.ml.feature import StopWordsRemover

# Remove stop words (StopWordsRemover's default word list) from each token list.
remover = StopWordsRemover(inputCol="words_list", outputCol="words_list_out")

# The transformed frame gets its own name instead of overwriting `review_texts`:
# rebinding made this cell fail on a second run, because the output column
# "words_list_out" would already exist in the input.
review_texts_filtered = remover.transform(review_texts)

# One row per remaining word. A review whose cleaned text is empty (only
# punctuation or non-ASCII characters) splits to [""], so empty tokens are
# dropped here; otherwise "" is counted as a word downstream.
review_texts_out = (
    review_texts_filtered
    .withColumn('word', explode(col('words_list_out')))
    .select("app_id", "app_name", "word")
    .where(col("word") != "")
)

In [17]:
# Preview the exploded (app, word) rows; 20 rows is the default.
review_texts_out.show(n=20)

+------+--------------+-------------+
|app_id|      app_name|         word|
+------+--------------+-------------+
|    10|Counter-Strike|       ruined|
|    10|Counter-Strike|         life|
|    10|Counter-Strike|   experience|
|    10|Counter-Strike|         game|
|    10|Counter-Strike|         type|
|    10|Counter-Strike|       review|
|    10|Counter-Strike|       saying|
|    10|Counter-Strike|       things|
|    10|Counter-Strike|         like|
|    10|Counter-Strike|        great|
|    10|Counter-Strike|     gameplay|
|    10|Counter-Strike|         suit|
|    10|Counter-Strike|    something|
|    10|Counter-Strike|          ive|
|    10|Counter-Strike|  experienced|
|    10|Counter-Strike|counterstrike|
|    10|Counter-Strike|           go|
|    10|Counter-Strike|     remember|
|    10|Counter-Strike|         back|
|    10|Counter-Strike|         2002|
+------+--------------+-------------+
only showing top 20 rows



In [18]:
# Number of occurrences of each word within each app's reviews (unordered).
review_words = (
    review_texts_out
    .groupBy("app_id", "app_name", "word")
    .count()
)

In [19]:
from pyspark.sql.window import Window

# Rank each app's words from most to least frequent. `word` is a secondary sort
# key so that equal counts are ranked alphabetically; ordering by count alone
# lets row_number() break ties arbitrarily, making the top 10 differ between runs.
window_words = (
    Window.partitionBy("app_id", "app_name")
    .orderBy(col("count").desc(), col("word").asc())
)

# Keep the ten highest-ranked words per app and drop the helper rank column.
review_frequent_words = (
    review_words
    .withColumn("row", row_number().over(window_words))
    .filter(col("row") <= 10)
    .select("app_id", "app_name", "word", "count")
)

In [None]:
# Persist the per-app top words (one row per word) as JSON, replacing any previous export.
review_frequent_words.write.mode("overwrite").json("../../data/review_frequent_words")

In [22]:
# Collapse the per-word rows into one row per app holding the set of its top
# words. collect_set does not preserve the frequency ordering.
review_frequent_word_list = (
    review_frequent_words
    .groupBy("app_id", "app_name")
    .agg(collect_set(col("word")).alias("words"))
)

In [None]:
# Persist the per-app word sets as JSON, replacing any previous export.
review_frequent_word_list.write.mode("overwrite").json("../../data/review_frequent_word_list")