In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("lab6") \
    .getOrCreate()

sc = spark.sparkContext

In [2]:
!wget https://archive.ics.uci.edu/static/public/911/recipe+reviews+and+user+feedback+dataset.zip

--2024-11-25 14:51:17--  https://archive.ics.uci.edu/static/public/911/recipe+reviews+and+user+feedback+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘recipe+reviews+and+user+feedback+dataset.zip’

recipe+reviews+and+     [  <=>               ]   2.02M  9.48MB/s    in 0.2s    

2024-11-25 14:51:18 (9.48 MB/s) - ‘recipe+reviews+and+user+feedback+dataset.zip’ saved [2114088]



In [3]:
!ls

recipe+reviews+and+user+feedback+dataset.zip  sample_data


In [4]:
!mv recipe+reviews+and+user+feedback+dataset.zip recipe_reviews.zip

In [5]:
import zipfile
with zipfile.ZipFile("recipe_reviews.zip", 'r') as zip_ref:
    zip_ref.extractall("./data")

In [6]:
df_reviews = spark.read.csv('./data/Recipe Reviews and User Feedback Dataset.csv', header=True, sep=",")

In [7]:
df_reviews = df_reviews.withColumnRenamed("_c0", "id")
df_reviews.show(5)

+---+-------------+-----------+------------------+--------------------+--------------+----------+---------------+----------+-----------+---------+-----------+-----+----------+--------------------+
| id|recipe_number|recipe_code|       recipe_name|          comment_id|       user_id| user_name|user_reputation|created_at|reply_count|thumbs_up|thumbs_down|stars|best_score|                text|
+---+-------------+-----------+------------------+--------------------+--------------+----------+---------------+----------+-----------+---------+-----------+-----+----------+--------------------+
|  0|          001|      14299|Creamy White Chili|sp_aUSaElGf_14299...|u_9iFLIhMa8QaG|   Jeri326|              1|1665619889|          0|        0|          0|    5|       527|I tweaked it a li...|
|  1|          001|      14299|Creamy White Chili|sp_aUSaElGf_14299...|u_Lu6p25tmE77j|   Mark467|             50|1665277687|          0|        7|          0|    5|       724|Bush used to have...|
|  2|          

In [8]:
df_reviews.orderBy(df_reviews.reply_count.desc()).show(10)

+---+-------------+-----------+--------------------+--------------------+--------------------+-----------------+---------------+----------+-----------+---------+-----------+-----+----------+--------------------+
| id|recipe_number|recipe_code|         recipe_name|          comment_id|             user_id|        user_name|user_reputation|created_at|reply_count|thumbs_up|thumbs_down|stars|best_score|                text|
+---+-------------+-----------+--------------------+--------------------+--------------------+-----------------+---------------+----------+-----------+---------+-----------+-----+----------+--------------------+
| 17|          002|       3309|Best Ever Banana ...|sp_aUSaElGf_3309_...|u_1tOHujEFJQIEVu0...|     OrangeBowtie|              0|1622648873|          3|        5|          0|    5|       354|The title is not ...|
|  8|          003|       2832|   Cheeseburger Soup|sp_aUSaElGf_2832_...|      u_inTFTX8AEJ0X|   ladypenny36619|              1|1659170678|          3| 

In [9]:
df_reviews.groupBy("recipe_code").agg({"best_score": "sum"}).orderBy("sum(best_score)", ascending=False).show(10)

+-----------+---------------+
|recipe_code|sum(best_score)|
+-----------+---------------+
|       2832|        98863.0|
|      14299|        85497.0|
|      17826|        64880.0|
|       3309|        64247.0|
|      21444|        60755.0|
|      32480|        59867.0|
|      12540|        59195.0|
|       2912|        54032.0|
|      42083|        51975.0|
|      19731|        47905.0|
+-----------+---------------+
only showing top 10 rows



In [10]:
df_reviews.groupBy("recipe_code").agg({"reply_count": "sum"}).orderBy("sum(reply_count)", ascending=False).show(10)

+-----------+----------------+
|recipe_code|sum(reply_count)|
+-----------+----------------+
|       2832|            16.0|
|        414|            13.0|
|      32480|            12.0|
|      14299|            10.0|
|      18345|            10.0|
|       8202|             9.0|
|      12003|             8.0|
|      41095|             8.0|
|       1324|             8.0|
|       8431|             6.0|
+-----------+----------------+
only showing top 10 rows



In [11]:
df_reviews.select("stars").groupBy("stars").count().show()

+-----+-----+
|stars|count|
+-----+-----+
|    3|  490|
|    0| 1696|
| NULL|   86|
|    5|13829|
|    1|  280|
|    4| 1655|
|    2|  232|
+-----+-----+



In [12]:
header = ['id', 'firstname', 'lastname', 'age', 'salary']
firstnames = ['Adam', 'Katarzyna', 'Krzysztof', 'Marek', 'Aleksandra', 'Zbigniew', 'Wojciech', 'Mieczysław', 'Agata', 'Wisława']
lastnames = ['Mieczykowski', 'Kowalski', 'Malinowski' , 'Szczaw', 'Glut', 'Barański', 'Brzęczyszczykiewicz', 'Wróblewski', 'Wlotka', 'Pysla']
age = {'min': 18, 'max': 68}
salary = {'min': 3200, 'max': 12500}

In [13]:
import random
from tqdm import tqdm

def build_dataset(filename, n_rows=100, chunk_size=100000):
    rows = []
    rows.append(header)
    mu = (salary['max'] + salary['min']) / 2
    sigma = 1000

    with open(filename, 'w', encoding='utf-8') as filehandler:

        for id in tqdm(range(1, n_rows + 1), total=n_rows, desc="Building dataset..."):
            row = [
                f'{id}',
                f'{random.choice(firstnames)}',
                f'{random.choice(lastnames)}',
                f"{random.randint(age['min'], age['max'])}",
                f"{round(float(random.normalvariate(mu=mu, sigma=sigma)), 2)}"
            ]
            rows.append(row)
            if id % chunk_size == 0:
                filehandler.writelines([f"{','.join(row)}\n" for row in rows])
                rows = []

In [14]:
build_dataset('employee.csv', 5_000_000)

Building dataset...: 100%|██████████| 5000000/5000000 [00:40<00:00, 124319.67it/s]


In [15]:
%%time
df = spark.read.csv('employee.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: double (nullable = true)

CPU times: user 92.3 ms, sys: 9.46 ms, total: 102 ms
Wall time: 15.3 s


In [16]:
%%time
df.filter(df["salary"] > 10000).count()

CPU times: user 66.6 ms, sys: 3.23 ms, total: 69.8 ms
Wall time: 11.1 s


79452

In [17]:
from pyspark.sql.functions import col
df = df.withColumn("salary", col("salary").cast("decimal(10,2)"))

In [18]:
%%time
df.filter(df["salary"] > 10000).count()

CPU times: user 76.9 ms, sys: 4.31 ms, total: 81.2 ms
Wall time: 12.5 s


79452

In [19]:
from pyspark.ml.feature import Bucketizer

splits = [10, 20, 30, 40, 50, 60, 70]
bucketizer = Bucketizer(splits=splits, inputCol="age", outputCol="age_bucket")

df_with_buckets = bucketizer.transform(df)
df_with_buckets.show(20)

+---+----------+-------------------+---+-------+----------+
| id| firstname|           lastname|age| salary|age_bucket|
+---+----------+-------------------+---+-------+----------+
|  1|     Agata|           Kowalski| 33|7016.12|       2.0|
|  2|     Agata|             Szczaw| 37|9162.18|       2.0|
|  3|      Adam|             Szczaw| 51|7497.92|       4.0|
|  4|  Wojciech|         Wróblewski| 21|7709.72|       1.0|
|  5|Aleksandra|              Pysla| 44|7644.78|       3.0|
|  6|   Wisława|Brzęczyszczykiewicz| 63|5937.37|       5.0|
|  7|     Agata|         Malinowski| 26|8525.23|       1.0|
|  8|     Agata|              Pysla| 24|8427.28|       1.0|
|  9| Krzysztof|       Mieczykowski| 57|7966.37|       4.0|
| 10|   Wisława|             Szczaw| 23|8301.28|       1.0|
| 11| Krzysztof|             Szczaw| 29|8131.70|       1.0|
| 12|Mieczysław|           Kowalski| 53|7815.40|       4.0|
| 13|  Wojciech|               Glut| 37|7933.12|       2.0|
| 14|      Adam|         Malinowski| 65|

In [20]:
df_with_buckets.groupBy("age_bucket").count().show()

+----------+------+
|age_bucket| count|
+----------+------+
|       0.0|195398|
|       1.0|980538|
|       4.0|980220|
|       3.0|980931|
|       2.0|980093|
|       5.0|882820|
+----------+------+

