<img src="https://www.iscte-iul.pt/assets/images/logo_iscte_detailed.svg" style="width: 450px;margin-top:30px;" align ="center">

<div style= "font-size: 35px;  margin-top:40px; font-weight:bold; font-family: 'Avenir Next LT Pro', sans-serif;"><center>Data Preparation & Feature Engineering: <strong>E-Commerce</strong></center></div>
<div style= "font-size: 30px; font-weight:bold; font-family: 'Avenir Next LT Pro', sans-serif;"><center>Organizing all variables and data to the models phase</center></div>

<div style= "font-size: 27px;font-weight:bold;line-height: 1.1; margin-top:40px; font-family: 'Avenir Next LT Pro', sans-serif;"><center>Processamento e Modelação de Big Data 2024/2025</center></div> <br>

   <div style= "font-size: 20px;font-weight:bold; font-family: 'Avenir Next LT Pro', sans-serif;"><center> Grupo 7:</center></div>
   <div><center> Diogo Freitas | 104841 </center></div>
   <div><center> João Francisco Botas | 104782 </center></div>
   <div><center> Miguel Gonçalves | 105944 </center></div>
   <div><center> Ricardo Galvão | 105285 </center></div>

In [1]:
# Importações Básicas
import pyspark
import pandas as pd
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
from pyspark.sql.functions import split, isnan, col, isnull, avg, count, round
from pyspark.sql.types import (
    StructType, StructField,
    StringType, LongType, DoubleType, TimestampType
)


# Create a Spark session
spark = SparkSession.builder \
    .appName("PMBD") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

In [2]:
# Criação da Sessão
spark = SparkSession.builder.appName("Projeto").getOrCreate()

# Leitura de dados
data_dir = '../data/raw/'
file_ec = data_dir + '2019-Nov.csv'

# Carrega as primeiras 10.000 linhas
ec_pandas = pd.read_csv(file_ec, nrows=10000)

# Converte para Spark
ec = spark.createDataFrame(ec_pandas)

In [3]:
ec.show()

+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|  17302664|2053013553853497655|                 NaN|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:...|      view|   1004775|2053013555631882655|elect

## Eliminação de observações com NaN na coluna category_code

In [4]:
ec.filter(F.isnull(col("category_code"))).head(5)

[]

In [5]:
# Remove observações com categoria nula
ec_clean = ec.filter(~isnull(col("category_code"))).filter(~isnan(col("category_code")))

In [6]:
ec.count(), ec_clean.count()

(10000, 6259)

In [7]:
ec_clean.show()

+--------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...| xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...| janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|appliances.kitche...|     lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:...|      view|   1004775|2053013555631882655|electronics.smart...| xiaomi|183.27|558856683|313628f1-68b8-460...|
|2019-11-01 00:00:...|      view|   1306894|2053013558920217191|  computers.

---
## Separação da categoria do produto em sub-categorias

In [8]:
# Separação do category_code através do '.'
split_col = split(ec_clean['category_code'], '\.')

# Separação do category_code em categoria e sub categorias
ec_clean = ec_clean.withColumn('main_category', split_col.getItem(0)) \
       .withColumn('sub_category_1', split_col.getItem(1)) \
       .withColumn('sub_category_2', split_col.getItem(2))

# Remoção da category_code original 
ec_clean = ec_clean.drop('category_code')

In [9]:
ec_clean.show()

+--------------------+----------+----------+-------------------+-------+------+---------+--------------------+-------------+--------------+--------------+
|          event_time|event_type|product_id|        category_id|  brand| price|  user_id|        user_session|main_category|sub_category_1|sub_category_2|
+--------------------+----------+----------+-------------------+-------+------+---------+--------------------+-------------+--------------+--------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655| xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|  electronics|    smartphone|          NULL|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035| janome|293.65|530496790|8e5f4f83-366c-4f7...|   appliances|sewing_machine|          NULL|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|     lg|712.87|518085591|3bfb58cd-7892-48c...|   appliances|       kitchen|        washer|
|2019-11-01 00:00:...|      view|   1004775|2053013555631882655| xiaom

In [10]:
# Number of observations with sub categories
print(ec_clean.filter(~isnull(col("sub_category_1"))).count())
print(ec_clean.filter(~isnull(col("sub_category_2"))).count())

6259
2886


In [11]:
ec_clean.filter(col("main_category")=="medicine").show()

+--------------------+----------+----------+-------------------+-----+-----+---------+--------------------+-------------+--------------+--------------+
|          event_time|event_type|product_id|        category_id|brand|price|  user_id|        user_session|main_category|sub_category_1|sub_category_2|
+--------------------+----------+----------+-------------------+-----+-----+---------+--------------------+-------------+--------------+--------------+
|2019-11-01 00:44:...|      view|  25800007|2053013562292437791|omron|47.08|518484905|806e4279-33f2-43a...|     medicine|         tools|     tonometer|
+--------------------+----------+----------+-------------------+-----+-----+---------+--------------------+-------------+--------------+--------------+



In [12]:
top_10_cat = ec_clean.filter(col("event_type") == "view") \
  .repartition(24, 'main_category')\
  .groupBy("main_category") \
  .count() \
  .orderBy("count", ascending=False) \
  .select("main_category") \
  .collect()

top_10_cat = [x[0] for x in top_10_cat][:10]
top_10_cat

['electronics',
 'appliances',
 'furniture',
 'computers',
 'apparel',
 'construction',
 'auto',
 'kids',
 'sport',
 'accessories']

## Criação do dataframe para clientes

In [13]:
user_df = ec_clean.filter((col("event_type") == "view") & (col("main_category").isin(top_10_cat))) \
    .repartition(24, "user_id") \
    .groupBy("user_id") \
    .agg(
        avg("price").alias("average_price"),
        count("*").alias("views")
    ) \
    .withColumn("average_price", round(col("average_price"), 3))

user_df.show()

+---------+-------------+-----+
|  user_id|average_price|views|
+---------+-------------+-----+
|565404816|     1132.465|    2|
|519277091|      1091.33|    1|
|539701280|        345.2|   21|
|566280946|       180.18|    1|
|565921426|       165.26|    1|
|566280676|        30.89|    1|
|541781462|      203.877|   15|
|566281299|       165.26|    1|
|562958505|      171.543|    3|
|512458378|      731.833|    8|
|566281250|       287.83|    2|
|512461324|       128.42|    1|
|517119857|      334.783|    4|
|516161337|      266.113|    9|
|527396367|        42.99|    1|
|566282003|      1348.61|    1|
|543638253|      1029.37|    1|
|512712071|      282.522|   13|
|544248389|      195.246|    7|
|513590282|       59.133|    8|
+---------+-------------+-----+
only showing top 20 rows



In [14]:
avg_price_df = ec_clean.filter((col("event_type") == "view")) \
 .groupBy("user_id") \
 .agg(round(avg("price"), 3).alias("average_price"),
      count("*").alias("total_views"))

pivot_views_df = ec_clean.filter(
    (col("event_type") == "view") & 
    (col("main_category").isin(top_10_cat))
).groupBy("user_id") \
 .pivot("main_category", top_10_cat) \
 .agg(count("*"))

user_df = avg_price_df.join(pivot_views_df, on="user_id", how="left")
user_df = user_df.fillna(0)
user_df.show()

+---------+-------------+-----------+-----------+----------+---------+---------+-------+------------+----+----+-----+-----------+
|  user_id|average_price|total_views|electronics|appliances|furniture|computers|apparel|construction|auto|kids|sport|accessories|
+---------+-------------+-----------+-----------+----------+---------+---------+-------+------------+----+----+-----+-----------+
|515761354|       141.55|          3|          0|         3|        0|        0|      0|           0|   0|   0|    0|          0|
|565865924|       949.47|          1|          1|         0|        0|        0|      0|           0|   0|   0|    0|          0|
|527322328|      197.797|          7|          0|         7|        0|        0|      0|           0|   0|   0|    0|          0|
|553334193|       172.23|          1|          1|         0|        0|        0|      0|           0|   0|   0|    0|          0|
|565921426|       165.26|          1|          1|         0|        0|        0|      0|  

In [15]:
user_df.count()

1462

In [16]:
# write in a parquet file
user_df.write.parquet(data_dir + "user_df.parquet", mode="overwrite")

## Criação do dataframe para produtos

In [17]:
product_df 

NameError: name 'product_df' is not defined