# EJERCICIOS SPARK, Ejercicio 5 - Julia Hernández Elena

In [1]:
import os
import pandas as pd

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window


In [2]:
# Spark settings for this exercise session, applied in the order listed:
# executor memory/cores, default and shuffle parallelism, and the cap on
# dynamically allocated executors.
spark_settings = [
    ("spark.executor.memory", "7g"),
    ("spark.executor.cores", "5"),
    ("spark.default.parallelism", 600),
    ("spark.sql.shuffle.partitions", 600),
    ("spark.dynamicAllocation.maxExecutors", 2),
]

conf = SparkConf().setAppName(u"[ICAI] Ejercicios Spark").setAll(spark_settings)

In [3]:
# Build (or reuse, if one already exists) the session from the settings in
# `conf`, with Hive support enabled.
session_builder = SparkSession.builder.config(conf=conf).enableHiveSupport()
spark = session_builder.getOrCreate()

# EJERCICIO 5

### Dados los siguientes datos (/datos/categorias.parquet):
### 1. ¿Cuántos usuarios distintos hay? ¿Cuántas categorías?

Leemos el fichero:

In [9]:
# Read the categories dataset and mark it as cached, since every cell below
# starts from this DataFrame.
categorias = (
    spark.read
    .parquet('/datos/categorias.parquet')
    .cache()
)

In [None]:
# Quick look at the first five rows of the raw data.
categorias.show(n=5)

Número de usuarios distintos:

In [18]:
# Global aggregate over the whole DataFrame: number of distinct user_id values.
categorias.agg(F.countDistinct('user_id')).show()

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                1626967|
+-----------------------+



Número de categorías:

In [20]:
# Global aggregate over the whole DataFrame: number of distinct category values.
(
    categorias
    .agg(F.countDistinct('category'))
    .show()
)

+------------------------+
|count(DISTINCT category)|
+------------------------+
|                      32|
+------------------------+



### 2. Generar un nuevo DF con la siguiente estructura, donde cada columna es el resultado de pivotar la variable category para cada una de las tres variables.

In [12]:
# Wide per-user table: one row per user_id and, for each category, the sum of
# variable1, variable2 and variable3 (three aggregated columns per category).
pivot = (
    categorias
    # Keep only rows that actually carry a category: both NULL and
    # empty-string categories are excluded so they do not become pivot columns.
    .filter("category is not null and category != ''")
    .groupBy("user_id")
    .pivot("category")
    .agg(
        F.sum("variable1").alias("variable1"),
        F.sum("variable2").alias("variable2"),
        F.sum("variable3").alias("variable3"),
    )
    # A user with no rows for a given category gets NULL sums after the pivot;
    # replace those NULLs with 0.
    .na.fill(0)
).cache()

In [13]:
# Print the column names and types of the pivoted DataFrame.
pivot.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- Advertising_variable1: double (nullable = false)
 |-- Advertising_variable2: double (nullable = false)
 |-- Advertising_variable3: double (nullable = false)
 |-- Arts_and_Entertainment_variable1: double (nullable = false)
 |-- Arts_and_Entertainment_variable2: double (nullable = false)
 |-- Arts_and_Entertainment_variable3: double (nullable = false)
 |-- Automotive_variable1: double (nullable = false)
 |-- Automotive_variable2: double (nullable = false)
 |-- Automotive_variable3: double (nullable = false)
 |-- Business_variable1: double (nullable = false)
 |-- Business_variable2: double (nullable = false)
 |-- Business_variable3: double (nullable = false)
 |-- Careers_variable1: double (nullable = false)
 |-- Careers_variable2: double (nullable = false)
 |-- Careers_variable3: double (nullable = false)
 |-- Competitors_variable1: double (nullable = false)
 |-- Competitors_variable2: double (nullable = false)
 |-- Competitors_variable3: double (nullable = false)

In [14]:
# Convert at most 20 rows to pandas so the wide table renders as a notebook
# table; the last expression is what the cell displays.
pivot_sample = pivot.limit(20)
pivot_sample.toPandas()

Unnamed: 0,user_id,Advertising_variable1,Advertising_variable2,Advertising_variable3,Arts_and_Entertainment_variable1,Arts_and_Entertainment_variable2,Arts_and_Entertainment_variable3,Automotive_variable1,Automotive_variable2,Automotive_variable3,...,Sports_variable3,Style_and_Fashion_variable1,Style_and_Fashion_variable2,Style_and_Fashion_variable3,Technology_and_Computing_variable1,Technology_and_Computing_variable2,Technology_and_Computing_variable3,Travel_variable1,Travel_variable2,Travel_variable3
0,5e4bcd70e657b36b99fd625ddae4fbd1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0538df3ee7ed377b35d551ff18af924f,11.831671,8.786504,8.08631,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42b702407027eff1e6654beb875e0247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6ae586b8c04760850dbc93763f46850e,9.095111,9.383799,5.604129,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,e7abb6528148fa3643cf98bdb61d8a87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,9ba5cdde143f442e495578d014a7674f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.903214,9.55408,15.71119,0.0,0.0,0.0
6,95223047c838712efe012b75d98de796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,51a779da0ad6e94a95e892fda91bf843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,c3c6b3c5aaf73de352fd6e043e9c534a,9.344895,7.188289,13.777309,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9bde92cfee4271ba537bc682e851ac53,1.279364,12.768015,8.109204,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.844541,11.646564,7.347495,0.0,0.0,0.0


In [5]:
# Stop the Spark session now that the exercise is finished.
spark.stop()