In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session used by every cell below, registered under the
# application name 'agrupaciones'.
spark = (
    SparkSession.builder
    .appName('agrupaciones')
    .getOrCreate()
)

In [3]:
# Load the sales data from CSV into a DataFrame.
df = spark.read.csv(
    'ventas.csv',
    header=True,       # take the column names from the first line
    inferSchema=True,  # let Spark detect each column's type from the data
)

In [4]:
# Print the loaded rows as a text table to check the CSV was read correctly.
df.show()

+-------+-------+------+
|Empresa|Persona|Ventas|
+-------+-------+------+
|   GOOG| Carlos|   200|
|   GOOG|   Juan|   120|
|   GOOG| Felipe|   340|
|   MSFT|   Tina|   600|
|   MSFT| Andrea|   124|
|   MSFT|  Carla|   243|
|     FB|   Sara|   870|
|     FB|Ignacio|   350|
|   APPL| Miguel|   250|
|   APPL|  Oscar|   130|
|   APPL|  Jorge|   750|
|   APPL|   Ivan|   350|
+-------+-------+------+



In [6]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- Empresa: string (nullable = true)
 |-- Persona: string (nullable = true)
 |-- Ventas: integer (nullable = true)



# groupBy() agrupa las filas según los valores de una o varias columnas

In [10]:
# Total sales per company. The column is named explicitly: a bare sum()
# aggregates every numeric column the DataFrame happens to contain, so the
# result would silently change if the CSV gained another numeric column.
df.groupBy('Empresa').sum('Ventas').show()

+-------+-----------+
|Empresa|sum(Ventas)|
+-------+-----------+
|   APPL|       1480|
|   GOOG|        660|
|     FB|       1220|
|   MSFT|        967|
+-------+-----------+



# agg() calcula funciones de agregación (suma, media, etc.) sobre todo el dataframe o sobre cada grupo

In [12]:
# Aggregate over the whole DataFrame (no grouping): one row with the sum of Ventas.
total_ventas = df.agg({'Ventas': 'sum'})
total_ventas.show()

+-----------+
|sum(Ventas)|
+-----------+
|       4327|
+-----------+



In [13]:
# Group the rows by company; the grouped object is aggregated in a later cell.
agrupado = df.groupBy(df['Empresa'])

In [15]:
# Sum of Ventas within each company group.
ventas_por_empresa = agrupado.agg({'Ventas': 'sum'})
ventas_por_empresa.show()

+-------+-----------+
|Empresa|sum(Ventas)|
+-------+-----------+
|   APPL|       1480|
|   GOOG|        660|
|     FB|       1220|
|   MSFT|        967|
+-------+-----------+



In [16]:
from pyspark.sql.functions import countDistinct, avg, stddev

In [17]:
# Count how many different values appear in the Ventas column.
ventas_distintas = countDistinct('Ventas')
df.select(ventas_distintas).show()

+----------------------+
|count(DISTINCT Ventas)|
+----------------------+
|                    11|
+----------------------+



In [19]:
# Mean of Ventas, shown under a readable column title instead of the default name.
media_ventas = avg('Ventas').alias('Media de Ventas')
df.select(media_ventas).show()

+-----------------+
|  Media de Ventas|
+-----------------+
|360.5833333333333|
+-----------------+



In [20]:
# Standard deviation of Ventas, shown under a readable column title.
desviacion_ventas = stddev('Ventas').alias('Desviación estándar de Ventas')
df.select(desviacion_ventas).show()

+-----------------------------+
|Desviación estándar de Ventas|
+-----------------------------+
|           250.08742410799007|
+-----------------------------+



In [21]:
from pyspark.sql.functions import format_number

In [24]:
# No alias is applied here, so the single result column keeps the name Spark
# generates for the aggregate expression.
stddev_sin_alias = stddev('Ventas')
ventas_stddev = df.select(stddev_sin_alias)

In [25]:
# Show the raw (unformatted) standard deviation and its auto-generated column name.
ventas_stddev.show()

+-------------------+
|stddev_samp(Ventas)|
+-------------------+
| 250.08742410799007|
+-------------------+



In [27]:
# Format the standard deviation with two decimals for display. The source
# column is looked up by position instead of hard-coding Spark's generated
# name ('stddev_samp(Ventas)'), which is an implementation detail that can
# differ between Spark versions and would make this cell fail to resolve.
columna_stddev = ventas_stddev.columns[0]
ventas_stddev.select(format_number(columna_stddev, 2).alias('Stdev de Ventas')).show()

+---------------+
|Stdev de Ventas|
+---------------+
|         250.09|
+---------------+



# orderBy() ordena las filas del dataframe por una o varias columnas (ascendente por defecto)

In [28]:
# Sort the rows by Ventas, lowest first (ascending is the default direction).
df.orderBy('Ventas', ascending=True).show()

+-------+-------+------+
|Empresa|Persona|Ventas|
+-------+-------+------+
|   GOOG|   Juan|   120|
|   MSFT| Andrea|   124|
|   APPL|  Oscar|   130|
|   GOOG| Carlos|   200|
|   MSFT|  Carla|   243|
|   APPL| Miguel|   250|
|   GOOG| Felipe|   340|
|     FB|Ignacio|   350|
|   APPL|   Ivan|   350|
|   MSFT|   Tina|   600|
|   APPL|  Jorge|   750|
|     FB|   Sara|   870|
+-------+-------+------+



In [29]:
# Sort the rows by Ventas, highest first.
df.orderBy('Ventas', ascending=False).show()

+-------+-------+------+
|Empresa|Persona|Ventas|
+-------+-------+------+
|     FB|   Sara|   870|
|   APPL|  Jorge|   750|
|   MSFT|   Tina|   600|
|     FB|Ignacio|   350|
|   APPL|   Ivan|   350|
|   GOOG| Felipe|   340|
|   APPL| Miguel|   250|
|   MSFT|  Carla|   243|
|   GOOG| Carlos|   200|
|   APPL|  Oscar|   130|
|   MSFT| Andrea|   124|
|   GOOG|   Juan|   120|
+-------+-------+------+

