# Agrupación y Apply usando Pandas

* Métodos groupby() y apply(): qué son, para qué sirven, sintaxis básica y ejemplos.

In [23]:
import pandas as pd
import numpy as np

In [24]:
# Load the sales CSV. index_col=0 uses the file's first column as the index, so it
# does not show up as an extra "Unnamed: 0" data column.
df = pd.read_csv('Data/online_sales_2020-2022.csv', index_col=0)

In [25]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount
0,221958,SKU_1964,White Mug,38,2020-01-01 00:00,1.71,37039.0,Australia,0.47
1,771155,SKU_1241,White Mug,18,2020-01-01 01:00,41.25,19144.0,Spain,0.19
2,231932,SKU_1501,Headphones,49,2020-01-01 02:00,29.11,50472.0,Germany,0.35


In [26]:
# (rows, columns) of the loaded frame.
df.shape

(26304, 9)

In [27]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26304 entries, 0 to 26303
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    26304 non-null  int64  
 1   StockCode    26304 non-null  object 
 2   Description  26304 non-null  object 
 3   Quantity     26304 non-null  int64  
 4   InvoiceDate  26304 non-null  object 
 5   UnitPrice    26304 non-null  float64
 6   CustomerID   23753 non-null  float64
 7   Country      26304 non-null  object 
 8   Discount     26304 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 2.0+ MB


In [28]:
df.describe().T
# describe() returns summary statistics for the numeric columns; .T transposes the
# result so that each column becomes a row.
# describe(include='all') would summarise the non-numeric columns as well.

# Author's note: to work with this data the rows with negative values would have to
# be removed, even though a negative stock might indicate that the item is missing.
#df['price'].describe()

# Ideally the data would be split into separate dataframes before working on it.

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
InvoiceNo,26304.0,552925.099719,259756.484885,100005.0,328855.5,554916.0,777752.75,999993.0
Quantity,26304.0,22.541286,17.744803,-50.0,11.0,23.0,37.0,49.0
UnitPrice,26304.0,47.256332,33.30759,-99.86,23.43,48.37,74.2,99.99
CustomerID,23753.0,54917.30253,25995.956967,10003.0,32469.0,55085.0,77326.0,99998.0
Discount,26304.0,0.275328,0.228697,0.0,0.13,0.26,0.38,1.998946


In [29]:
df.describe(include='O').T
# 'O' stands for object: this summarises only the object-dtype (non-numeric) columns.

Unnamed: 0,count,unique,top,freq
StockCode,26304,1000,SKU_1944,43
Description,26304,11,Wall Clock,2467
InvoiceDate,26304,26304,2020-01-01 00:00,1
Country,26304,12,France,2270


---
### Groupby()

* Sintaxis forma 1:

```python

df.groupby("columna/s_por_la/s_que_agrupamos")["columna/s_por_la/s_que_haremos_algún_cálculo"].operacion()

```

* Sintaxis forma 2:

```python

df.groupby("columna/s_por_la/s_que_agrupamos").agg({"columna1": "operacion1", "columna2": "operacion2"})

```

* Ejemplos agrupando por una columna

In [30]:
# Default preview: the first five rows.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount
0,221958,SKU_1964,White Mug,38,2020-01-01 00:00,1.71,37039.0,Australia,0.47
1,771155,SKU_1241,White Mug,18,2020-01-01 01:00,41.25,19144.0,Spain,0.19
2,231932,SKU_1501,Headphones,49,2020-01-01 02:00,29.11,50472.0,Germany,0.35
3,465838,SKU_1760,Desk Lamp,14,2020-01-01 03:00,76.68,96586.0,Netherlands,0.14
4,359178,SKU_1386,USB Cable,-30,2020-01-01 04:00,-68.11,,United Kingdom,1.501433


* Sintaxis forma 1:

In [32]:
#df.groupby('Country')['Quantity'].sum()  # group by country and add up the quantities
# Same aggregation, then sorted; sort_values() orders from lowest to highest by default.
df.groupby('Country')['Quantity'].sum().sort_values()

Country
Norway            47847
Australia         47995
Belgium           48071
Spain             48704
Italy             49428
Netherlands       49511
Germany           49795
Sweden            49802
United States     49994
United Kingdom    50214
Portugal          50458
France            51107
Name: Quantity, dtype: int64

In [None]:
# With no column selected, count() is applied to every remaining column and
# reports the number of non-null values per group.
df.groupby('Country').count()

Unnamed: 0_level_0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Discount
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Australia,2174,2174,2174,2174,2174,2174,1961,2174
Belgium,2141,2141,2141,2141,2141,2141,1919,2141
France,2270,2270,2270,2270,2270,2270,2055,2270
Germany,2214,2214,2214,2214,2214,2214,1995,2214
Italy,2171,2171,2171,2171,2171,2171,1968,2171
Netherlands,2223,2223,2223,2223,2223,2223,2010,2223
Norway,2146,2146,2146,2146,2146,2146,1953,2146
Portugal,2214,2214,2214,2214,2214,2214,1990,2214
Spain,2147,2147,2147,2147,2147,2147,1942,2147
Sweden,2233,2233,2233,2233,2233,2233,2001,2233


In [36]:
# Line total for each row: units times unit price.
# NOTE(review): a row where Quantity and UnitPrice are both negative ends up with a
# positive Price — confirm that this is the intended treatment of those rows.
df['Price'] =  df['Quantity'] * df['UnitPrice']
# Random 7-row preview; no random_state is given, so the rows differ on every run.
df.sample(7)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,Price
20573,815720,SKU_1141,Notebook,35,2022-05-07 05:00,35.58,69413.0,Australia,0.38,1245.3
18095,783045,SKU_1748,Blue Pen,24,2022-01-23 23:00,23.17,81789.0,Norway,0.44,556.08
637,251836,SKU_1310,Blue Pen,12,2020-01-27 13:00,14.0,75607.0,Sweden,0.24,168.0
7676,294795,SKU_1479,Office Chair,44,2020-11-15 20:00,80.53,76954.0,Norway,0.35,3543.32
3792,128365,SKU_1926,Notebook,45,2020-06-07 00:00,74.28,,France,0.19,3342.6
21574,959076,SKU_1915,Wall Clock,2,2022-06-17 22:00,16.16,77468.0,Sweden,0.42,32.32
8927,544783,SKU_1941,Wall Clock,14,2021-01-06 23:00,45.11,15210.0,Netherlands,0.14,631.54


In [37]:
# Distinct values found in the Description column.
df['Description'].unique()

array(['White Mug', 'Headphones', 'Desk Lamp', 'USB Cable',
       'Office Chair', 'Notebook', 'Wireless Mouse', 'Blue Pen',
       'Wall Clock', 'T-shirt', 'Backpack'], dtype=object)

In [None]:
# Number of rows carrying each Description value.
df['Description'].value_counts()


Description
Wall Clock        2467
USB Cable         2423
Office Chair      2420
Headphones        2418
White Mug         2406
Backpack          2391
Desk Lamp         2369
Blue Pen          2369
Notebook          2350
Wireless Mouse    2350
T-shirt           2341
Name: count, dtype: int64

* Vamos a agrupar por descripción, pero seleccionando varias columnas a la vez para la agregación (cantidad, precio unitario y precio), y mostrar la media de cada grupo:

In [41]:
# Per-product mean of several columns at once; the double brackets pass a list of columns.
df.groupby('Description')[['Quantity', 'UnitPrice', 'Price']].mean()

Unnamed: 0_level_0,Quantity,UnitPrice,Price
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Backpack,22.825596,47.343806,1208.035625
Blue Pen,22.654285,47.329152,1208.370979
Desk Lamp,22.753905,47.678995,1195.42374
Headphones,21.845327,47.155567,1185.648122
Notebook,22.037872,47.635362,1186.336077
Office Chair,22.359091,46.519963,1185.321591
T-shirt,22.614267,47.12097,1211.273238
USB Cable,22.824598,47.247652,1234.0896
Wall Clock,22.320227,45.645367,1186.478411
White Mug,23.055278,48.529206,1254.699131


* Agrupar por varias columnas a la vez: primero por país y luego por producto, sumando las cantidades vendidas.

In [52]:
# Grouping by a list of columns gives one total per (Country, Description) pair,
# returned as a Series with a two-level index.
df.groupby(['Country', 'Description'])['Quantity'].sum()


Country        Description   
Australia      Backpack          4420
               Blue Pen          4871
               Desk Lamp         4931
               Headphones        4053
               Notebook          3836
                                 ... 
United States  T-shirt           4405
               USB Cable         4663
               Wall Clock        4471
               White Mug         4818
               Wireless Mouse    4850
Name: Quantity, Length: 132, dtype: int64

In [54]:
# reset_index() moves the group keys out of the index into regular columns,
# so the result is a flat DataFrame.
df.groupby(['Country', 'Description'])['Quantity'].sum().reset_index()

Unnamed: 0,Country,Description,Quantity
0,Australia,Backpack,4420
1,Australia,Blue Pen,4871
2,Australia,Desk Lamp,4931
3,Australia,Headphones,4053
4,Australia,Notebook,3836
...,...,...,...
127,United States,T-shirt,4405
128,United States,USB Cable,4663
129,United States,Wall Clock,4471
130,United States,White Mug,4818


* Agrupamos por la columna país y usamos varias columnas en la agregación, pero cambiando la sintaxis: empleamos el método .agg().
Dentro de este método aplicamos diferentes estadísticas o formas de agregación a cada columna:

* Sintaxis forma 2:

```python

df.groupby("columna/s_por_la/s_que_agrupamos").agg({"columna1": "operacion1", "columna2": "operacion2"})

```

In [58]:
# agg() with a dict applies a different aggregation to each column:
# total Quantity and mean Price per country.
df.groupby('Country').agg({'Quantity': 'sum', 'Price': 'mean'})


Unnamed: 0_level_0,Quantity,Price
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Australia,47995,1187.956624
Belgium,48071,1201.507562
France,51107,1195.333731
Germany,49795,1236.001536
Italy,49428,1184.807655
Netherlands,49511,1183.149073
Norway,47847,1182.342153
Portugal,50458,1192.670885
Spain,48704,1229.541476
Sweden,49802,1203.389955


In [59]:
# Store the aggregated result in a variable so it can be reused in later cells.
agrupación_país = df.groupby('Country').agg({'Quantity': 'sum', 'Price': 'mean'})


In [63]:
# Give the columns descriptive names and list the countries from highest to lowest total quantity.
# The result is only displayed: it is not assigned back, so agrupación_país keeps its original columns.
agrupación_país.rename(columns={'Quantity': 'Cantidad_Total', 'Price': 'Precio_Prom'}).sort_values('Cantidad_Total', ascending=False)

Unnamed: 0_level_0,Cantidad_Total,Precio_Prom
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
France,51107,1195.333731
Portugal,50458,1192.670885
United Kingdom,50214,1217.722897
United States,49994,1270.192668
Sweden,49802,1203.389955
Germany,49795,1236.001536
Netherlands,49511,1183.149073
Italy,49428,1184.807655
Spain,48704,1229.541476
Belgium,48071,1201.507562


---
### Apply()
Aplica la función a cada fila o columna del dataframe

* Sintaxis método apply()

df['columna_nueva'] = df['columna_existente'].apply(función)

In [64]:
# Random 3-row preview (no random_state, so the rows change between runs).
df.sample(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,Price
24010,782701,SKU_1910,Headphones,7,2022-09-27 10:00,80.05,24178.0,France,0.19,560.35
5622,766015,SKU_1292,White Mug,9,2020-08-22 06:00,78.44,50296.0,United Kingdom,0.3,705.96
4315,908258,SKU_1266,Office Chair,49,2020-06-28 19:00,41.12,,Italy,0.07,2014.88


* Aplicar una función definida por nosotros a todo el DataFrame. Elegimos trabajar por filas al definir la función: esta recibe una fila (registro) como parámetro, y al invocarla mediante apply() con axis=1, pandas le pasa cada fila del DataFrame como argumento.

Lo que la función va a hacer es calcular el precio final con descuento:

In [66]:
def calcular_precio_final(fila):
    """Return the discounted price for a single record.

    Parameters
    ----------
    fila : mapping-like
        Any object that supports ``fila['Price']`` and ``fila['Discount']``,
        such as the row that ``DataFrame.apply(..., axis=1)`` passes in.

    Returns
    -------
    The value of ``Price`` multiplied by ``1 - Discount``. No clamping is
    applied, so a ``Discount`` greater than 1 produces a result whose sign is
    opposite to that of ``Price``.
    """
    precio_bruto = fila['Price']
    fraccion_a_pagar = 1 - fila['Discount']
    return precio_bruto * fraccion_a_pagar


In [65]:
# Missing-value check on the Price and Discount columns; each line prints one count.
print(df['Price'].isna().sum())     # number of null values in Price
print(df['Discount'].isna().sum())  # number of null values in Discount

0
0


In [None]:
# Syntax reminder: df['new_column'] = df['existing_column'].apply(function)

# Called on the whole DataFrame with axis=1, apply() passes each row to the function.
df['Precio_Final'] = df.apply(calcular_precio_final, axis=1)


In [69]:
# Random 3-row preview to check the new Precio_Final column (rows change between runs).
df.sample(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,Price,Precio_Final
10232,421701,SKU_1148,Backpack,15,2021-03-02 08:00,84.7,85701.0,United Kingdom,0.3,1270.5,889.35
20516,646356,SKU_1687,Headphones,18,2022-05-04 20:00,81.98,73273.0,Australia,0.39,1475.64,900.1404
11613,236262,SKU_1931,Backpack,40,2021-04-28 21:00,82.2,37481.0,Portugal,0.06,3288.0,3090.72


* Comprobar si es una devolución o no; si es una devolución, el precio será negativo.

In [73]:
# Check whether there are negative unit prices.
# Quick exploratory alternatives: df.UnitPrice.value_counts() or df.UnitPrice.min()

# Indexing with the boolean mask returns the rows whose UnitPrice is negative;
# len() then gives how many of them there are.
len(df[df['UnitPrice'] < 0])

760

In [74]:
# New boolean column: the lambda is evaluated for every UnitPrice value and yields
# True when the value is below 0, False otherwise.
# The vectorised expression df['UnitPrice'] < 0 produces the same column without apply().
df['IsReturn'] = df['UnitPrice'].apply(lambda x: x < 0)

In [75]:
# Random 5-row preview to check the new IsReturn column (rows change between runs).
df.sample(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,Price,Precio_Final,IsReturn
392,499111,SKU_1695,Wireless Mouse,49,2020-01-17 08:00,73.46,71315.0,United Kingdom,0.27,3599.54,2627.6642,False
8844,897653,SKU_1681,Blue Pen,-7,2021-01-03 12:00,-63.4,,Spain,1.25021,443.8,-111.04334,True
18205,593496,SKU_1912,Desk Lamp,29,2022-01-28 13:00,14.5,44771.0,Netherlands,0.14,420.5,361.63,False
24929,639899,SKU_1057,T-shirt,48,2022-11-04 17:00,37.11,14865.0,Spain,0.41,1781.28,1050.9552,False
26255,191612,SKU_1977,T-shirt,15,2022-12-29 23:00,66.82,46267.0,United Kingdom,0.39,1002.3,611.403,False


In [76]:
# Tally of rows flagged as returns (True) versus regular sales (False).
df['IsReturn'].value_counts()

IsReturn
False    25544
True       760
Name: count, dtype: int64

* Crear una nueva columna categorizando los precios. Mayores de 50 son altos y menores bajos.

In [None]:
# Keep only the rows with a strictly positive UnitPrice.
# Alternative: precio_positivo = df[df['IsReturn'] == False]
# NOTE(review): that alternative would also keep rows with UnitPrice == 0, which
# this filter drops; the two only match when no unit price is exactly zero.
precio_positivo = df[df['UnitPrice'] > 0]
# Number of rows kept.
precio_positivo.shape[0]

25544

In [82]:
# Mean unit price over the positive-price rows only.
precio_positivo['UnitPrice'].mean()

50.2196574538052

In [None]:
#df['PriceCategory'] = df['UnitPrice'].apply(lambda x: 'Alto' if x > 50 else 'Low')  # Instructor's version: rows with a negative UnitPrice are also labelled 'Low'; they need to be filtered out first.


In [None]:
# One fix: drop the non-positive unit prices before applying the lambda.
# .copy() makes the filtered result an independent DataFrame, so adding a column to
# it below cannot raise SettingWithCopyWarning.
# Note: this rebinds df, so the filtered-out rows are no longer reachable through it.
df = df[df['UnitPrice'] > 0].copy()
# Label each unit price: 'Alto' when above 50, 'Low' otherwise.
df['PriceCategory'] = df['UnitPrice'].apply(lambda x: 'Alto' if x > 50 else 'Low')

In [None]:
# Alternative fix: build a separate DataFrame holding only the positive unit prices
# and categorise that one.
# .copy() is required here: without it the new column would be assigned on a slice
# of df, which pandas reports with SettingWithCopyWarning.
df_unitprice_positive = df.loc[df['UnitPrice'] > 0].copy()
# Label each unit price: 'Alto' when above 50, 'Low' otherwise.
df_unitprice_positive['PriceCategory'] = df_unitprice_positive['UnitPrice'].apply(lambda x: 'Alto' if x > 50 else 'Low')


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,Price,Precio_Final,IsReturn,PriceCategory
2332,257795,SKU_1944,Blue Pen,13,2020-04-07 04:00,28.24,19070.0,Spain,0.01,367.12,363.4488,False,Low
25566,479673,SKU_1745,Backpack,26,2022-12-01 06:00,16.18,18217.0,Australia,0.32,420.68,286.0624,False,Low
232,218451,SKU_1318,Wall Clock,43,2020-01-10 16:00,88.21,36263.0,Germany,0.19,3793.03,3072.3543,False,Alto
12714,834769,SKU_1484,USB Cable,26,2021-06-13 18:00,98.22,15212.0,Australia,0.24,2553.72,1940.8272,False,Alto
5835,921935,SKU_1780,Headphones,9,2020-08-31 03:00,14.5,12037.0,Spain,0.44,130.5,73.08,False,Low


In [106]:
# Random 10-row preview of the categorised frame (rows change between runs).
df_unitprice_positive.sample(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,Price,Precio_Final,IsReturn,PriceCategory
10181,894042,SKU_1893,White Mug,43,2021-02-28 05:00,86.55,65576.0,Australia,0.03,3721.65,3610.0005,False,Alto
12046,343768,SKU_1174,Backpack,13,2021-05-16 22:00,55.14,67815.0,France,0.39,716.82,437.2602,False,Alto
4110,799539,SKU_1937,Backpack,1,2020-06-20 06:00,53.37,12743.0,Australia,0.28,53.37,38.4264,False,Alto
12833,855110,SKU_1281,White Mug,3,2021-06-18 17:00,46.11,31138.0,Italy,0.11,138.33,123.1137,False,Low
16686,326833,SKU_1085,USB Cable,33,2021-11-26 06:00,41.92,75755.0,United States,0.26,1383.36,1023.6864,False,Low
6465,717614,SKU_1079,Blue Pen,44,2020-09-26 09:00,4.11,86391.0,Belgium,0.36,180.84,115.7376,False,Low
15429,156815,SKU_1601,White Mug,29,2021-10-04 21:00,85.09,54441.0,Australia,0.23,2467.61,1900.0597,False,Alto
8265,106464,SKU_1059,Headphones,10,2020-12-10 09:00,51.67,29395.0,Germany,0.01,516.7,511.533,False,Alto
16558,306422,SKU_1600,Notebook,45,2021-11-20 22:00,1.62,65835.0,Netherlands,0.18,72.9,59.778,False,Low
23432,493661,SKU_1611,Backpack,18,2022-09-03 08:00,27.74,70476.0,Italy,0.28,499.32,359.5104,False,Low


In [107]:
# df was rebound to the positive-price rows in an earlier cell, so this preview
# skips index labels that were filtered out and already includes PriceCategory.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,Price,Precio_Final,IsReturn,PriceCategory
0,221958,SKU_1964,White Mug,38,2020-01-01 00:00,1.71,37039.0,Australia,0.47,64.98,34.4394,False,Low
1,771155,SKU_1241,White Mug,18,2020-01-01 01:00,41.25,19144.0,Spain,0.19,742.5,601.425,False,Low
2,231932,SKU_1501,Headphones,49,2020-01-01 02:00,29.11,50472.0,Germany,0.35,1426.39,927.1535,False,Low
3,465838,SKU_1760,Desk Lamp,14,2020-01-01 03:00,76.68,96586.0,Netherlands,0.14,1073.52,923.2272,False,Alto
5,744167,SKU_1006,Office Chair,47,2020-01-01 05:00,70.16,53887.0,Sweden,0.48,3297.52,1714.7104,False,Alto
