# Advanced Data Wrangling with Pandas

In [1]:
import pandas as pd
import numpy as np

## Formas não usuais de se ler um dataset

Você não precisa que o arquivo com os seus dados esteja no seu disco local, o pandas está preparado para adquirir arquivos via http, s3, gs...

In [2]:
diamonds = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv")
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


Você também pode crawlear uma tabela de uma página da internet de forma simples

In [3]:
clarity = pd.read_html("https://www.brilliantearth.com/diamond-clarity/")

clarity

[            0                                           1
 0          FL                                    Flawless
 1          IF                         Internally Flawless
 2  VVS1  VVS2                 Very Very Slightly Included
 3    VS1  VS2                      Very Slightly Included
 4    SI1  SI2                           Slightly Included
 5  I1  I2  I3  Included  (Not Carried by Brilliant Earth)]

In [4]:
clarity = clarity[0]
clarity

Unnamed: 0,0,1
0,FL,Flawless
1,IF,Internally Flawless
2,VVS1 VVS2,Very Very Slightly Included
3,VS1 VS2,Very Slightly Included
4,SI1 SI2,Slightly Included
5,I1 I2 I3,Included (Not Carried by Brilliant Earth)


In [5]:
clarity.columns = ['clarity', 'clarity_description']
clarity

Unnamed: 0,clarity,clarity_description
0,FL,Flawless
1,IF,Internally Flawless
2,VVS1 VVS2,Very Very Slightly Included
3,VS1 VS2,Very Slightly Included
4,SI1 SI2,Slightly Included
5,I1 I2 I3,Included (Not Carried by Brilliant Earth)


## Como explodir a coluna de um dataframe

In [6]:
clarity['clarity'] = clarity['clarity'].str.split()
clarity

Unnamed: 0,clarity,clarity_description
0,[FL],Flawless
1,[IF],Internally Flawless
2,"[VVS1, VVS2]",Very Very Slightly Included
3,"[VS1, VS2]",Very Slightly Included
4,"[SI1, SI2]",Slightly Included
5,"[I1, I2, I3]",Included (Not Carried by Brilliant Earth)


In [7]:
type(clarity.loc[0, 'clarity'])

list

In [8]:
clarity = clarity.explode("clarity")
clarity

Unnamed: 0,clarity,clarity_description
0,FL,Flawless
1,IF,Internally Flawless
2,VVS1,Very Very Slightly Included
2,VVS2,Very Very Slightly Included
3,VS1,Very Slightly Included
3,VS2,Very Slightly Included
4,SI1,Slightly Included
4,SI2,Slightly Included
5,I1,Included (Not Carried by Brilliant Earth)
5,I2,Included (Not Carried by Brilliant Earth)


## Como validar o merge

O parâmetro `validate` do `.merge` serve para validar a relação entre as duas tabelas que você está juntando: por exemplo, se a relação é 1 para 1, 1 para muitos, muitos para 1 ou muitos para muitos.

In [9]:
diamonds.merge(clarity, on='clarity', validate="m:1")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,clarity_description
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,Slightly Included
1,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,Slightly Included
2,0.31,Ideal,J,SI2,62.2,54.0,344,4.35,4.37,2.71,Slightly Included
3,0.20,Premium,E,SI2,60.2,62.0,345,3.79,3.75,2.27,Slightly Included
4,0.30,Ideal,I,SI2,62.0,54.0,348,4.31,4.34,2.68,Slightly Included
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.54,Ideal,F,IF,62.2,54.0,2729,5.24,5.27,3.27,Internally Flawless
53936,0.51,Ideal,F,IF,62.5,54.0,2730,5.12,5.16,3.21,Internally Flawless
53937,0.52,Ideal,F,IF,61.7,57.0,2733,5.13,5.17,3.18,Internally Flawless
53938,0.52,Ideal,F,IF,61.5,57.0,2749,5.15,5.19,3.18,Internally Flawless


In [10]:
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# adds the duplicated "SI2" key the same way and renumbers the index.
duplicated_key_row = pd.DataFrame([{"clarity": "SI2", "clarity_description": "slightly included"}])
clarity_with_problem = pd.concat([clarity, duplicated_key_row], ignore_index=True)
clarity_with_problem

Unnamed: 0,clarity,clarity_description
0,FL,Flawless
1,IF,Internally Flawless
2,VVS1,Very Very Slightly Included
3,VVS2,Very Very Slightly Included
4,VS1,Very Slightly Included
5,VS2,Very Slightly Included
6,SI1,Slightly Included
7,SI2,Slightly Included
8,I1,Included (Not Carried by Brilliant Earth)
9,I2,Included (Not Carried by Brilliant Earth)


In [11]:
diamonds.merge(clarity_with_problem, on='clarity', validate="m:1")

MergeError: Merge keys are not unique in right dataset; not a many-to-one merge

In [12]:
diamonds.merge(clarity_with_problem, on='clarity')

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,clarity_description
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,Slightly Included
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,slightly included
2,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,Slightly Included
3,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,slightly included
4,0.31,Ideal,J,SI2,62.2,54.0,344,4.35,4.37,2.71,Slightly Included
...,...,...,...,...,...,...,...,...,...,...,...
63129,0.54,Ideal,F,IF,62.2,54.0,2729,5.24,5.27,3.27,Internally Flawless
63130,0.51,Ideal,F,IF,62.5,54.0,2730,5.12,5.16,3.21,Internally Flawless
63131,0.52,Ideal,F,IF,61.7,57.0,2733,5.13,5.17,3.18,Internally Flawless
63132,0.52,Ideal,F,IF,61.5,57.0,2749,5.15,5.19,3.18,Internally Flawless


### Por que isso é importante?

O que aconteceria se eu tivesse keys duplicadas no meu de-para? Sem a validação, o merge duplicou as linhas que tinham a key duplicada: o dataset foi de 53.940 linhas para 63.134 linhas.

## Como usar o método `.assign`

Para adicionar ou modificar colunas do dataframe. Você pode passar como argumento uma constante para a coluna ou uma função que tenha como input um `pd.DataFrame` e como output uma `pd.Series`.

In [13]:
diamonds.assign(foo="bar", bar="foo")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,foo,bar
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,bar,foo
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,bar,foo
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,bar,foo
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,bar,foo
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,bar,foo
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,bar,foo
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,bar,foo
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,bar,foo
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,bar,foo


In [14]:
diamonds.assign(volume=lambda df: df['x'] * df['y'] * df['z'])

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,38.202030
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,38.076885
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,46.724580
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,51.917250
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,115.920000
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,118.110175
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,114.449728
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,140.766120


In [15]:
def calculate_volume(df):
    """Return the element-wise product of the ``x``, ``y`` and ``z`` columns of ``df``."""
    volume = df['x'] * df['y']
    volume = volume * df['z']
    return volume


diamonds.assign(volume=calculate_volume)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,38.202030
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,38.076885
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,46.724580
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,51.917250
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,115.920000
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,118.110175
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,114.449728
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,140.766120


In [16]:
diamonds['volume'] = diamonds['x'] * diamonds['y'] * diamonds['z']
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,38.202030
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,38.076885
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,46.724580
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,51.917250
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,115.920000
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,118.110175
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,114.449728
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,140.766120


## Como usar o método `.query`
Para filtrar. Tende a ser útil quando você quer filtrar o dataframe baseado em algum estado intermediário.

In [17]:
diamonds = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv")
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [18]:
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [19]:
diamonds[(diamonds['x'] == 0) | (diamonds['y'] == 0) | (diamonds['z'] == 0)]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2207,1.0,Premium,G,SI2,59.1,59.0,3142,6.55,6.48,0.0
2314,1.01,Premium,H,I1,58.1,59.0,3167,6.66,6.6,0.0
4791,1.1,Premium,G,SI2,63.0,59.0,3696,6.5,6.47,0.0
5471,1.01,Premium,F,SI2,59.2,58.0,3837,6.5,6.47,0.0
10167,1.5,Good,G,I1,64.0,61.0,4731,7.15,7.04,0.0
11182,1.07,Ideal,F,SI2,61.6,56.0,4954,0.0,6.62,0.0
11963,1.0,Very Good,H,VS2,63.3,53.0,5139,0.0,0.0,0.0
13601,1.15,Ideal,G,VS2,59.2,56.0,5564,6.88,6.83,0.0
15951,1.14,Fair,G,VS1,57.5,67.0,6381,0.0,0.0,0.0
24394,2.18,Premium,H,SI2,59.4,61.0,12631,8.49,8.45,0.0


In [20]:
diamonds.query("x == 0 | y == 0 | z == 0")


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2207,1.0,Premium,G,SI2,59.1,59.0,3142,6.55,6.48,0.0
2314,1.01,Premium,H,I1,58.1,59.0,3167,6.66,6.6,0.0
4791,1.1,Premium,G,SI2,63.0,59.0,3696,6.5,6.47,0.0
5471,1.01,Premium,F,SI2,59.2,58.0,3837,6.5,6.47,0.0
10167,1.5,Good,G,I1,64.0,61.0,4731,7.15,7.04,0.0
11182,1.07,Ideal,F,SI2,61.6,56.0,4954,0.0,6.62,0.0
11963,1.0,Very Good,H,VS2,63.3,53.0,5139,0.0,0.0,0.0
13601,1.15,Ideal,G,VS2,59.2,56.0,5564,6.88,6.83,0.0
15951,1.14,Fair,G,VS1,57.5,67.0,6381,0.0,0.0,0.0
24394,2.18,Premium,H,SI2,59.4,61.0,12631,8.49,8.45,0.0


In [21]:
x = diamonds \
    .assign(volume=lambda df: df['x'] * df['y'] * df['z'])

x = x[x['volume'] > 0]

In [22]:
diamonds = diamonds \
    .assign(volume=lambda df: df['x'] * df['y'] * df['z']) \
    .query("volume > 0")

diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,38.202030
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,38.076885
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,46.724580
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,51.917250
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,115.920000
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,118.110175
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,114.449728
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,140.766120


Você também pode usar variáveis externas ao dataframe dentro da sua query, basta usar @ como marcador.

In [23]:
selected_cut = "Premium"
diamonds.query("cut == @selected_cut")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,46.724580
12,0.22,Premium,F,SI1,60.4,61.0,342,3.88,3.84,2.33,34.715136
14,0.20,Premium,E,SI2,60.2,62.0,345,3.79,3.75,2.27,32.262375
15,0.32,Premium,E,I1,60.9,58.0,345,4.38,4.42,2.68,51.883728
...,...,...,...,...,...,...,...,...,...,...,...
53928,0.79,Premium,E,SI2,61.4,58.0,2756,6.03,5.96,3.68,132.254784
53930,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49,115.988754
53931,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43,112.813386
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,116.721246


Quase qualquer string que seria um código Python válido vai ser uma query válida.

In [24]:
diamonds.query("clarity.str.startswith('SI')")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,38.202030
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,51.917250
7,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53,42.321081
10,0.30,Good,J,SI1,64.0,55.0,339,4.25,4.28,2.73,49.658700
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,115.920000
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,118.110175
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,114.449728
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,140.766120


Porém o parser do pandas tem algumas particularidades, como o `==` que também pode ser um `isin`

In [25]:
diamonds.query("color == ['E', 'J']")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,38.202030
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,38.076885
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,51.917250
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48,38.693952
...,...,...,...,...,...,...,...,...,...,...,...
53926,0.71,Ideal,E,SI1,61.9,56.0,2756,5.71,5.73,3.54,115.822782
53928,0.79,Premium,E,SI2,61.4,58.0,2756,6.03,5.96,3.68,132.254784
53930,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49,115.988754
53932,0.70,Very Good,E,VS2,60.5,59.0,2757,5.71,5.76,3.47,114.126912


In [26]:
diamonds = diamonds.query("x != 0 & y != 0 & z != 0")

Exemplo de que precisamos do estado intermediário para fazer um filtro: você cria uma nova coluna e quer filtrar com base nela sem precisar salvar esse resultado em uma variável intermediária.

## Como usar o método `.loc` e `.iloc`
Uma das desvantagens do `.query` é que fica mais difícil fazer análise estática do código, e os editores geralmente não suportam syntax highlighting dentro da string. Um jeito de solucionar esse problema é usar o `.loc` ou o `.iloc`, que, além de aceitarem máscaras, também aceitam funções.

In [27]:
diamonds.loc[[0, 1, 2], ['clarity', 'depth']]

Unnamed: 0,clarity,depth
0,SI2,61.5
1,SI1,59.8
2,VS1,56.9


In [28]:
diamonds.iloc[[0, 1, 2], [3, 4]]

Unnamed: 0,clarity,depth
0,SI2,61.5
1,SI1,59.8
2,VS1,56.9


In [29]:
diamonds.sort_values("depth")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
10377,1.09,Ideal,J,VS2,43.0,54.0,4778,6.53,6.55,4.12,176.218580
4518,1.00,Fair,G,SI1,43.0,59.0,3634,6.32,6.27,3.97,157.316808
6341,1.00,Fair,G,VS2,44.0,53.0,4032,6.31,6.24,4.12,162.222528
16857,1.43,Fair,I,VS1,50.8,60.0,6727,7.73,7.25,3.93,220.247025
36503,0.30,Fair,E,VVS2,51.0,67.0,945,4.67,4.62,2.37,51.133698
...,...,...,...,...,...,...,...,...,...,...,...
53540,0.90,Fair,G,SI1,72.9,54.0,2691,5.74,5.67,4.16,135.390528
46679,0.99,Fair,J,I1,73.6,60.0,1789,6.01,5.80,4.35,151.632300
41918,1.03,Fair,E,I1,78.2,54.0,1262,5.72,5.59,4.42,141.328616
52861,0.50,Fair,E,VS2,79.0,73.0,2579,5.21,5.18,4.09,110.380102


In [30]:
diamonds.sort_values("depth").loc[[0, 1, 2]]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,38.20203
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,38.076885


In [31]:
diamonds.sort_values("depth").iloc[[0, 1, 2]]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
10377,1.09,Ideal,J,VS2,43.0,54.0,4778,6.53,6.55,4.12,176.21858
4518,1.0,Fair,G,SI1,43.0,59.0,3634,6.32,6.27,3.97,157.316808
6341,1.0,Fair,G,VS2,44.0,53.0,4032,6.31,6.24,4.12,162.222528


In [32]:
diamonds.loc[diamonds["price"] > 6000]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
14909,1.26,Very Good,G,SI2,61.3,55.0,6001,6.94,7.00,4.27,207.436600
14910,1.21,Premium,J,VVS1,61.3,59.0,6001,6.86,6.81,4.19,195.742554
14911,1.07,Very Good,D,SI1,60.2,55.0,6002,6.64,6.68,4.01,177.864352
14912,2.03,Fair,H,I1,64.4,59.0,6002,7.91,7.85,5.07,314.814045
14913,2.03,Fair,H,I1,66.6,57.0,6002,7.81,7.75,5.19,314.137725
...,...,...,...,...,...,...,...,...,...,...,...
27745,2.00,Very Good,H,SI1,62.8,57.0,18803,7.95,8.00,5.01,318.636000
27746,2.07,Ideal,G,SI2,62.5,55.0,18804,8.20,8.13,5.11,340.663260
27747,1.51,Ideal,G,IF,61.7,55.0,18806,7.37,7.41,4.56,249.029352
27748,2.00,Very Good,G,SI1,63.5,56.0,18818,7.90,7.97,5.04,317.333520


In [33]:
diamonds["price"] > 6000

0        False
1        False
2        False
3        False
4        False
         ...  
53935    False
53936    False
53937    False
53938    False
53939    False
Name: price, Length: 53920, dtype: bool

In [34]:
diamonds.loc[lambda x: x['price'] > 6000]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
14909,1.26,Very Good,G,SI2,61.3,55.0,6001,6.94,7.00,4.27,207.436600
14910,1.21,Premium,J,VVS1,61.3,59.0,6001,6.86,6.81,4.19,195.742554
14911,1.07,Very Good,D,SI1,60.2,55.0,6002,6.64,6.68,4.01,177.864352
14912,2.03,Fair,H,I1,64.4,59.0,6002,7.91,7.85,5.07,314.814045
14913,2.03,Fair,H,I1,66.6,57.0,6002,7.81,7.75,5.19,314.137725
...,...,...,...,...,...,...,...,...,...,...,...
27745,2.00,Very Good,H,SI1,62.8,57.0,18803,7.95,8.00,5.01,318.636000
27746,2.07,Ideal,G,SI2,62.5,55.0,18804,8.20,8.13,5.11,340.663260
27747,1.51,Ideal,G,IF,61.7,55.0,18806,7.37,7.41,4.56,249.029352
27748,2.00,Very Good,G,SI1,63.5,56.0,18818,7.90,7.97,5.04,317.333520


In [35]:
diamonds[diamonds['price'] > 10000]['price'] = 10000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [36]:
diamonds.query("price > 10000")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
21928,1.70,Ideal,J,VS2,60.5,58.0,10002,7.73,7.74,4.68,280.005336
21929,1.03,Ideal,E,VVS2,60.6,59.0,10003,6.50,6.53,3.95,167.657750
21930,1.23,Very Good,G,VVS2,60.6,55.0,10004,6.93,7.02,4.23,205.783578
21931,1.25,Ideal,F,VS2,61.6,55.0,10006,6.93,6.96,4.28,206.436384
21932,2.01,Very Good,I,SI2,61.4,63.0,10009,8.19,7.96,4.96,323.354304
...,...,...,...,...,...,...,...,...,...,...,...
27745,2.00,Very Good,H,SI1,62.8,57.0,18803,7.95,8.00,5.01,318.636000
27746,2.07,Ideal,G,SI2,62.5,55.0,18804,8.20,8.13,5.11,340.663260
27747,1.51,Ideal,G,IF,61.7,55.0,18806,7.37,7.41,4.56,249.029352
27748,2.00,Very Good,G,SI1,63.5,56.0,18818,7.90,7.97,5.04,317.333520


In [37]:
diamonds.loc[diamonds['price'] > 10000, 'price'] = 10000

In [38]:
diamonds.query("price > 10000")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume


## O que o `.groupby(...)` retorna

In [39]:
diamonds = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv") \
    .assign(volume=lambda x: x['x'] * x['y'] * x['z']) \
    .query("volume > 0")
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,38.20203
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,38.076885
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,46.72458
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,51.91725


In [40]:
grouped_diamonds = diamonds.groupby("cut")
grouped_diamonds

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11b5e1210>

In [41]:
list(grouped_diamonds)

[('Fair',
         carat   cut color clarity  depth  table  price     x     y     z  \
  8       0.22  Fair     E     VS2   65.1   61.0    337  3.87  3.78  2.49   
  91      0.86  Fair     E     SI2   55.1   69.0   2757  6.45  6.33  3.52   
  97      0.96  Fair     F     SI2   66.3   62.0   2759  6.27  5.95  4.07   
  123     0.70  Fair     F     VS2   64.5   57.0   2762  5.57  5.53  3.58   
  124     0.70  Fair     F     VS2   65.3   55.0   2762  5.63  5.58  3.66   
  ...      ...   ...   ...     ...    ...    ...    ...   ...   ...   ...   
  53757   0.72  Fair     F     VS2   55.4   64.0   2724  6.06  5.97  3.34   
  53800   0.90  Fair     I     VS1   68.7   62.0   2732  5.83  5.79  3.99   
  53863   1.00  Fair     I     SI2   66.8   56.0   2743  6.22  6.12  4.13   
  53879   1.04  Fair     G     SI2   65.2   57.0   2745  6.25  6.23  4.07   
  53882   0.71  Fair     D     VS1   65.4   59.0   2747  5.62  5.58  3.66   
  
             volume  
  8       36.425214  
  91     143.716320

## Os N formatos de agregação do pandas

A função `.agg` é um *alias* da função `.aggregate`, então elas têm o mesmo resultado.

O Pandas tem algumas funções padrão que permitem que você passe só o nome delas, ao invés do *callable*:
* "all"
* "any"
* "count"
* "first"
* "idxmax"
* "idxmin"
* "last"
* "mad"
* "max"
* "mean"
* "median"
* "min"
* "nunique"
* "prod"
* "sem"
* "size"
* "skew"
* "std"
* "sum"
* "var"

Você pode passar uma lista de callables e o pandas vai aplicar todas as funções para todas as colunas. Faz sentido se são muitas funções e poucas colunas. Um problema é que ele vai nomear as novas colunas com base na coluna anterior e na função; quando você usa uma lambda, a nova coluna recebe um nome genérico como `<lambda_0>`.

In [42]:
# Aggregate only the numeric columns: since pandas 2.0 the string columns
# (cut, color) are no longer dropped silently, and 'mean' on them raises a TypeError.
numeric_columns = diamonds.select_dtypes("number").columns.tolist()
diamonds.groupby('clarity')[numeric_columns].agg(['mean', 'sum', np.max, lambda x: x.min()])

Unnamed: 0_level_0,carat,carat,carat,carat,depth,depth,depth,depth,table,table,...,y,y,z,z,z,z,volume,volume,volume,volume
Unnamed: 0_level_1,mean,sum,amax,<lambda_0>,mean,sum,amax,<lambda_0>,mean,sum,...,amax,<lambda_0>,mean,sum,amax,<lambda_0>,mean,sum,amax,<lambda_0>
clarity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
I1,1.284146,947.7,5.01,0.3,62.742005,46303.6,78.2,55.6,58.298238,43024.1,...,10.54,4.29,4.225014,3118.06,6.98,2.6,205.357818,151554.1,790.133208,50.340147
IF,0.505123,904.17,2.29,0.23,61.510615,110104.0,65.6,52.3,56.507207,101147.9,...,8.49,3.88,3.061659,5480.37,5.2,2.39,83.259187,149033.9,373.0506,37.530864
SI1,0.850272,11107.1,2.57,0.21,61.853135,807987.5,72.9,43.0,57.662413,753244.1,...,8.87,3.84,3.640403,47554.58,5.49,1.07,138.108842,1804116.0,420.118155,34.505856
SI2,1.077308,9895.07,3.04,0.2,61.772041,567376.2,72.2,53.1,57.926238,532052.5,...,58.9,3.75,3.952347,36302.31,8.06,2.27,175.075438,1608068.0,3840.59806,32.262375
VS1,0.727108,5940.47,2.59,0.23,61.667968,503827.3,71.8,50.8,57.313966,468255.1,...,31.8,3.85,3.441428,28116.47,31.8,1.41,118.858683,971075.4,838.5024,36.27393
VS2,0.763716,9358.58,3.51,0.2,61.724376,756370.5,79.0,43.0,57.418516,703606.5,...,9.63,3.68,3.492618,42798.54,6.03,2.24,124.366027,1523981.0,560.945574,31.707984
VVS1,0.503131,1838.44,2.31,0.23,61.624521,225176.0,67.6,54.0,56.883881,207853.7,...,8.42,3.85,3.062132,11189.03,5.25,2.31,82.57186,301717.6,375.30045,36.603465
VVS2,0.596202,3020.36,2.07,0.23,61.663778,312388.7,67.6,51.0,57.02499,288888.6,...,8.17,3.86,3.221465,16319.94,5.11,2.06,97.617606,494530.8,338.999444,36.164763


Você também pode passar um dicionário de listas, assim você pode escolher qual função será aplicada em cada coluna, você ainda tem o problema de nome das novas colunas ao usar uma função anônima.

In [43]:
diamonds.groupby('clarity').agg({"x": 'mean', 'price': [np.max, 'max', max, lambda x: x.max()]})

Unnamed: 0_level_0,x,price,price,price,price
Unnamed: 0_level_1,mean,amax,max,max,<lambda_0>
clarity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
I1,6.760772,18531,18531,18531,18531
IF,4.968402,18806,18806,18806,18806
SI1,5.887987,18818,18818,18818,18818
SI2,6.403621,18804,18804,18804,18804
VS1,5.57286,18795,18795,18795,18795
VS2,5.65834,18823,18823,18823,18823
VVS1,4.961721,18777,18777,18777,18777
VVS2,5.218454,18768,18768,18768,18768


A terceira opção é o NamedAgg, que foi lançado recentemente. Ele resolve o problema de nomes de colunas. Você passa como parâmetro nomeado uma tupla para cada agregação que você quer: o nome do parâmetro vira o nome da nova coluna, o primeiro elemento da tupla é o nome da coluna de origem e o segundo é a função.

\* *O Dask ainda não aceita esse tipo de agregação*

In [44]:
diamonds.groupby('clarity').agg(max_price=('price', 'max'), total_cost=('price', lambda x: x.sum()))

Unnamed: 0_level_0,max_price,total_cost
clarity,Unnamed: 1_level_1,Unnamed: 2_level_1
I1,18531,2897528
IF,18806,5128062
SI1,18818,52175093
SI2,18804,46480143
VS1,18795,31365809
VS2,18823,48070810
VVS1,18777,9206298
VVS2,18768,16635412


## `.groupby(...).apply(...)`

Um problema comum a todas essas abordagens é que você não consegue fazer uma agregação que depende de duas colunas. Para a maior parte dos casos existe uma forma razoável de resolver esse problema criando uma nova coluna e aplicando a agregação nela. Porém, se isso não for possível, dá para usar o `.groupby(...).apply()`.

In [45]:
# Nesse caso ao invés da função de agregação receber a pd.Series relativa ao grupo,
# ela vai receber o subset do grupo. Aqui vamos printar cada grupo do df de forma 
# separada

diamonds.groupby('cut').apply(lambda x: print(x.head().to_string() + "\n"))

     carat   cut color clarity  depth  table  price     x     y     z      volume
8     0.22  Fair     E     VS2   65.1   61.0    337  3.87  3.78  2.49   36.425214
91    0.86  Fair     E     SI2   55.1   69.0   2757  6.45  6.33  3.52  143.716320
97    0.96  Fair     F     SI2   66.3   62.0   2759  6.27  5.95  4.07  151.837455
123   0.70  Fair     F     VS2   64.5   57.0   2762  5.57  5.53  3.58  110.271518
124   0.70  Fair     F     VS2   65.3   55.0   2762  5.63  5.58  3.66  114.980364

    carat   cut color clarity  depth  table  price     x     y     z     volume
2    0.23  Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31  38.076885
4    0.31  Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75  51.917250
10   0.30  Good     J     SI1   64.0   55.0    339  4.25  4.28  2.73  49.658700
17   0.30  Good     J     SI1   63.4   54.0    351  4.23  4.29  2.70  48.996090
18   0.30  Good     J     SI1   63.8   56.0    351  4.23  4.26  2.71  48.833658

    carat    cut color cla

Esse formato de agregação introduz algumas complexidades, porque sua função pode retornar tanto um pd.DataFrame, pd.Series ou um escalar. O pandas vai tentar fazer um broadcasting do que você retorna para algo que ele acha que faz sentido. Exemplos:

Se você retornar um escalar, o apply vai retornar uma `pd.Series` em que cada elemento corresponde a um grupo do .groupby

In [46]:
# Retornando um escalar
def returning_scalar(df: pd.DataFrame) -> float:
    """Return the mean of the row-wise product ``x * y * z`` over the rows of ``df``."""
    row_volume = df["x"] * df["y"] * df['z']
    return row_volume.mean()


diamonds.groupby("cut").apply(returning_scalar)

cut
Fair         165.053067
Good         136.368452
Ideal        115.410977
Premium      145.167917
Very Good    131.010566
dtype: float64

Se você retornar uma `pd.Series` nomeada, o apply vai retornar um `pd.DataFrame` em que cada linha corresponde a um grupo do `.groupby` e cada coluna corresponde a uma key do pd.Series que você retorna na sua função de agregação

In [47]:
def returning_named_series(df: pd.DataFrame) -> pd.Series:
    """Summarise ``df`` as a two-entry Series.

    ``mean_volume`` is the mean of ``x * y * z``; ``mean_price_to_volume`` is
    the mean of ``price`` divided by that row-wise volume.
    """
    group_volume = df["x"] * df["y"] * df['z']
    summary = {
        "mean_volume": group_volume.mean(),
        "mean_price_to_volume": (df['price'] / group_volume).mean(),
    }
    return pd.Series(summary)


diamonds.groupby("cut").apply(returning_named_series)

Unnamed: 0_level_0,mean_volume,mean_price_to_volume
cut,Unnamed: 1_level_1,Unnamed: 2_level_1
Fair,165.053067,23.819577
Good,136.368452,24.004827
Ideal,115.410977,23.860648
Premium,145.167917,25.957197
Very Good,131.010566,24.696247


Se você retornar um `pd.DataFrame`, o apply vai retornar uma concatenação desses `pd.DataFrame`

In [48]:
def returning_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows of ``df`` whose ``volume`` is at or above the median ``volume`` of ``df``."""
    median_volume = df['volume'].median()
    at_or_above_median = df['volume'] >= median_volume
    return df[at_or_above_median]
    

diamonds.groupby("cut").apply(returning_dataframe)

Unnamed: 0_level_0,Unnamed: 1_level_0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Fair,97,0.96,Fair,F,SI2,66.3,62.0,2759,6.27,5.95,4.07,151.837455
Fair,241,1.01,Fair,E,I1,64.5,58.0,2788,6.29,6.21,4.03,157.415427
Fair,255,1.05,Fair,J,SI2,65.8,59.0,2789,6.41,6.27,4.18,167.997126
Fair,298,1.01,Fair,E,SI2,67.4,60.0,2797,6.19,6.05,4.13,154.666435
Fair,369,1.00,Fair,G,I1,66.4,59.0,2808,6.16,6.09,4.07,152.683608
...,...,...,...,...,...,...,...,...,...,...,...,...
Very Good,53885,0.75,Very Good,I,VVS2,62.0,59.0,2749,5.81,5.83,3.61,122.279003
Very Good,53902,0.77,Very Good,E,SI2,59.9,61.0,2753,5.98,6.01,3.59,129.023882
Very Good,53903,0.75,Very Good,E,SI1,62.9,57.0,2753,5.79,5.84,3.66,123.757776
Very Good,53906,0.75,Very Good,E,SI1,63.0,55.0,2753,5.76,5.79,3.64,121.395456


Se você retornar uma `pd.Series` não nomeada, o apply vai retornar uma `pd.Series` que é uma concatenação das `pd.Series` que você retorna da sua função

In [49]:
def returning_unnamed_series(df: pd.DataFrame) -> pd.Series:
    """Return the ``volume`` values of ``df`` that are at or above the median ``volume`` of ``df``."""
    volume = df['volume']
    keep_rows = volume >= volume.median()
    return df.loc[keep_rows, 'volume']


diamonds.groupby("cut").apply(returning_unnamed_series)

cut             
Fair       97       151.837455
           241      157.415427
           255      167.997126
           298      154.666435
           369      152.683608
                       ...    
Very Good  53885    122.279003
           53902    129.023882
           53903    123.757776
           53906    121.395456
           53917    143.865288
Name: volume, Length: 26965, dtype: float64

De forma resumida, o `.groupby(...).apply(...)` é extremamente flexível: ele consegue filtrar, agregar e transformar. Mas é mais complicado de usar e é bem lento se comparado aos outros métodos de agregação. Só use se necessário.

| Saída da Função       | Saída do apply                                                                                                                                                     |
|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Escalar               | Uma pd.Series em que cada elemento corresponde a um grupo do .groupby                                                                                              |
| pd.Series nomeada     | Um pd.DataFrame em que cada linha corresponde a um grupo do .groupby e cada coluna corresponde a uma key do pd.Series que você retorna na sua função de agregação  |
| pd.Series não nomeada | Uma `pd.Series` que é uma concatenação das `pd.Series` que você retorna da sua função                                                                              |
| pd.DataFrame          | Uma concatenação desses `pd.DataFrame`                                                                                                                             |

## Como usar o método `.pipe`

O `.pipe` aplica uma função ao dataframe

In [50]:
def change_basis(df: pd.DataFrame, factor=10):
    """Scale the ``x``, ``y`` and ``z`` columns by ``factor`` and return the frame.

    Note: the assignment writes into the frame that was passed in, so the
    caller's object is modified as a side effect. This is kept on purpose:
    the following cells rely on it to demonstrate the pitfall.
    """
    axes = ['x', 'y', 'z']
    df[axes] = df[axes].mul(factor)
    return df


# .pipe passes `diamonds` to change_basis as its first argument (factor keeps its default of 10).
diamonds.pipe(change_basis)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,39.5,39.8,24.3,38.202030
1,0.21,Premium,E,SI1,59.8,61.0,326,38.9,38.4,23.1,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,40.5,40.7,23.1,38.076885
3,0.29,Premium,I,VS2,62.4,58.0,334,42.0,42.3,26.3,46.724580
4,0.31,Good,J,SI2,63.3,58.0,335,43.4,43.5,27.5,51.917250
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,57.5,57.6,35.0,115.920000
53936,0.72,Good,D,SI1,63.1,55.0,2757,56.9,57.5,36.1,118.110175
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,56.6,56.8,35.6,114.449728
53938,0.86,Premium,H,SI2,61.0,58.0,2757,61.5,61.2,37.4,140.766120


Nós não atribuímos o resultado da nossa operação a nenhuma variável, então, teoricamente, se rodarmos de novo, o resultado deveria ser o mesmo.

In [51]:
# Identical call to the previous cell; the output below shows x, y and z scaled a second time.
diamonds.pipe(change_basis)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,395.0,398.0,243.0,38.202030
1,0.21,Premium,E,SI1,59.8,61.0,326,389.0,384.0,231.0,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,405.0,407.0,231.0,38.076885
3,0.29,Premium,I,VS2,62.4,58.0,334,420.0,423.0,263.0,46.724580
4,0.31,Good,J,SI2,63.3,58.0,335,434.0,435.0,275.0,51.917250
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,575.0,576.0,350.0,115.920000
53936,0.72,Good,D,SI1,63.1,55.0,2757,569.0,575.0,361.0,118.110175
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,566.0,568.0,356.0,114.449728
53938,0.86,Premium,H,SI2,61.0,58.0,2757,615.0,612.0,374.0,140.766120


Isso acontece porque a sua função está alterando o `pd.DataFrame` original em vez de criar uma cópia. Isso é um pouco contraintuitivo, porque o Pandas, por padrão, faz as suas operações em cópias da tabela. Para evitar isso, podemos fazer uma cópia do dataframe manualmente.

In [52]:
# Reload the dataset to discard the in-place scaling applied by the previous cells.
diamonds = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv")

In [53]:
def change_basis(df: pd.DataFrame, factor=10):
    """Return a copy of ``df`` with the ``x``, ``y`` and ``z`` columns multiplied by ``factor``.

    The scaling is applied to a copy, so the frame passed in is left untouched.
    """
    axes = ['x', 'y', 'z']
    scaled = df.copy()
    scaled[axes] = scaled[axes].mul(factor)
    return scaled


# Same transformation through .pipe, now using the copying version of change_basis.
diamonds.pipe(change_basis, factor=10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,39.5,39.8,24.3
1,0.21,Premium,E,SI1,59.8,61.0,326,38.9,38.4,23.1
2,0.23,Good,E,VS1,56.9,65.0,327,40.5,40.7,23.1
3,0.29,Premium,I,VS2,62.4,58.0,334,42.0,42.3,26.3
4,0.31,Good,J,SI2,63.3,58.0,335,43.4,43.5,27.5
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,57.5,57.6,35.0
53936,0.72,Good,D,SI1,63.1,55.0,2757,56.9,57.5,36.1
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,56.6,56.8,35.6
53938,0.86,Premium,H,SI2,61.0,58.0,2757,61.5,61.2,37.4


In [54]:
# `diamonds` itself still holds the unscaled x, y and z values.
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


## Como combinar o `.assign`, `.pipe`, `.query` e `.loc` para um Pandas mais idiomático

Os métodos mais importantes para *Method Chaining* são
* `.assign`
* `.query`
* `.loc`
* `.pipe`

In [55]:
# Reload the raw dataset from the same URL before building the two pipelines below.
diamonds = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv")
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [56]:
# Step-by-step version: each statement overwrites or extends diamonds_cp with the next intermediate result.
diamonds_cp = diamonds.copy()
# Multiply the three dimension columns by 10.
diamonds_cp[['x', 'y', 'z']] = diamonds_cp[['x', 'y', 'z']] * 10
# Volume is the product of the three scaled dimensions.
diamonds_cp['volume'] = diamonds_cp['x'] * diamonds_cp['y'] * diamonds_cp['z']
# Keep only the rows with a strictly positive volume.
diamonds_cp = diamonds_cp[diamonds_cp['volume'] > 0]
# Left-join the clarity descriptions on the `clarity` column.
diamonds_cp = pd.merge(diamonds_cp, clarity, on='clarity', how='left')

diamonds_cp

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume,clarity_description
0,0.23,Ideal,E,SI2,61.5,55.0,326,39.5,39.8,24.3,38202.030,Slightly Included
1,0.21,Premium,E,SI1,59.8,61.0,326,38.9,38.4,23.1,34505.856,Slightly Included
2,0.23,Good,E,VS1,56.9,65.0,327,40.5,40.7,23.1,38076.885,Very Slightly Included
3,0.29,Premium,I,VS2,62.4,58.0,334,42.0,42.3,26.3,46724.580,Very Slightly Included
4,0.31,Good,J,SI2,63.3,58.0,335,43.4,43.5,27.5,51917.250,Slightly Included
...,...,...,...,...,...,...,...,...,...,...,...,...
53915,0.72,Ideal,D,SI1,60.8,57.0,2757,57.5,57.6,35.0,115920.000,Slightly Included
53916,0.72,Good,D,SI1,63.1,55.0,2757,56.9,57.5,36.1,118110.175,Slightly Included
53917,0.70,Very Good,D,SI1,62.8,60.0,2757,56.6,56.8,35.6,114449.728,Slightly Included
53918,0.86,Premium,H,SI2,61.0,58.0,2757,61.5,61.2,37.4,140766.120,Slightly Included


In [57]:
def change_basis(df: pd.DataFrame, factor=10):
    """Return a copy of ``df`` with the ``x``, ``y`` and ``z`` columns multiplied by ``factor``.

    The input frame is copied first, so the caller's frame is not modified
    and the function can be used safely inside a method chain.
    """
    axes = ['x', 'y', 'z']
    scaled = df.copy()
    scaled[axes] = scaled[axes].mul(factor)
    return scaled


# Same pipeline as the step-by-step cell, written as a single method chain.
# Wrapping the chain in parentheses removes the need for backslash continuations.
(
    diamonds
    .copy()
    .pipe(change_basis, factor=10)
    .assign(volume=lambda frame: frame['x'] * frame['y'] * frame['z'])
    .query("volume > 0")
    .merge(clarity, on='clarity', how='left')
)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume,clarity_description
0,0.23,Ideal,E,SI2,61.5,55.0,326,39.5,39.8,24.3,38202.030,Slightly Included
1,0.21,Premium,E,SI1,59.8,61.0,326,38.9,38.4,23.1,34505.856,Slightly Included
2,0.23,Good,E,VS1,56.9,65.0,327,40.5,40.7,23.1,38076.885,Very Slightly Included
3,0.29,Premium,I,VS2,62.4,58.0,334,42.0,42.3,26.3,46724.580,Very Slightly Included
4,0.31,Good,J,SI2,63.3,58.0,335,43.4,43.5,27.5,51917.250,Slightly Included
...,...,...,...,...,...,...,...,...,...,...,...,...
53915,0.72,Ideal,D,SI1,60.8,57.0,2757,57.5,57.6,35.0,115920.000,Slightly Included
53916,0.72,Good,D,SI1,63.1,55.0,2757,56.9,57.5,36.1,118110.175,Slightly Included
53917,0.70,Very Good,D,SI1,62.8,60.0,2757,56.6,56.8,35.6,114449.728,Slightly Included
53918,0.86,Premium,H,SI2,61.0,58.0,2757,61.5,61.2,37.4,140766.120,Slightly Included


Um problema que pode acontecer quando você usa o method chaining é acabar com um bloco gigantesco, impossível de debugar. Uma boa prática é quebrar seus blocos por objetivos.

## Como mandar um dataframe para a sua clipboard
Geralmente isso não é uma boa prática, mas às vezes é útil para enviar uma parte do dado por mensagem ou para colar em alguma planilha.

In [59]:
# Seeded generator so the example frame is identical on every run of the notebook.
rng = np.random.default_rng(42)
df = pd.DataFrame({'a': list('abc'), 'b': rng.standard_normal(3)})

In [60]:
df

Unnamed: 0,a,b
0,a,0.239057
1,b,-1.256323
2,c,0.265837


In [61]:
# Copy the frame to the system clipboard so it can be pasted into a message or spreadsheet.
df.to_clipboard()

In [62]:
# Also write the same frame to "df.csv", a path relative to the current working directory.
df.to_csv("df.csv")

Você também pode ler da sua *clipboard* com `pd.read_clipboard(...)`, o que é uma prática pior ainda, mas que em alguns casos pode ser útil.

## Recursos
https://pandas.pydata.org/docs/user_guide/cookbook.html

https://tomaugspurger.github.io/modern-1-intro.html