<a href="https://colab.research.google.com/github/M-I-N1985/python_para_analise_de_dados_wes_mckinney/blob/main/capitulo_7___limpeza_e_preparacao_dos_dados_wes_mckinney.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Tratando dados ausentes**

In [1]:
import pandas as pd
import numpy as np
from numpy import nan as NA 

In [2]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
string_data.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
string_data[0] = None

In [6]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [7]:
tabela1 = {'Argumento': ['dropna', 'fillna', 'isnull', 'notnull'], 
           'Descrição': ['Filtra rótulos de eixos, baseado no fato de os valores para cada rótulo terem dados ausentes, com limites variados para a quantidade de dados ausentes a ser tolerada.', 
                                                                                 'Preenche os dados ausentes com algum valor ou utilizando um método de interpolação como "ffill" ou "bfill".', 
                                                                                 'Devolve valores booleanos informando quais valores estão ausentes/são NA.', 
                                                                                 'Negação de isnull.']}

In [8]:
tabela1 = pd.DataFrame(tabela1)
tabela1

Unnamed: 0,Argumento,Descrição
0,dropna,"Filtra rótulos de eixos, baseado no fato de os..."
1,fillna,Preenche os dados ausentes com algum valor ou ...
2,isnull,Devolve valores booleanos informando quais val...
3,notnull,Negação de isnull.


###**Filtrando dados ausentes**

In [9]:
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
cleaned = data.dropna()
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [14]:
data.dropna(how='all')  # por default axis=0, descarta somente a linha em que todos os elementos são NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [15]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
data.dropna(axis=1, how='all')  # axis=1 descarta somente a coluna em que todos os elementos são NA, poderia ser axis='columns'

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
df = pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,-0.944909,-0.904458,-1.085292
1,-0.024063,0.803737,-1.23668
2,-0.987381,1.224809,-3.0993
3,-0.513016,1.202099,1.863271
4,-0.068378,2.424702,-0.034121
5,-0.507682,0.489792,0.596614
6,-0.062905,-0.527406,-0.47736


In [18]:
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.944909,,
1,-0.024063,,
2,-0.987381,,-3.0993
3,-0.513016,,1.863271
4,-0.068378,2.424702,-0.034121
5,-0.507682,0.489792,0.596614
6,-0.062905,-0.527406,-0.47736


In [19]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.068378,2.424702,-0.034121
5,-0.507682,0.489792,0.596614
6,-0.062905,-0.527406,-0.47736


In [20]:
df.dropna(thresh=2)  # como dropna exclui toda linha se houver algum elemento com NA, então o argumento fresh indica até qual linha vai essa exclusão

Unnamed: 0,0,1,2
2,-0.987381,,-3.0993
3,-0.513016,,1.863271
4,-0.068378,2.424702,-0.034121
5,-0.507682,0.489792,0.596614
6,-0.062905,-0.527406,-0.47736


###**Preenchendo dados ausentes**

In [21]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.944909,0.0,0.0
1,-0.024063,0.0,0.0
2,-0.987381,0.0,-3.0993
3,-0.513016,0.0,1.863271
4,-0.068378,2.424702,-0.034121
5,-0.507682,0.489792,0.596614
6,-0.062905,-0.527406,-0.47736


In [22]:
df.fillna({1:0.5, 2:0})  # preenchendo com valores diferentes para as colunas selecionadas

Unnamed: 0,0,1,2
0,-0.944909,0.5,0.0
1,-0.024063,0.5,0.0
2,-0.987381,0.5,-3.0993
3,-0.513016,0.5,1.863271
4,-0.068378,2.424702,-0.034121
5,-0.507682,0.489792,0.596614
6,-0.062905,-0.527406,-0.47736


In [23]:
df = pd.DataFrame(np.random.randn(6,3))
df.iloc[2:, 1] = np.nan  # np.nan retorna nan
df.iloc[4:,2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.573548,0.59635,0.827616
1,0.96068,0.72799,0.611095
2,0.249723,,0.510812
3,0.083073,,0.795984
4,0.844392,,
5,-0.158858,,


In [24]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.573548,0.59635,0.827616
1,0.96068,0.72799,0.611095
2,0.249723,0.72799,0.510812
3,0.083073,0.72799,0.795984
4,0.844392,0.72799,0.795984
5,-0.158858,0.72799,0.795984


In [25]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-0.573548,0.59635,0.827616
1,0.96068,0.72799,0.611095
2,0.249723,0.72799,0.510812
3,0.083073,0.72799,0.795984
4,0.844392,,0.795984
5,-0.158858,,0.795984


In [26]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [27]:
tabela2 = {'Argumento': ['value', 'method', 'axis', 'inplace', 'limit'], 
           'Descrição': ['Valor escalar ou um objeto do tipo dicionário a ser usado para preencher valores ausentes',
                         'Interpolação; por padrão, será "ffill" se a função for chamada sem outros argumentos',
                         'Eixo a ser preenchido; o default é axis=0',
                         'Modifica o objeto que faz a chamada, sem gerar uma cópia',
                         'Para preenchimento para a frente (forward) e para trás (backward), é o número máximo de valores consecutivos a serem preenchidos']}

In [28]:
tabela2 = pd.DataFrame(tabela2)
tabela2

Unnamed: 0,Argumento,Descrição
0,value,Valor escalar ou um objeto do tipo dicionário ...
1,method,"Interpolação; por padrão, será ""ffill"" se a fu..."
2,axis,Eixo a ser preenchido; o default é axis=0
3,inplace,"Modifica o objeto que faz a chamada, sem gerar..."
4,limit,Para preenchimento para a frente (forward) e p...


#**Transformação de dados**

###**Removendo duplicatas**

In [29]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], 'k2':[1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [30]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [31]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [32]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [33]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [34]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


###**Transformando dados usando uma função ou um mapeamento**

In [35]:
carnes = pd.DataFrame({'comida':['bacon', 'carne de porco desfiada', 'bacon',
                                 'Pastrami', 'carne enlatada', 'Bacon',
                                 'pastrami', 'presunto', 'nova lox'], 
                       'gramas': [100, 75, 300, 150, 190, 200, 75, 125, 150]})

carnes

Unnamed: 0,comida,gramas
0,bacon,100
1,carne de porco desfiada,75
2,bacon,300
3,Pastrami,150
4,carne enlatada,190
5,Bacon,200
6,pastrami,75
7,presunto,125
8,nova lox,150


In [36]:
carne_animal = {
    'bacon': 'porco',
    'carne de porco desfiada': 'porco',
    'pastrami': 'vaca',
    'carne enlatada': 'vaca',
    'presunto': 'porco',
    'nova lox': 'salmão'
    }
carne_animal

{'bacon': 'porco',
 'carne de porco desfiada': 'porco',
 'pastrami': 'vaca',
 'carne enlatada': 'vaca',
 'presunto': 'porco',
 'nova lox': 'salmão'}

In [37]:
letra_minuscula = carnes['comida'].str.lower()
letra_minuscula

0                      bacon
1    carne de porco desfiada
2                      bacon
3                   pastrami
4             carne enlatada
5                      bacon
6                   pastrami
7                   presunto
8                   nova lox
Name: comida, dtype: object

In [38]:
carnes['animal'] = letra_minuscula.map(carne_animal)
carnes

Unnamed: 0,comida,gramas,animal
0,bacon,100,porco
1,carne de porco desfiada,75,porco
2,bacon,300,porco
3,Pastrami,150,vaca
4,carne enlatada,190,vaca
5,Bacon,200,porco
6,pastrami,75,vaca
7,presunto,125,porco
8,nova lox,150,salmão


In [39]:
carnes['comida'].map(lambda x: carne_animal[x.lower()])

0     porco
1     porco
2     porco
3      vaca
4      vaca
5     porco
6      vaca
7     porco
8    salmão
Name: comida, dtype: object

In [40]:
carnes


Unnamed: 0,comida,gramas,animal
0,bacon,100,porco
1,carne de porco desfiada,75,porco
2,bacon,300,porco
3,Pastrami,150,vaca
4,carne enlatada,190,vaca
5,Bacon,200,porco
6,pastrami,75,vaca
7,presunto,125,porco
8,nova lox,150,salmão


*map pode ser usado para modificar um subconjunto de valores em um objeto, porém replace oferece uma forma mais simples e mais flexível de fazer isso*

In [41]:
dados = pd.Series([1., -999., 2., -999., -1000., 3.])
dados

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [42]:
dados.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [43]:
dados.replace([-999,-1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [44]:
dados.replace([-999,-1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [45]:
dados.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [46]:
dados

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [47]:
dados.replace([-999,-1000], np.nan, inplace=True)  # lembrando que inplace muda a variavel

In [48]:
dados

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [49]:
dados = pd.DataFrame(np.arange(12). reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])
dados

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [50]:
transformar = lambda x: x[:4].upper()
dados.index.map(transformar)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [51]:
dados.index = dados.index.map(transformar)
dados

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [52]:
dados.rename(index=str.title, columns=str.upper)  # versão transformada de um conjunto de dados sem modificar os dados originais

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [53]:
dados

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [54]:
dados.rename(index={'OHIO':'INDIANA'}, columns={'three': 'peekaboo'})  # rename evita que você tenha o trabalho de copiar o DataFrame manualmente e definir seus atributos index e columns

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [55]:
dados.rename(index={'OHIO': 'INDIANA'}, inplace=True)
dados

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


###**Discretização e compartimentalização (binning)**

In [56]:
idades = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
idades

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [57]:
bins = [18, 25, 35, 60, 100]
categorias = pd.cut(idades, bins)
categorias

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [58]:
categorias.codes  # informa qual o numero da categoria

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [59]:
categorias.categories  # closed = 'right', dtype='interval[int64]'

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [60]:
pd.value_counts(categorias)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [61]:
pd.cut(idades,[18, 26, 36, 61, 100], right=False)  # convertendo em um conjunto aberto a direita

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [62]:
fases = ['Juventude', 'Jovem adulto', 'meia idade', 'senhor']
pd.cut(idades, bins, labels=fases)

['Juventude', 'Juventude', 'Juventude', 'Jovem adulto', 'Juventude', ..., 'Jovem adulto', 'senhor', 'meia idade', 'meia idade', 'Jovem adulto']
Length: 12
Categories (4, object): ['Juventude' < 'Jovem adulto' < 'meia idade' < 'senhor']

In [63]:
dados = np.random.rand(20)
pd.cut(dados, 4, precision=2)  # precision=2 limita a precisão decimal em dois dígitos

[(0.73, 0.97], (0.25, 0.49], (0.25, 0.49], (0.25, 0.49], (0.73, 0.97], ..., (0.49, 0.73], (0.0079, 0.25], (0.73, 0.97], (0.25, 0.49], (0.0079, 0.25]]
Length: 20
Categories (4, interval[float64, right]): [(0.0079, 0.25] < (0.25, 0.49] < (0.49, 0.73] <
                                           (0.73, 0.97]]

In [64]:
dados = np.random.randn(1000)  # normalmente distribuidos
categorias = pd.qcut(dados, 4)  # separa em quantis
categorias

[(-0.729, -0.0694], (-0.729, -0.0694], (-0.0694, 0.613], (0.613, 3.278], (-0.729, -0.0694], ..., (-3.05, -0.729], (0.613, 3.278], (0.613, 3.278], (-3.05, -0.729], (-0.729, -0.0694]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.05, -0.729] < (-0.729, -0.0694] < (-0.0694, 0.613] <
                                           (0.613, 3.278]]

In [65]:
pd.value_counts(categorias)

(-3.05, -0.729]      250
(-0.729, -0.0694]    250
(-0.0694, 0.613]     250
(0.613, 3.278]       250
dtype: int64

In [66]:
categorias2 = pd.qcut(dados, [0, 0.1, 0.5, 0.9, 1.])  # mumeros de 0 a 1
categorias2

[(-1.289, -0.0694], (-1.289, -0.0694], (-0.0694, 1.178], (-0.0694, 1.178], (-1.289, -0.0694], ..., (-3.05, -1.289], (-0.0694, 1.178], (1.178, 3.278], (-1.289, -0.0694], (-1.289, -0.0694]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.05, -1.289] < (-1.289, -0.0694] < (-0.0694, 1.178] <
                                           (1.178, 3.278]]

In [67]:
pd.value_counts(categorias2)

(-1.289, -0.0694]    400
(-0.0694, 1.178]     400
(-3.05, -1.289]      100
(1.178, 3.278]       100
dtype: int64

###**Detectando e filtrando valores discrepantes**

In [68]:
dados = pd.DataFrame(np.random.randn(1000, 4))
dados

Unnamed: 0,0,1,2,3
0,0.287415,0.371795,-1.773157,0.903367
1,1.040131,0.002199,-0.083551,2.122827
2,-1.025510,1.356535,-0.232790,-1.128460
3,0.492139,0.074940,-0.045513,0.791231
4,1.812164,0.003280,0.516081,1.556591
...,...,...,...,...
995,-2.040064,-0.770742,0.367707,0.104152
996,-0.722033,0.175820,-0.201095,-0.585682
997,-1.490150,-1.194017,-0.537466,0.940081
998,0.407113,2.324387,0.289758,-2.019405


In [69]:
dados.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.039821,0.011674,-0.022482,0.030571
std,1.037471,1.0123,0.983261,1.005212
min,-3.410249,-3.307734,-3.418157,-3.504881
25%,-0.748791,-0.617053,-0.701091,-0.686849
50%,-0.012956,0.012395,0.009673,0.03869
75%,0.638612,0.647134,0.630116,0.700559
max,3.186505,3.771556,3.384295,3.81455


In [70]:
col2 = dados[2]
col2

0     -1.773157
1     -0.083551
2     -0.232790
3     -0.045513
4      0.516081
         ...   
995    0.367707
996   -0.201095
997   -0.537466
998    0.289758
999    0.765856
Name: 2, Length: 1000, dtype: float64

In [71]:
col2[np.abs(col2)>3]  # verificando valores maiores que 3

301    3.384295
970   -3.418157
Name: 2, dtype: float64

In [72]:
dados[(np.abs(dados)>3).any(1)]  # para selecionar linhas que contenham um valor maior que 3

Unnamed: 0,0,1,2,3
301,1.389229,-1.571685,3.384295,0.84089
307,-1.025054,0.386146,-0.763109,-3.504881
459,0.557861,-3.307734,-0.02809,1.427935
492,3.186505,0.960164,0.868864,1.657614
577,-0.283946,0.153032,0.028949,3.089497
609,-3.410249,-2.958783,0.347037,-0.346547
679,0.444162,3.220214,-1.508271,1.543624
710,-3.107541,-0.563158,1.529744,-0.969733
781,-0.41214,3.771556,-1.044191,-0.331782
838,-0.960829,-0.89122,1.07211,3.81455


In [73]:
dados[np.abs(dados)>3] = np.sign(dados)*3  # np.sign(dados) gera valores 1 e –1 com base no fato de os valores em data serem positivos ou negativos
dados.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.03949,0.01099,-0.022448,0.030172
std,1.035339,1.008079,0.980644,1.000543
min,-3.0,-3.0,-3.0,-3.0
25%,-0.748791,-0.617053,-0.701091,-0.686849
50%,-0.012956,0.012395,0.009673,0.03869
75%,0.638612,0.647134,0.630116,0.700559
max,3.0,3.0,3.0,3.0


In [74]:
np.sign(dados).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,1.0,1.0,-1.0,1.0
2,-1.0,1.0,-1.0,-1.0
3,1.0,1.0,-1.0,1.0
4,1.0,1.0,1.0,1.0


###**Permutação (reordenar aleatoriamente) e amostragem aleatória**

In [75]:
df = pd.DataFrame(np.arange(5*4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [76]:
amostrador = np.random.permutation(5)  # Chamar permutation com o tamanho do eixo que você quer permutar gera um array de inteiros informando a nova ordem
amostrador

array([2, 0, 4, 3, 1])

In [77]:
df.take(amostrador)

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
1,4,5,6,7


In [78]:
df.sample(n=3)  # selecionando um subconjunto aleatóriamente s/ repetições

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7


In [79]:
df.sample(n=10, replace=True)  # selecionando um subconjunto aleatóriamente c/ repetições

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
1,4,5,6,7
1,4,5,6,7
2,8,9,10,11
2,8,9,10,11
2,8,9,10,11
2,8,9,10,11
0,0,1,2,3
0,0,1,2,3


In [80]:
escolha = pd.Series([5, 7, -1, 6, 4])
aleatorio = escolha.sample(n=10, replace=True)
aleatorio

4    4
1    7
0    5
3    6
3    6
1    7
3    6
3    6
4    4
2   -1
dtype: int64

###**Calculando variáveis indicadoras/dummy**

In [81]:
df = pd.DataFrame({'key':['b', 'b', 'a', 'c', 'a', 'b'],
                   'dados1': range(6)})
df

Unnamed: 0,key,dados1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [82]:
pd.get_dummies(df['key'])  # foi gerada uma matriz baseada nos valores das chaves informando em qual linha está cada item da chave

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [83]:
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [84]:
df_com_dummies = df[['dados1']].join(dummies)
df_com_dummies

Unnamed: 0,dados1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [86]:
mnomes = ['filme_id', 'titulo', 'genero']
filmes = pd.read_table('datasets/movielens/movies.dat', sep='::', header= None, names=mnomes)
filmes

Unnamed: 0,filme_id,titulo,genero
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [87]:
todos_generos = []
for x in filmes.genero:
    todos_generos.extend(x.split('|'))
generos = pd.unique(todos_generos)
generos

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [88]:
matriz_zeros = np.zeros((len(filmes), len(generos)))
matriz_zeros

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [89]:
dummies = pd.DataFrame(matriz_zeros, columns=generos)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
gen = filmes.genero[0]
gen

"Animation|Children's|Comedy"

In [91]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [92]:
dummies.columns.get_indexer(gen.split('|'))


array([0, 1, 2])

In [93]:
for i, gen in enumerate(filmes.genero):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

filmes_windices = filmes.join(dummies.add_prefix('Gnero_'))
filmes_windices

Unnamed: 0,filme_id,titulo,genero,Gnero_Animation,Gnero_Children's,Gnero_Comedy,Gnero_Adventure,Gnero_Fantasy,Gnero_Romance,Gnero_Drama,...,Gnero_Crime,Gnero_Thriller,Gnero_Horror,Gnero_Sci-Fi,Gnero_Documentary,Gnero_War,Gnero_Musical,Gnero_Mystery,Gnero_Film-Noir,Gnero_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,3949,Requiem for a Dream (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,3950,Tigerland (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,3951,Two Family House (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
filmes_windices.iloc[0]

filme_id                                       1
titulo                          Toy Story (1995)
genero               Animation|Children's|Comedy
Gnero_Animation                              1.0
Gnero_Children's                             1.0
Gnero_Comedy                                 1.0
Gnero_Adventure                              0.0
Gnero_Fantasy                                0.0
Gnero_Romance                                0.0
Gnero_Drama                                  0.0
Gnero_Action                                 0.0
Gnero_Crime                                  0.0
Gnero_Thriller                               0.0
Gnero_Horror                                 0.0
Gnero_Sci-Fi                                 0.0
Gnero_Documentary                            0.0
Gnero_War                                    0.0
Gnero_Musical                                0.0
Gnero_Mystery                                0.0
Gnero_Film-Noir                              0.0
Gnero_Western       

In [97]:
np.random.seed(12345)

In [99]:
valores = np.random.rand(10)
valores

array([0.74771481, 0.96130674, 0.0083883 , 0.10644438, 0.29870371,
       0.65641118, 0.80981255, 0.87217591, 0.9646476 , 0.72368535])

In [101]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]  # Uma receita útil para aplicações estatísticas é combinar get_dummies com uma função de discretizaçã como cut
pd.get_dummies(pd.cut(valores, bins))  # o indice é a posição do array respectivamente

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,1,0
1,0,0,0,0,1
2,1,0,0,0,0
3,1,0,0,0,0
4,0,1,0,0,0
5,0,0,0,1,0
6,0,0,0,0,1
7,0,0,0,0,1
8,0,0,0,0,1
9,0,0,0,1,0


#**Manipulação de strings**

###**Métodos de objetos string**

In [102]:
val = 'a,v, guido'
val.split(',')

['a', 'v', ' guido']

In [104]:
pecas = [x.strip() for x in val.split(',')]
pecas

['a', 'v', 'guido']

In [106]:
primeiro, segundo, terceiro = pecas
primeiro + '::' + segundo + '::' + terceiro

'a::v::guido'

In [107]:
'::'.join(pecas)

'a::v::guido'

In [108]:
'guido' in val

True

In [109]:
val.index(',')

1

In [113]:
val.find(':')  # devolve -1 caso não encontre o valor

-1

In [112]:
# val.index(':')  # devolve um erro

In [114]:
val.count(',')

2

In [115]:
val.replace(',', '::')

'a::v:: guido'

In [116]:
val.replace(',', '')

'av guido'

In [137]:
tabela3 = {'Argumento': ['count', 'endswith', 'startswith', 'join', 'index', 'find', 
                         'rfind', 'replace', 'strip, rstrip, lstrip', 'split', 'lower',
                         'upper', 'casefold', 'ljust , rjust'],
           'Descrição': ['Devolve o número de ocorrências de uma substring na string, sem sobreposição',
                         'Devolve True se a string terminar com o sufixo.',
                         'Devolve True se a string começar com o prefixo.',
                         'Utiliza a string como delimitadora para concatenar uma sequência de outras strings.', 
                         'Devolve a posição do primeiro caractere de uma substring, se ela for encontrada em uma string: gera ValueError se não encontrar.', 
                         'Devolve a posição do primeiro caractere da primeira ocorrência da substring na string; é como index, porém devolve –1 se não encontrar.', 
                         'Devolve a posição do primeiro caractere da última ocorrência da substring na string; devolve –1 se não encontrar.', 
                         'Substitui ocorrências de uma string por outra string.', 
                         'Remove espaços em branco, incluindo quebras de linha; é equivalente a x.strip() (e a rstrip e lstrip, respectivamente) para cada elemento.', 
                         'Separa a string em uma lista de substrings usando o delimitador especificado.', 
                         'Converte os caracteres alfabéticos para letras minúsculas.', 
                         'Converte os caracteres alfabéticos para letras maiúsculas.', 
                         'Converte os caracteres para letras minúsculas e converte quaisquer combinações variáveis de caracteres específicos de região para um formato comum comparável', 
                         'Justifica à esquerda ou à direita, respectivamente; preenche o lado oposto da string com espaços (ou com outro caractere de preenchimento) para devolver uma string com um tamanho mínimo.']}
        
    
  


In [139]:
tabela3 = pd.DataFrame(tabela3)
tabela3

Unnamed: 0,Argumento,Descrição
0,count,Devolve o número de ocorrências de uma substri...
1,endswith,Devolve True se a string terminar com o sufixo.
2,startswith,Devolve True se a string começar com o prefixo.
3,join,Utiliza a string como delimitadora para concat...
4,index,Devolve a posição do primeiro caractere de uma...
5,find,Devolve a posição do primeiro caractere da pri...
6,rfind,Devolve a posição do primeiro caractere da últ...
7,replace,Substitui ocorrências de uma string por outra ...
8,"strip, rstrip, lstrip","Remove espaços em branco, incluindo quebras de..."
9,split,Separa a string em uma lista de substrings usa...


**expressões regulares - regex**

In [140]:
import re

In [141]:
texto = 'foo bar\t bar\tqux'
re.split('\s+', texto)  # a regex que descreve um ou mais caracteres para espaços em branco é \s+

['foo', 'bar', 'bar', 'qux']

In [143]:
regex = re.compile('\s+')
regex.split(texto)

['foo', 'bar', 'bar', 'qux']

In [145]:
regex.findall(texto)  # devolve todas as correspondências em uma string

[' ', '\t ', '\t']

In [146]:
texto = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
padrao = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE faz com que a regex não diferencie letras minúsculas de maiúsculas
regex = re.compile(padrao, flags=re.IGNORECASE)

In [152]:
regex.findall(texto) 

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

**Para evitar um escaping indesejado com \ em uma expressão regular, utilize literais de string puros como r'C:\x' no lugar do 'C:\\x' equivalente.**

In [155]:
m = regex.search(texto)  # "search" devolve apenas a primeira e "match" faz a correspondência somente no início da string
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [156]:
texto[m.start():m.end()]

'dave@google.com'

In [157]:
print(regex.match(texto))  # devolve None, pois fará a correspondência somente se o padrão ocorrer no início da string

None


In [159]:
print(regex.sub('REDACTED', texto))  # sub devolverá uma nova string com as ocorrências do padrão substituídas por uma nova string

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



*Suponha que quiséssemos encontrar os endereços de email e, simultaneamente, segmentar cada endereço em seus três componentes: nome do usuário, nome do domínio e sufixo do domínio. Para isso, coloque parênteses em torno das partes do padrão a fim de segmentá-lo*

In [162]:
padrao = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(padrao, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

In [163]:
regex.findall(texto)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [165]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', texto))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [166]:
tabela4 = {
    'Argumento': [
        'findall', 'findter', 'match', 'search', 'split', 'sub, subn'
    ],
    'Descrição': [
        'Devolve todos os padrões correspondentes em uma string, sem sobreposição, na forma de uma lista', 
        'É como findall, porém devolve um iterador',
        'Corresponde padrões no início da string e, opcionalmente, segmenta componentes do padrão em grupos; se houver uma correspondência com o padrão, devolve um objeto de correspondência; caso contrário, devolve None',
        'Pesquisa a string para verificar se há uma correspondência com o padrão; em caso afirmativo, devolve um objeto de correspondência. De modo diferente de match, a correspondência pode se dar em qualquer ponto da string, em oposição a ocorrer somente no início',
        'Separa a string em partes a cada ocorrência do padrão',
        'Substitui todas (sub) ou as n primeiras (subn) ocorrências do padrão em uma string por uma expressão substituta; utiliza os símbolos \1, \2, ... para referenciar os elementos de grupo da correspondência na string de substituição'
    ]
    
}

In [167]:
pd.DataFrame(tabela4)

Unnamed: 0,Argumento,Descrição
0,findall,Devolve todos os padrões correspondentes em um...
1,findter,"É como findall, porém devolve um iterador"
2,match,"Corresponde padrões no início da string e, opc..."
3,search,Pesquisa a string para verificar se há uma cor...
4,split,Separa a string em partes a cada ocorrência do...
5,"sub, subn",Substitui todas (sub) ou as n primeiras (subn)...


###**Funções de string vetorizadas no pandas**

In [169]:
dados = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan}
dados = pd.Series(dados)
dados

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [170]:
dados.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [171]:
dados.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [186]:
padrao = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

In [188]:
dados.str.findall(padrao, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

*Há duas maneiras para a obtenção de elementos vetorizados. Você pode usar str.get ou indexar no atributo str:*

In [191]:
matches = dados.str.findall(padrao, flags=re.IGNORECASE)
matches
matches.str.get(1)

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [194]:
matches.str[0]

Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [195]:
matches = matches.str[0]
matches.str.get(1)

Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

In [197]:
dados.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [202]:
tabela5 = {
    'Método': [
        'cat', 'contains', 'count', 'extract', 'endswith', 'startswith', 'findall', 'get',
        'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isupper', 
        'join', 'len', 'lower, upper', 'match', 'pad', 'center', 'repeat', 'replace', 
        'slice', 'split', 'strip', 'rstrip', 'lstrip'],
    'Descrição': [
        'Concatena strings em todos os elementos, com um delimitador opcional',
        'Devolve um array booleano se cada string contiver um padrão/uma regex',
        'Conta as ocorrências do padrão',
        'Utiliza uma expressão regular com grupos para extrair uma ou mais strings de uma Series de strings; o resultado será um DataFrame com uma coluna por grupo',
        'Equivalente a x.endswith(pattern) para cada elemento',
        'Equivalente a x.startswith(pattern) para cada elemento',
        'Calcula uma lista com todas as ocorrências de um padrão/uma regex para cada string',
        'Indexa cada elemento (obtém o i-ésimo elemento)',
        'Equivalente ao str.isalpha embutido',
        'Equivalente ao str.isalpha embutido',
        'Equivalente ao str.isdecimal embutido',
        'Equivalente ao str.isdigit embutido',
        'Equivalente ao str.islower embutido', 
        'Equivalente ao str.isnumeric embutido',
        'Equivalente ao str.isupper embutido',
        'Junta strings em cada elemento da Series utilizando o separador especificado',
        'Calcula o tamanho de cada string',
        'Converte para letras minúsculas ou maiúsculas; equivalente a x.lower() ou a x.upper() para cada elemento',
        'Usa re.match com a expressão regular especificada em cada elemento, devolvendo os grupos com os quais houve uma correspondência, na forma de uma lista',
        'Adiciona espaços em branco à esquerda, à direita ou nos dois lados das strings',
        'Equivalente a pad(side="both")',
        'Duplica valores (por exemplo, s.str.repeat(3) é equivalente a x * 3 para cada string)',
        'Substitui ocorrências do padrão/da regex por outra string',
        'Fatia cada string da Series',
        'Separa as string no delimitador ou na expressão regular',
        'Remove espaços em branco de ambos os lados, incluindo quebras de linha',
        'Remove espaços em branco do lado direito',
        'Remove espaços em branco do lado esquerdo'
    ]

}

In [204]:
tabela5 = pd.DataFrame(tabela5)
tabela5

Unnamed: 0,Método,Descrição
0,cat,"Concatena strings em todos os elementos, com u..."
1,contains,Devolve um array booleano se cada string conti...
2,count,Conta as ocorrências do padrão
3,extract,Utiliza uma expressão regular com grupos para ...
4,endswith,Equivalente a x.endswith(pattern) para cada el...
5,startswith,Equivalente a x.startswith(pattern) para cada ...
6,findall,Calcula uma lista com todas as ocorrências de ...
7,get,Indexa cada elemento (obtém o i-ésimo elemento)
8,isalnum,Equivalente ao str.isalpha embutido
9,isalpha,Equivalente ao str.isalpha embutido
