# Data Wrangling - Limpeza de Dados

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Six consecutive daily timestamps starting on 2020-01-01; used as the row index.
datas = pd.date_range('20200101', periods=6)

# Seeded local generator: without a fixed seed every kernel restart produces
# different numbers, so none of the displayed results could be reproduced.
rng = np.random.default_rng(42)

# 6x4 frame of standard-normal draws, one row per date.
df = pd.DataFrame(
    rng.standard_normal((6, 4)),
    index=datas,
    columns=['var_a', 'var_b', 'var_c', 'var_d'],
)

In [26]:
# Small four-row frame with one column per dtype family: a float scalar, a
# timestamp scalar, a float32 Series, an int32 array, a categorical and a
# string scalar. Scalars are broadcast to the length of the other columns.
df2 = pd.DataFrame({
    'a': 1.0,
    'b': pd.Timestamp('20130101'),
    'c': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'd': np.full(4, 3, dtype='int32'),
    'e': pd.Categorical(['test', 'train'] * 2),
    'f': 'python',
})

## Sumarizando os dados

In [4]:
# (rows, columns) of the random frame built above.
df.shape

(6, 4)

In [5]:
# Data type of each column of df.
df.dtypes

var_a    float64
var_b    float64
var_c    float64
var_d    float64
dtype: object

In [6]:
# Checking the second DataFrame: each of its columns was built with a different dtype.

df2.dtypes

a           float64
b    datetime64[ns]
c           float32
d             int32
e          category
f            object
dtype: object

In [7]:
# describe() produces a statistical summary of the columns that hold numeric data.

df2.describe()

Unnamed: 0,a,c,d
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


## Inserindo Coluna

In [8]:
# Build a new frame from df: keep only the first four dates and append a new,
# initially empty column 'var_e' (reindex fills labels that do not exist with NaN).
df1 = df.reindex(index=datas[:4], columns=[*df.columns, 'var_e'])

df1

Unnamed: 0,var_a,var_b,var_c,var_d,var_e
2020-01-01,0.83715,0.57568,-1.277435,-1.405224,
2020-01-02,0.69377,-0.441988,0.251462,-0.543267,
2020-01-03,-0.257103,-0.148468,0.423664,-2.206557,
2020-01-04,1.132136,0.150439,-1.513511,1.160941,


In [9]:
# Write a value into 'var_e' for selected rows only: the first two dates.
# The remaining rows keep their missing value.
df1.loc[datas[:2], 'var_e'] = 77

df1

Unnamed: 0,var_a,var_b,var_c,var_d,var_e
2020-01-01,0.83715,0.57568,-1.277435,-1.405224,77.0
2020-01-02,0.69377,-0.441988,0.251462,-0.543267,77.0
2020-01-03,-0.257103,-0.148468,0.423664,-2.206557,
2020-01-04,1.132136,0.150439,-1.513511,1.160941,


In [10]:
# Summary statistics for df1; the count for 'var_e' covers only the rows that were filled in.
df1.describe()

Unnamed: 0,var_a,var_b,var_c,var_d,var_e
count,4.0,4.0,4.0,4.0,2.0
mean,0.601488,0.033916,-0.528955,-0.748527,77.0
std,0.600782,0.434678,1.007655,1.442833,0.0
min,-0.257103,-0.441988,-1.513511,-2.206557,77.0
25%,0.456052,-0.221848,-1.336454,-1.605558,77.0
50%,0.76546,0.000985,-0.512987,-0.974246,77.0
75%,0.910897,0.256749,0.294512,-0.117215,77.0
max,1.132136,0.57568,0.423664,1.160941,77.0


## Dados Missing

In [11]:
# 60 consecutive daily timestamps starting on 2019-01-01.
# NOTE: this rebinds `datas` and `df`, replacing the six-row frame used earlier.
datas = pd.date_range('20190101', periods=60, freq='D')

# Seeded local generator so the missing-data examples below give the same
# numbers on every Restart & Run All.
rng = np.random.default_rng(2019)

# 60x5 frame of standard-normal draws with columns A..E.
df = pd.DataFrame(rng.standard_normal((60, 5)), index=datas, columns=list('ABCDE'))

In [12]:
# Preview only the first ten rows instead of rendering the whole 60-row frame.
df.head(10)

Unnamed: 0,A,B,C,D,E
2019-01-01,0.952828,0.73847,0.172297,0.103323,-0.209966
2019-01-02,0.066072,-0.909778,-0.08134,0.251921,-0.531552
2019-01-03,0.311269,0.246644,-0.300109,0.668311,-0.404466
2019-01-04,0.702596,-1.013777,1.068209,-0.884105,-0.480052
2019-01-05,-0.536843,0.31059,1.737362,1.317403,-0.485927
2019-01-06,0.197492,0.429251,0.902359,0.137981,-0.261896
2019-01-07,-0.86918,-0.07271,0.411885,1.433071,-0.299928
2019-01-08,1.027443,0.930743,1.122204,-1.232258,-0.144155
2019-01-09,0.194187,-0.174199,-0.931022,0.459753,0.854749
2019-01-10,0.392707,0.005753,-1.290356,-1.659183,1.176821


In [13]:
# New column F: the value of A where A is positive, NaN everywhere else.
# This deliberately introduces missing values for the examples that follow.
df['F'] = df['A'].where(df['A'] > 0)

In [14]:
# First rows of df, now including column F with NaN where A is not positive.
df.head()

Unnamed: 0,A,B,C,D,E,F
2019-01-01,0.952828,0.73847,0.172297,0.103323,-0.209966,0.952828
2019-01-02,0.066072,-0.909778,-0.08134,0.251921,-0.531552,0.066072
2019-01-03,0.311269,0.246644,-0.300109,0.668311,-0.404466,0.311269
2019-01-04,0.702596,-1.013777,1.068209,-0.884105,-0.480052,0.702596
2019-01-05,-0.536843,0.31059,1.737362,1.317403,-0.485927,


In [15]:
# Handling missing data: work on two independent copies so that df itself
# stays untouched.
# NOTE: this rebinds `df2`, which earlier referred to the small mixed-type frame.
df2, df3 = df.copy(), df.copy()

In [16]:
# First rows of the df2 copy.
df2.head()

Unnamed: 0,A,B,C,D,E,F
2019-01-01,0.952828,0.73847,0.172297,0.103323,-0.209966,0.952828
2019-01-02,0.066072,-0.909778,-0.08134,0.251921,-0.531552,0.066072
2019-01-03,0.311269,0.246644,-0.300109,0.668311,-0.404466,0.311269
2019-01-04,0.702596,-1.013777,1.068209,-0.884105,-0.480052,0.702596
2019-01-05,-0.536843,0.31059,1.737362,1.317403,-0.485927,


In [17]:
# First rows of the df3 copy.
df3.head()

Unnamed: 0,A,B,C,D,E,F
2019-01-01,0.952828,0.73847,0.172297,0.103323,-0.209966,0.952828
2019-01-02,0.066072,-0.909778,-0.08134,0.251921,-0.531552,0.066072
2019-01-03,0.311269,0.246644,-0.300109,0.668311,-0.404466,0.311269
2019-01-04,0.702596,-1.013777,1.068209,-0.884105,-0.480052,0.702596
2019-01-05,-0.536843,0.31059,1.737362,1.317403,-0.485927,


In [18]:
# Drop every row containing at least one NaN in a single call. Drawback: the
# valid values held by the other columns of those rows are discarded as well.
# Only the resulting shape is displayed; df2 itself is not modified.
df2.dropna(axis=0, how='any').shape

(39, 6)

In [19]:
# Replace the missing values of column F with the mean of column A (as an
# example), addressing column F explicitly. The filled Series is only
# displayed; nothing is assigned back, so df3 is left unchanged.
df3['F'].fillna(df3['A'].mean())

2019-01-01    0.952828
2019-01-02    0.066072
2019-01-03    0.311269
2019-01-04    0.702596
2019-01-05    0.202249
2019-01-06    0.197492
2019-01-07    0.202249
2019-01-08    1.027443
2019-01-09    0.194187
2019-01-10    0.392707
2019-01-11    0.202249
2019-01-12    1.603492
2019-01-13    0.957849
2019-01-14    0.202249
2019-01-15    0.105757
2019-01-16    1.031080
2019-01-17    0.202249
2019-01-18    0.483686
2019-01-19    0.202249
2019-01-20    0.202249
2019-01-21    0.468318
2019-01-22    0.202249
2019-01-23    0.202249
2019-01-24    0.917304
2019-01-25    0.202249
2019-01-26    0.202249
2019-01-27    0.202249
2019-01-28    0.655218
2019-01-29    0.202249
2019-01-30    0.875757
2019-01-31    0.908333
2019-02-01    0.202249
2019-02-02    0.202249
2019-02-03    1.276756
2019-02-04    0.032208
2019-02-05    0.394418
2019-02-06    0.202249
2019-02-07    0.376828
2019-02-08    0.202249
2019-02-09    0.798573
2019-02-10    0.202249
2019-02-11    0.361342
2019-02-12    0.202249
2019-02-13 

In [20]:
# Another independent (deep) copy of df for the constant-fill example.
df4 = df.copy(deep=True)

In [21]:
# Another way of filling: replace every missing value in the frame with the
# constant 77. The filled frame is only displayed; df4 is not modified.
df4.fillna(77)

Unnamed: 0,A,B,C,D,E,F
2019-01-01,0.952828,0.73847,0.172297,0.103323,-0.209966,0.952828
2019-01-02,0.066072,-0.909778,-0.08134,0.251921,-0.531552,0.066072
2019-01-03,0.311269,0.246644,-0.300109,0.668311,-0.404466,0.311269
2019-01-04,0.702596,-1.013777,1.068209,-0.884105,-0.480052,0.702596
2019-01-05,-0.536843,0.31059,1.737362,1.317403,-0.485927,77.0
2019-01-06,0.197492,0.429251,0.902359,0.137981,-0.261896,0.197492
2019-01-07,-0.86918,-0.07271,0.411885,1.433071,-0.299928,77.0
2019-01-08,1.027443,0.930743,1.122204,-1.232258,-0.144155,1.027443
2019-01-09,0.194187,-0.174199,-0.931022,0.459753,0.854749,0.194187
2019-01-10,0.392707,0.005753,-1.290356,-1.659183,1.176821,0.392707


##  Verificando Dados Repetidos

In [27]:
# Rebuild the small mixed-type demo frame before looking for repeated data.
# `df2` was rebound to a copy of the 60-row random frame in the missing-data
# section, so on a fresh top-to-bottom run the name would otherwise point at
# that frame instead of the four-row one this section works with.
df2 = pd.DataFrame({
    'a': 1.,
    'b': pd.Timestamp('20130101'),
    'c': pd.Series(1, index=list(range(4)), dtype='float32'),
    'd': np.array([3] * 4, dtype='int32'),
    'e': pd.Categorical(['test', 'train', 'test', 'train']),
    'f': 'python',
})

df2

Unnamed: 0,a,b,c,d,e,f
0,1.0,2013-01-01,1.0,3,test,python
1,1.0,2013-01-01,1.0,3,train,python
2,1.0,2013-01-01,1.0,3,test,python
3,1.0,2013-01-01,1.0,3,train,python


In [28]:
# Number of distinct values in each column.
df2.nunique()

a    1
b    1
c    1
d    1
e    2
f    1
dtype: int64

In [29]:
# Number of distinct values in each row (axis=1); dropna=False counts NaN as a value too.
df2.nunique(axis=1, dropna=False)

0    5
1    5
2    5
3    5
dtype: int64

## Removendo Valores Duplicados

In [30]:
# Show the frame with fully duplicated rows removed, keeping the first occurrence.
# The result is only displayed; it is not assigned back to df2.
df2.drop_duplicates()

Unnamed: 0,a,b,c,d,e,f
0,1.0,2013-01-01,1.0,3,test,python
1,1.0,2013-01-01,1.0,3,train,python


## Ordenando os Dados

In [31]:
# Small frame for the sorting examples; col1 contains one missing value.
df6 = pd.DataFrame(
    {
        'col1': ['a', 'a', 'b', np.nan, 'd', 'c'],
        'col2': [2, 1, 9, 8, 7, 4],
        'col3': [0, 1, 9, 4, 2, 3],
    }
)

In [32]:
# Display the six-row frame in its original order.
df6

Unnamed: 0,col1,col2,col3
0,a,2,0
1,a,1,1
2,b,9,9
3,,8,4
4,d,7,2
5,c,4,3


In [33]:
# Sorting: ascending by col1; rows whose col1 is missing are placed last.
df6.sort_values('col1', na_position='last')

Unnamed: 0,col1,col2,col3
0,a,2,0
1,a,1,1
2,b,9,9
5,c,4,3
4,d,7,2
3,,8,4


In [37]:
# Sorting by multiple columns, both descending: col3 is the primary key and
# col1 breaks ties.
df6.sort_values(by=['col3', 'col1'], ascending=[False, False])

Unnamed: 0,col1,col2,col3
2,b,9,9
3,,8,4
5,c,4,3
4,d,7,2
1,a,1,1
0,a,2,0
