# Data Wrangling - Limpeza de Dados

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Six consecutive daily timestamps starting 2020-01-01, used as the row index.
datas = pd.date_range('20200101', periods=6)

# Draw the values from a locally seeded generator so that Restart & Run All
# rebuilds exactly the same 6x4 table of standard-normal numbers every time
# (an unseeded np.random.randn call gives different data on each run).
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=datas,
    columns=['var_a', 'var_b', 'var_c', 'var_d'],
)

In [27]:
# Small mixed-dtype example frame. The scalar entries ('a', 'b', 'f') are
# repeated on every row; the Series, array and Categorical entries supply
# four values each.
df2 = pd.DataFrame(
    {
        'a': 1.0,
        'b': pd.Timestamp('20130101'),
        'c': pd.Series(1, index=list(range(4)), dtype='float32'),
        'd': np.array([3, 3, 3, 3], dtype='int32'),
        'e': pd.Categorical(['test', 'train'] * 2),
        'f': 'python',
    }
)

## Sumarizando os dados

In [4]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

(6, 4)

In [5]:
# Data type of each column of df.
df.dtypes

var_a    float64
var_b    float64
var_c    float64
var_d    float64
dtype: object

In [6]:
# Checking the second DataFrame: data type of each column of df2.

df2.dtypes

a           float64
b    datetime64[ns]
c           float32
d             int32
e          category
f            object
dtype: object

In [8]:
# describe() produces a statistical summary of the columns that hold numeric data.

df2.describe()

Unnamed: 0,a,c,d
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


## Inserindo Coluna

In [10]:
# Build a new frame from df restricted to its first four dates and add a
# new column 'var_e'; reindex leaves the not-yet-existing column empty (NaN).

df1 = df.reindex(index=datas[:4], columns=[*df.columns, 'var_e'])

df1

Unnamed: 0,var_a,var_b,var_c,var_d,var_e
2020-01-01,0.210234,-0.04407,-1.110814,0.326767,
2020-01-02,0.615309,1.381729,0.270085,0.731074,
2020-01-03,0.68547,-0.396274,1.917501,-1.64369,
2020-01-04,-1.933212,0.736905,-1.176494,1.666245,


In [11]:
# Set a value in column var_e only on the selected rows. The label slice
# datas[0]:datas[1] covers both endpoint dates; df1 is modified in place.

df1.loc[datas[0]:datas[1], 'var_e'] = 77

df1

Unnamed: 0,var_a,var_b,var_c,var_d,var_e
2020-01-01,0.210234,-0.04407,-1.110814,0.326767,77.0
2020-01-02,0.615309,1.381729,0.270085,0.731074,77.0
2020-01-03,0.68547,-0.396274,1.917501,-1.64369,
2020-01-04,-1.933212,0.736905,-1.176494,1.666245,


In [12]:
# Summary statistics for df1; var_e reports a lower count because only the
# rows that received a value are included.
df1.describe()

Unnamed: 0,var_a,var_b,var_c,var_d,var_e
count,4.0,4.0,4.0,4.0,2.0
mean,-0.10555,0.419572,-0.024931,0.270099,77.0
std,1.236314,0.79729,1.456631,1.393739,0.0
min,-1.933212,-0.396274,-1.176494,-1.64369,77.0
25%,-0.325627,-0.132121,-1.127234,-0.165847,77.0
50%,0.412772,0.346418,-0.420365,0.528921,77.0
75%,0.632849,0.898111,0.681939,0.964867,77.0
max,0.68547,1.381729,1.917501,1.666245,77.0


## Dados Missing

In [13]:
# Rebind datas and df to a larger example: 60 daily rows starting 2019-01-01
# with five columns named A..E.
datas = pd.date_range('20190101', periods=60, freq='D')

# A locally seeded generator keeps the missing-data examples below
# reproducible under Restart & Run All (an unseeded np.random.randn call
# gives different data on each run).
df = pd.DataFrame(
    np.random.default_rng(2019).standard_normal((60, 5)),
    index=datas,
    columns=list('ABCDE'),
)

In [14]:
# Preview only the first ten rows instead of dumping the whole 60-row frame.
df.head(10)

Unnamed: 0,A,B,C,D,E
2019-01-01,1.075284,-0.793551,-1.709712,-2.397818,-0.91316
2019-01-02,-1.745281,-0.320001,-0.746441,0.176754,1.552316
2019-01-03,-0.605726,-2.10541,-1.250606,0.530297,1.207477
2019-01-04,0.554439,-0.077217,0.453242,0.173907,-0.241933
2019-01-05,0.082602,1.115591,0.243824,-1.798037,-1.859437
2019-01-06,-0.382627,0.622202,0.648539,0.83344,0.649026
2019-01-07,0.362207,0.229959,-0.035647,0.46765,-0.506128
2019-01-08,-0.805466,0.508145,-2.756733,-0.198599,0.178525
2019-01-09,-0.311128,0.074136,-0.299527,1.354156,0.265027
2019-01-10,-0.349263,-0.060862,0.930853,-0.660917,1.217448


In [15]:
# New column F keeps the value of A where A is positive and is NaN elsewhere,
# which gives the frame missing values to work with.
df['F'] = df['A'].where(df['A'] > 0)

In [16]:
# Preview the first rows; F is NaN on the rows where A is not positive.
df.head()

Unnamed: 0,A,B,C,D,E,F
2019-01-01,1.075284,-0.793551,-1.709712,-2.397818,-0.91316,1.075284
2019-01-02,-1.745281,-0.320001,-0.746441,0.176754,1.552316,
2019-01-03,-0.605726,-2.10541,-1.250606,0.530297,1.207477,
2019-01-04,0.554439,-0.077217,0.453242,0.173907,-0.241933,0.554439
2019-01-05,0.082602,1.115591,0.243824,-1.798037,-1.859437,0.082602


In [17]:
# Handling missing data: take two independent copies of df, one for each
# strategy shown next. Note that this rebinds df2, replacing the small
# a..f example frame built near the top of the notebook.

df2, df3 = df.copy(), df.copy()

In [18]:
# Preview the first rows of the df2 copy.
df2.head()

Unnamed: 0,A,B,C,D,E,F
2019-01-01,1.075284,-0.793551,-1.709712,-2.397818,-0.91316,1.075284
2019-01-02,-1.745281,-0.320001,-0.746441,0.176754,1.552316,
2019-01-03,-0.605726,-2.10541,-1.250606,0.530297,1.207477,
2019-01-04,0.554439,-0.077217,0.453242,0.173907,-0.241933,0.554439
2019-01-05,0.082602,1.115591,0.243824,-1.798037,-1.859437,0.082602


In [19]:
# Preview the first rows of the df3 copy.
df3.head()

Unnamed: 0,A,B,C,D,E,F
2019-01-01,1.075284,-0.793551,-1.709712,-2.397818,-0.91316,1.075284
2019-01-02,-1.745281,-0.320001,-0.746441,0.176754,1.552316,
2019-01-03,-0.605726,-2.10541,-1.250606,0.530297,1.207477,
2019-01-04,0.554439,-0.077217,0.453242,0.173907,-0.241933,0.554439
2019-01-05,0.082602,1.115591,0.243824,-1.798037,-1.859437,0.082602


In [21]:
# Dropping every row that contains a NaN in one go. The drawback is that the
# valid values in the other columns of those rows are discarded as well.
# The result is not assigned, so df2 itself keeps all of its rows.

df2.dropna().shape

(27, 6)

In [22]:
# Replace the NaN values with the mean of column A, as an example; the fill
# is applied to column F only. The filled Series is displayed, not stored,
# so df3 is left unchanged.

df3['F'].fillna(np.mean(df3['A']))

2019-01-01    1.075284
2019-01-02    0.007861
2019-01-03    0.007861
2019-01-04    0.554439
2019-01-05    0.082602
2019-01-06    0.007861
2019-01-07    0.362207
2019-01-08    0.007861
2019-01-09    0.007861
2019-01-10    0.007861
2019-01-11    0.962824
2019-01-12    0.007861
2019-01-13    1.242645
2019-01-14    0.007861
2019-01-15    0.870252
2019-01-16    0.007861
2019-01-17    0.007861
2019-01-18    1.407822
2019-01-19    0.007861
2019-01-20    0.007861
2019-01-21    1.683031
2019-01-22    0.462587
2019-01-23    0.174253
2019-01-24    0.007861
2019-01-25    0.007861
2019-01-26    0.212279
2019-01-27    0.007861
2019-01-28    0.948198
2019-01-29    0.912892
2019-01-30    1.096362
2019-01-31    1.146992
2019-02-01    0.360422
2019-02-02    0.007861
2019-02-03    0.007861
2019-02-04    0.007861
2019-02-05    0.279629
2019-02-06    0.007861
2019-02-07    0.007861
2019-02-08    0.007861
2019-02-09    0.651256
2019-02-10    1.561398
2019-02-11    0.007861
2019-02-12    0.288271
2019-02-13 

In [24]:
# Independent copy of df (including the NaNs in F) for the next fill example.
df4 = df.copy()

In [25]:
# Another kind of fill: replace every NaN in the frame with the constant 77.
# The filled frame is displayed, not stored, so df4 is left unchanged.

df4.fillna(77)

Unnamed: 0,A,B,C,D,E,F
2019-01-01,1.075284,-0.793551,-1.709712,-2.397818,-0.91316,1.075284
2019-01-02,-1.745281,-0.320001,-0.746441,0.176754,1.552316,77.0
2019-01-03,-0.605726,-2.10541,-1.250606,0.530297,1.207477,77.0
2019-01-04,0.554439,-0.077217,0.453242,0.173907,-0.241933,0.554439
2019-01-05,0.082602,1.115591,0.243824,-1.798037,-1.859437,0.082602
2019-01-06,-0.382627,0.622202,0.648539,0.83344,0.649026,77.0
2019-01-07,0.362207,0.229959,-0.035647,0.46765,-0.506128,0.362207
2019-01-08,-0.805466,0.508145,-2.756733,-0.198599,0.178525,77.0
2019-01-09,-0.311128,0.074136,-0.299527,1.354156,0.265027,77.0
2019-01-10,-0.349263,-0.060862,0.930853,-0.660917,1.217448,77.0


##  Verificando Dados Repetidos

In [28]:
# Rebuild the small categorical example frame in this cell. An earlier cell
# rebinds df2 to a copy of the 60-row random frame, so on a fresh
# Restart & Run All this section would otherwise inspect that frame instead
# of the 4-row one that actually contains repeated rows.
df2 = pd.DataFrame({'a': 1.,
                    'b': pd.Timestamp('20130101'),
                    'c': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'd': np.array([3]*4, dtype='int32'),
                    'e': pd.Categorical(['test', 'train', 'test', 'train']),
                    'f': 'python'})

df2

Unnamed: 0,a,b,c,d,e,f
0,1.0,2013-01-01,1.0,3,test,python
1,1.0,2013-01-01,1.0,3,train,python
2,1.0,2013-01-01,1.0,3,test,python
3,1.0,2013-01-01,1.0,3,train,python


In [29]:
# Number of distinct values in each column.
df2.nunique()

a    1
b    1
c    1
d    1
e    2
f    1
dtype: int64

In [30]:
# Number of distinct values in each row (axis=1); dropna=False makes NaN
# count as a value instead of being ignored.
df2.nunique(axis=1, dropna=False)

0    5
1    5
2    5
3    5
dtype: int64

## Removendo Valores Duplicados

In [32]:
# Show the frame with fully repeated rows removed, keeping the first
# occurrence of each. The result is not assigned, so df2 is left unchanged.
df2.drop_duplicates()

Unnamed: 0,a,b,c,d,e,f
0,1.0,2013-01-01,1.0,3,test,python
1,1.0,2013-01-01,1.0,3,train,python
