Data Cleansing

Step 1: Handle missing values (NA or NaN) using pandas

In [None]:
# Import pandas and NumPy
import pandas as pd
import numpy as np

# Build a 5x3 frame of standard-normal draws on the labels a, c, e, f, h, then
# reindex onto the full a..h label set. Labels that had no row in the original
# frame (b, d, g) come back as all-NaN rows.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
).reindex(list('abcdefgh'))

print(df)

        one       two     three
a -0.710835  0.273192  0.177848
b       NaN       NaN       NaN
c  1.317657 -0.705674  0.931030
d       NaN       NaN       NaN
e -0.704313 -0.486047 -0.803135
f  0.607179  1.159336  0.806775
g       NaN       NaN       NaN
h -0.667846 -0.424706 -1.999616


Step 2: Check for Missing Values

In [None]:
import pandas as pd
import numpy as np

# Labels that carry data versus the labels requested after reindexing.
# Note that 'g' is not requested here, so only 'b' and 'd' end up as NaN rows.
present = ['a', 'c', 'e', 'f', 'h']
wanted = ['a', 'b', 'c', 'd', 'e', 'f', 'h']

df = pd.DataFrame(
    np.random.randn(len(present), 3),
    index=present,
    columns=['one', 'two', 'three'],
)
df = df.reindex(wanted)

# Boolean mask over column 'one': True wherever the value is missing.
print(df['one'].isnull())

a    False
b     True
c    False
d     True
e    False
f    False
h    False
Name: one, dtype: bool


Step 3: Replace NaN with a Scalar Value: The following program shows how you can replace "NaN" with "0".

In [None]:
import pandas as pd
import numpy as np
# 3x3 random frame indexed a/c/e. Reindexing to a/b/c introduces an all-NaN
# row 'b' and drops 'e', which is absent from the new index.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c'])

# fillna(0) is not assigned back, so df itself keeps its NaN row;
# only the printed copy has zeros.
print("NaN replaced with '0': ")
print(df.fillna(0))



        one       two     three      four
a -1.928886 -0.016205 -0.528905 -0.677917
b       NaN       NaN       NaN       NaN
c  0.277407 -1.845684  1.037039  0.373068
NaN replaced with '0': 
        one       two     three      four
a -1.928886 -0.016205 -0.528905 -0.677917
b  0.000000  0.000000  0.000000  0.000000
c  0.277407 -1.845684  1.037039  0.373068


Step 4: Fill NA Forward and Backward

In [None]:
import pandas as pd
import numpy as np

# Random 5x3 frame on the labels a, c, e, f, h; reindexing onto a..h adds
# all-NaN rows for the labels that had no data (b, d, g).
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method='pad'), which emits a
# FutureWarning on recent pandas releases. The backward counterpart is df.bfill().
df_ffilled = df.ffill()
print(df_ffilled)


        one       two     three
a  0.331312  0.297102 -1.558979
b  0.331312  0.297102 -1.558979
c -1.454208  0.638001  0.626375
d -1.454208  0.638001  0.626375
e  1.152495  0.434369  1.582584
f -0.146963  0.662048  0.408827
g -0.146963  0.662048  0.408827
h  2.351062 -0.085140 -0.695948


  print(df.fillna(method = 'pad'))


Step 5: Drop Missing Values

In [None]:
import pandas as pd
import numpy as np

# Random 5x3 frame reindexed onto a..h, which leaves b, d and g as all-NaN rows.
df = (
    pd.DataFrame(
        np.random.randn(5, 3),
        index=list('acefh'),
        columns=['one', 'two', 'three'],
    )
    .reindex(list('abcdefgh'))
)

# dropna() with its defaults removes every row that contains a missing value;
# the result is only printed, so df itself is left unchanged.
print(df.dropna())



        one       two     three
a -2.114640 -2.248270 -1.359405
c -1.208584 -0.128246 -1.085581
e -1.173840 -1.618038 -1.155815
f -0.079230 -1.917348 -1.228620
h  0.337675  0.483277  1.351448


Step 6: Replace Missing or Generic Values

In [None]:
import pandas as pd
import numpy as np
# Two integer columns; 2000 (in 'one') and 1000 (in 'two') are the values
# that get swapped out below.
df = pd.DataFrame(
    {
        'one': [10, 20, 30, 40, 50, 2000],
        'two': [1000, 0, 30, 40, 50, 60],
    }
)

# Dict form of replace: each key is a value to look for, each value its substitute.
# The mapping is applied across the whole frame and the result is only printed.
replacements = {1000: 10, 2000: 60}
print(df.replace(replacements))


   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60
