# Handle Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a small Series with one missing entry (np.nan) for the null-handling
# examples in the following cells.
data = [
    'Apple',
    'Google',
    np.nan,  # the missing entry
    'Facebook',
]
s1 = pd.Series(data=data)

In [3]:
# Show the Series as the cell's last expression rather than print(), so it is
# displayed the same way as the other Series/DataFrame cells in this notebook.
s1

0       Apple
1      Google
2         NaN
3    Facebook
dtype: object


In [4]:
# Boolean mask: True where an entry of s1 is missing.
s1.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Show s1 without its missing entry. The result is not assigned, so s1 itself
# still contains the NaN for the next cell.
s1.dropna()

0       Apple
1      Google
3    Facebook
dtype: object

In [6]:
# Same filtering done by hand: select only the entries whose mask is True,
# i.e. the ones that are not missing.
s1.loc[s1.notna()]

0       Apple
1      Google
3    Facebook
dtype: object

In [7]:
# 4x3 frame whose rows range from complete to entirely missing, used by the
# dropna examples below.
df1 = pd.DataFrame(data=[
    [2, 3, 35],                # no missing values
    [54, np.nan, np.nan],      # two missing values
    [np.nan, np.nan, np.nan],  # every value missing
    [np.nan, 3, 6],            # one missing value
])

In [8]:
# Display the frame to see where the NaN values sit.
df1

Unnamed: 0,0,1,2
0,2.0,3.0,35.0
1,54.0,,
2,,,
3,,3.0,6.0


In [9]:
# Drop every row that contains at least one NaN ('any' is the default policy).
df1.dropna(how='any')

Unnamed: 0,0,1,2
0,2.0,3.0,35.0


In [10]:
# Drop only the rows in which every value is NaN; partially filled rows stay.
df1.dropna(axis='index', how='all')

Unnamed: 0,0,1,2
0,2.0,3.0,35.0
1,54.0,,
3,,3.0,6.0


In [11]:
# Use a seeded generator so this 7x5 standard-normal frame is identical on
# every Restart & Run All. Outputs saved from an earlier, unseeded run will
# differ until the notebook is re-executed.
rng = np.random.default_rng(seed=42)
df2 = pd.DataFrame(rng.standard_normal((7, 5)))

In [12]:
# Display the fully populated frame before any values are blanked out.
df2

Unnamed: 0,0,1,2,3,4
0,0.928978,-0.037131,-0.058433,-0.161708,0.730134
1,1.439175,-0.560314,0.811599,-0.531053,-0.517902
2,1.137454,-0.147811,1.334514,0.707902,0.947469
3,0.804225,0.311968,-0.02888,1.370988,-0.369438
4,1.279717,0.021012,-0.704189,1.011243,-0.193544
5,-1.689263,-0.18188,-0.211029,-0.894305,-0.999903
6,-0.563326,-0.584646,-0.674615,-0.089197,-0.22327


In [13]:
# Blank out the first four rows of column 1 and the first three rows of
# column 3 (in place), then display the modified frame.
df2.iloc[0:4, 1] = np.nan
df2.iloc[0:3, 3] = np.nan
df2

Unnamed: 0,0,1,2,3,4
0,0.928978,,-0.058433,,0.730134
1,1.439175,,0.811599,,-0.517902
2,1.137454,,1.334514,,0.947469
3,0.804225,,-0.02888,1.370988,-0.369438
4,1.279717,0.021012,-0.704189,1.011243,-0.193544
5,-1.689263,-0.18188,-0.211029,-0.894305,-0.999903
6,-0.563326,-0.584646,-0.674615,-0.089197,-0.22327


In [14]:
# Replace every NaN with the constant 0; the result is displayed, not stored.
df2.fillna(value=0)

Unnamed: 0,0,1,2,3,4
0,0.928978,0.0,-0.058433,0.0,0.730134
1,1.439175,0.0,0.811599,0.0,-0.517902
2,1.137454,0.0,1.334514,0.0,0.947469
3,0.804225,0.0,-0.02888,1.370988,-0.369438
4,1.279717,0.021012,-0.704189,1.011243,-0.193544
5,-1.689263,-0.18188,-0.211029,-0.894305,-0.999903
6,-0.563326,-0.584646,-0.674615,-0.089197,-0.22327


In [15]:
# Fill the NaNs in each column with that column's mean.
df2.fillna(value=df2.mean(axis=0))

Unnamed: 0,0,1,2,3,4
0,0.928978,-0.248505,-0.058433,0.349682,0.730134
1,1.439175,-0.248505,0.811599,0.349682,-0.517902
2,1.137454,-0.248505,1.334514,0.349682,0.947469
3,0.804225,-0.248505,-0.02888,1.370988,-0.369438
4,1.279717,0.021012,-0.704189,1.011243,-0.193544
5,-1.689263,-0.18188,-0.211029,-0.894305,-0.999903
6,-0.563326,-0.584646,-0.674615,-0.089197,-0.22327


In [16]:
# Per-column means of df2; these are the values the previous cell used to
# fill the NaNs.
df2.mean()

0    0.476709
1   -0.248505
2    0.066995
3    0.349682
4   -0.089494
dtype: float64

# Remove Duplicates and Replace Values

In [17]:
# Seven rows whose last two rows (index 5 and 6) are identical in both
# columns, used by the duplicate-handling examples below.
df3 = pd.DataFrame(dict(
    c1=['Three', 'One', 'Three', 'One', 'Three', 'One', 'One'],
    c2=[1, 1, 2, 3, 3, 4, 4],
))

In [18]:
# Display the frame to inspect the repeated rows.
df3

Unnamed: 0,c1,c2
0,Three,1
1,One,1
2,Three,2
3,One,3
4,Three,3
5,One,4
6,One,4


In [19]:
# Flag rows that repeat an earlier row in every column; the first occurrence
# is reported as False.
df3.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [20]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df3.drop_duplicates(keep='first')

Unnamed: 0,c1,c2
0,Three,1
1,One,1
2,Three,2
3,One,3
4,Three,3
5,One,4


In [21]:
# De-duplicate on column 'c1' only: keep the first row for each distinct c1 value.
df3.drop_duplicates(subset=['c1'])

Unnamed: 0,c1,c2
0,Three,1
1,One,1


In [22]:
# De-duplicate on column 'c2' only: keep the first row for each distinct c2 value.
df3.drop_duplicates(subset=['c2'])

Unnamed: 0,c1,c2
0,Three,1
2,Three,2
3,One,3
5,One,4


In [23]:
# Numeric Series containing the values -999 and -1000 that the replace()
# examples below substitute.
s2 = pd.Series(
    data=[3.14, -999, 2.87, -999, -1000, 3, 100],
)

In [24]:
# Display the Series before any values are replaced.
s2

0       3.14
1    -999.00
2       2.87
3    -999.00
4   -1000.00
5       3.00
6     100.00
dtype: float64

In [25]:
# Replace a single value: every -999 becomes 0 (-1000 is left alone).
s2.replace(to_replace=-999, value=0)

0       3.14
1       0.00
2       2.87
3       0.00
4   -1000.00
5       3.00
6     100.00
dtype: float64

In [26]:
# Replace several values with the same substitute: -999 and -1000 both become 0.
s2.replace(to_replace=[-999, -1000], value=0)

0      3.14
1      0.00
2      2.87
3      0.00
4      0.00
5      3.00
6    100.00
dtype: float64

In [27]:
# Replace each value with its own substitute: -999 -> 0 and -1000 -> NaN.
s2.replace({-999: 0, -1000: np.nan})

0      3.14
1      0.00
2      2.87
3      0.00
4       NaN
5      3.00
6    100.00
dtype: float64

# String Manipulation

In [28]:
# Comma-separated sample; note the space after the second comma, which the
# cells below have to strip.
test_string = "a,b, test"

In [29]:
# With no separator given, split() breaks on whitespace, so the commas stay
# inside the first piece.
test_string.split()

['a,b,', 'test']

In [30]:
# Split on commas; the last piece keeps its leading space.
test_string.split(sep=',')

['a', 'b', ' test']

In [31]:
# Split on commas and trim the surrounding whitespace from each piece.
var = list(map(str.strip, test_string.split(',')))

In [32]:
# Display the cleaned list of fields.
var

['a', 'b', 'test']

In [33]:
# Unpack the three cleaned fields into separate names.
x, y, z = var

In [34]:
# Rebuild a colon-separated string by manual concatenation.
x + ':' + y + ':' + z

'a:b:test'

In [35]:
# Same result using str.join on the list, without naming each field.
':'.join(var)

'a:b:test'