In [1]:
import pandas as pd
import numpy as np

### Missing data in pandas series and dataframes

In [2]:
# Build the demo frame from a seeded generator so that every output derived
# from it is reproducible on Restart & Run All.
# The index deliberately skips 'b', 'd' and 'g'; a later cell reindexes onto
# the full a..h range to introduce missing rows.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((5, 3)),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
a,0.587492,0.121135,-0.708097
c,-0.88425,0.295378,1.614362
e,-0.253255,0.037605,1.788267
f,0.388858,-0.763031,0.096082
h,-0.341137,0.088779,-0.108117


In [3]:
df['four'] = 'bar'

In [4]:
df['five'] =  df['one'] > 0

In [5]:
df

Unnamed: 0,one,two,three,four,five
a,-0.651231,-0.229255,-1.476471,bar,False
c,0.817945,0.501585,1.498218,bar,True
e,0.129308,-0.272942,-0.514834,bar,True
f,-0.935556,-0.514915,-0.263176,bar,False
h,0.525708,0.31337,-0.809645,bar,True


In [12]:
# reindex conforms the frame to a new list of row labels; labels that are not in
# the original index ('b', 'd', 'g') come back as rows filled with missing values
df2 = df.reindex(['a','b','c','d','e','f','g','h'])

In [13]:
df2

Unnamed: 0,one,two,three,four,five
a,0.587492,0.121135,-0.708097,bar,True
b,,,,,
c,-0.88425,0.295378,1.614362,bar,False
d,,,,,
e,-0.253255,0.037605,1.788267,bar,False
f,0.388858,-0.763031,0.096082,bar,True
g,,,,,
h,-0.341137,0.088779,-0.108117,bar,False


In [14]:
#ways to check that values are missing
pd.isnull(df2['one'])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [9]:
df2.isnull()

Unnamed: 0,one,two,three,four,five
a,False,False,False,False,False
b,True,True,True,True,True
c,False,False,False,False,False
d,True,True,True,True,True
e,False,False,False,False,False
f,False,False,False,False,False
g,True,True,True,True,True
h,False,False,False,False,False


In [10]:
df2.isnull().any() #will look column wise

one      True
two      True
three    True
four     True
five     True
dtype: bool

In [11]:
df2['four'].notnull() #looks row wise or index wise

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

In [12]:
# assigning a scalar overwrites every row of column 'one' with NaN; this mutates df
# itself, so any cell run after this one that reads df['one'] sees only missing values
df['one'] = np.nan # every value in 'one' becomes NaN

In [13]:
df

Unnamed: 0,one,two,three,four,five
a,,-0.229255,-1.476471,bar,False
c,,0.501585,1.498218,bar,True
e,,-0.272942,-0.514834,bar,True
f,,-0.514915,-0.263176,bar,False
h,,0.31337,-0.809645,bar,True


In [16]:
# for datetime columns the missing-value marker is NaT rather than NaN;
# work on a copy so the datetime experiments do not modify df
df2 = df.copy()

In [17]:
df2['Timestamp'] = pd.Timestamp('20120101')

In [18]:
df2

Unnamed: 0,one,two,three,four,five,Timestamp
a,0.587492,0.121135,-0.708097,bar,True,2012-01-01
c,-0.88425,0.295378,1.614362,bar,False,2012-01-01
e,-0.253255,0.037605,1.788267,bar,False,2012-01-01
f,0.388858,-0.763031,0.096082,bar,True,2012-01-01
h,-0.341137,0.088779,-0.108117,bar,False,2012-01-01


In [19]:
# Blank out 'one' and 'Timestamp' for rows a, c and h, then display the frame.
df2.loc[['a', 'c', 'h'], ['one', 'Timestamp']] = np.nan
df2

Unnamed: 0,one,two,three,four,five,Timestamp
a,,0.121135,-0.708097,bar,True,NaT
c,,0.295378,1.614362,bar,False,NaT
e,-0.253255,0.037605,1.788267,bar,False,2012-01-01
f,0.388858,-0.763031,0.096082,bar,True,2012-01-01
h,,0.088779,-0.108117,bar,False,NaT


In [20]:
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# counting the per-column dtypes gives the same summary on every version.
df2.dtypes.value_counts()

bool              1
datetime64[ns]    1
float64           3
object            1
dtype: int64

### Inserting *missing values*

In [21]:
df.values

array([[0.5874916008000282, 0.1211351242959761, -0.7080968823071693,
        'bar', True],
       [-0.8842497278152779, 0.2953780743483767, 1.61436181230923, 'bar',
        False],
       [-0.2532551506151949, 0.037604764985995486, 1.7882670401295733,
        'bar', False],
       [0.38885800508748586, -0.7630305311368237, 0.09608249741587235,
        'bar', True],
       [-0.34113742678463316, 0.08877915048361772, -0.10811702217959206,
        'bar', False]], dtype=object)

In [23]:
len(df)

5

In [24]:
len(df.index)

5

In [25]:
df.columns.values

array(['one', 'two', 'three', 'four', 'five'], dtype=object)

In [6]:
df['one'].sum()

-0.5022926993275919

In [7]:
# Row-wise mean. The frame holds a string column ('four'), which pandas >= 2.0 no
# longer drops silently, so restrict the reduction to numeric/bool columns.
# The axis is spelled out by keyword for readability.
df.mean(axis=1, numeric_only=True)

a    0.250132
c    0.256373
e    0.393154
f    0.180477
h   -0.090119
dtype: float64

In [8]:
df.cumsum()

Unnamed: 0,one,two,three,four,five
a,0.587492,0.121135,-0.708097,bar,True
c,-0.296758,0.416513,0.906265,barbar,1
e,-0.550013,0.454118,2.69453,barbarbar,1
f,-0.161155,-0.308913,2.79061,barbarbarbar,2
h,-0.502293,-0.220133,2.6825,barbarbarbarbar,2


###  groupby

In [9]:
# groupby leaves out rows whose group key is NA by default (dropna=True); the
# original note compares this to R.
# numeric_only=True keeps the string column 'four' out of the mean, which would
# otherwise raise on pandas >= 2.0.
df.groupby('one').mean(numeric_only=True)

Unnamed: 0_level_0,two,three,five
one,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.88425,0.295378,1.614362,False
-0.341137,0.088779,-0.108117,False
-0.253255,0.037605,1.788267,False
0.388858,-0.763031,0.096082,True
0.587492,0.121135,-0.708097,True


## Cleaning/Filling missing values

In [22]:
df2

Unnamed: 0,one,two,three,four,five,Timestamp
a,,0.121135,-0.708097,bar,True,NaT
c,,0.295378,1.614362,bar,False,NaT
e,-0.253255,0.037605,1.788267,bar,False,2012-01-01
f,0.388858,-0.763031,0.096082,bar,True,2012-01-01
h,,0.088779,-0.108117,bar,False,NaT


In [23]:
# a scalar fill value is applied to every column, not just the float ones: in the
# recorded output the NaT entries of 'Timestamp' show up as 1970-01-01
df2.fillna(0)

Unnamed: 0,one,two,three,four,five,Timestamp
a,0.0,0.121135,-0.708097,bar,True,1970-01-01
c,0.0,0.295378,1.614362,bar,False,1970-01-01
e,-0.253255,0.037605,1.788267,bar,False,2012-01-01
f,0.388858,-0.763031,0.096082,bar,True,2012-01-01
h,0.0,0.088779,-0.108117,bar,False,1970-01-01


In [24]:
df2['Timestamp'].fillna('missing')

a                missing
c                missing
e    2012-01-01 00:00:00
f    2012-01-01 00:00:00
h                missing
Name: Timestamp, dtype: object

In [25]:
df2

Unnamed: 0,one,two,three,four,five,Timestamp
a,,0.121135,-0.708097,bar,True,NaT
c,,0.295378,1.614362,bar,False,NaT
e,-0.253255,0.037605,1.788267,bar,False,2012-01-01
f,0.388858,-0.763031,0.096082,bar,True,2012-01-01
h,,0.088779,-0.108117,bar,False,NaT


In [32]:
# Forward fill: propagate the last valid value down each column. Leading gaps
# stay missing because there is nothing above them to copy.
# fillna(method=...) is deprecated in recent pandas; ffill() is the direct equivalent.
df2.ffill()

Unnamed: 0,one,two,three,four,five,Timestamp
a,,0.121135,-0.708097,bar,True,NaT
c,,0.295378,1.614362,bar,False,NaT
e,-0.253255,0.037605,1.788267,bar,False,2012-01-01
f,0.388858,-0.763031,0.096082,bar,True,2012-01-01
h,0.388858,0.088779,-0.108117,bar,False,2012-01-01


In [33]:
# Backward fill: pull the next valid value up each column. Trailing gaps stay
# missing because there is nothing below them to copy.
# fillna(method=...) is deprecated in recent pandas; bfill() is the direct equivalent.
df2.bfill()

Unnamed: 0,one,two,three,four,five,Timestamp
a,-0.253255,0.121135,-0.708097,bar,True,2012-01-01
c,-0.253255,0.295378,1.614362,bar,False,2012-01-01
e,-0.253255,0.037605,1.788267,bar,False,2012-01-01
f,0.388858,-0.763031,0.096082,bar,True,2012-01-01
h,,0.088779,-0.108117,bar,False,NaT


In [34]:
# 'pad' is the legacy alias of 'ffill', so this gives the same result as a forward fill.
# fillna(method=...) is deprecated in recent pandas; call ffill() directly.
df2.ffill()

Unnamed: 0,one,two,three,four,five,Timestamp
a,,0.121135,-0.708097,bar,True,NaT
c,,0.295378,1.614362,bar,False,NaT
e,-0.253255,0.037605,1.788267,bar,False,2012-01-01
f,0.388858,-0.763031,0.096082,bar,True,2012-01-01
h,0.388858,0.088779,-0.108117,bar,False,2012-01-01


In [35]:
# 'backfill' is the legacy alias of 'bfill', so this gives the same result as a backward fill.
# fillna(method=...) is deprecated in recent pandas; call bfill() directly.
df2.bfill()

Unnamed: 0,one,two,three,four,five,Timestamp
a,-0.253255,0.121135,-0.708097,bar,True,2012-01-01
c,-0.253255,0.295378,1.614362,bar,False,2012-01-01
e,-0.253255,0.037605,1.788267,bar,False,2012-01-01
f,0.388858,-0.763031,0.096082,bar,True,2012-01-01
h,,0.088779,-0.108117,bar,False,NaT


### Filling with a pandas object

In [39]:
# Random 10x3 frame used for the fill/drop examples; drawn from a seeded
# generator so the displayed values are reproducible on re-run.
dff = pd.DataFrame(
    np.random.default_rng(7).standard_normal((10, 3)),
    columns=list('ABC'),
)
dff

Unnamed: 0,A,B,C
0,0.725832,1.050192,0.442671
1,0.001439,-0.609294,0.544843
2,-0.071433,-0.491755,1.387105
3,-1.291613,0.135369,-0.193475
4,-0.054933,-1.083329,-0.251776
5,0.826667,0.794593,-0.439508
6,-2.228914,0.484411,-0.493615
7,-0.269365,0.738296,0.776757
8,-0.981815,0.139727,0.294662
9,-0.296223,-0.4039,1.406658


In [40]:
dff.iloc[3:5,0] = np.nan

In [41]:
dff.iloc[4:6, 1] = np.nan

In [42]:
dff.iloc[5:8, 2] = np.nan

In [43]:
dff

Unnamed: 0,A,B,C
0,0.725832,1.050192,0.442671
1,0.001439,-0.609294,0.544843
2,-0.071433,-0.491755,1.387105
3,,0.135369,-0.193475
4,,,-0.251776
5,0.826667,,
6,-2.228914,0.484411,
7,-0.269365,0.738296,
8,-0.981815,0.139727,0.294662
9,-0.296223,-0.4039,1.406658


In [44]:
dff.fillna(dff.mean())

Unnamed: 0,A,B,C
0,0.725832,1.050192,0.442671
1,0.001439,-0.609294,0.544843
2,-0.071433,-0.491755,1.387105
3,-0.286727,0.135369,-0.193475
4,-0.286727,0.130381,-0.251776
5,0.826667,0.130381,0.51867
6,-2.228914,0.484411,0.51867
7,-0.269365,0.738296,0.51867
8,-0.981815,0.139727,0.294662
9,-0.296223,-0.4039,1.406658


In [46]:
# only the means of columns B and C are passed as fill values, so the NaNs in
# column A are left unfilled
dff.fillna(dff.mean()['B':'C']) # A is excluded from the fill

Unnamed: 0,A,B,C
0,0.725832,1.050192,0.442671
1,0.001439,-0.609294,0.544843
2,-0.071433,-0.491755,1.387105
3,,0.135369,-0.193475
4,,0.130381,-0.251776
5,0.826667,0.130381,0.51867
6,-2.228914,0.484411,0.51867
7,-0.269365,0.738296,0.51867
8,-0.981815,0.139727,0.294662
9,-0.296223,-0.4039,1.406658


### Dropping NA with dropna

In [47]:
dff.dropna(axis=0)

Unnamed: 0,A,B,C
0,0.725832,1.050192,0.442671
1,0.001439,-0.609294,0.544843
2,-0.071433,-0.491755,1.387105
8,-0.981815,0.139727,0.294662
9,-0.296223,-0.4039,1.406658


In [49]:
dff['A'].dropna()

0    0.725832
1    0.001439
2   -0.071433
5    0.826667
6   -2.228914
7   -0.269365
8   -0.981815
9   -0.296223
Name: A, dtype: float64

### Replacing

In [50]:
ser = pd.Series([0., 1., 2., 3., 4.])

In [51]:
ser.replace(0,5)

0    5.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [52]:
#mapping replacement
ser.replace({0: 10, 1: 100})

0     10.0
1    100.0
2      2.0
3      3.0
4      4.0
dtype: float64

In [53]:
df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]})

In [54]:
df.replace({'a':0,'b':5}, 100)

Unnamed: 0,a,b
0,100,100
1,1,6
2,2,7
3,3,8
4,4,9


In [55]:
# Replace each of the values 1, 2 and 3 with the closest preceding value that is
# not being replaced. replace(..., method='pad') is deprecated in recent pandas,
# so mask the targets and forward-fill instead.
# Note: ffill would also fill NaNs already present in the series; ser has none.
ser.mask(ser.isin([1, 2, 3])).ffill()

0    0.0
1    0.0
2    0.0
3    0.0
4    4.0
dtype: float64

### String / regular expression replacement

In [56]:
d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}

In [57]:
df = pd.DataFrame(d)

In [58]:
df.replace('.', np.nan)

Unnamed: 0,a,b,c
0,0,a,a
1,1,b,b
2,2,,
3,3,,d


In [59]:
# same replacement done with a regular expression: the pattern matches a literal
# '.' with optional whitespace on either side
df.replace(r'\s*\.\s*', np.nan, regex= True)

Unnamed: 0,a,b,c
0,0,a,a
1,1,b,b
2,2,,
3,3,,d


In [60]:
df.replace(['a', '.'], ['b', np.nan])

Unnamed: 0,a,b,c
0,0,b,b
1,1,b,b
2,2,,
3,3,,d


## Interpolation (see the SciPy documentation)

## Options: `pd.options`, `set_option`, `get_option`, and `reset_option`

In [62]:
# Use the full option key: abbreviated names are resolved by pattern matching and
# can become ambiguous if pandas adds another option with a similar name.
pd.get_option('display.max_info_rows')

1690785

In [65]:
# Read the current value using the full option key; abbreviated names are
# resolved by pattern matching and can become ambiguous.
pd.get_option('display.max_info_columns')

100

In [66]:
# Raise the limit to 200, addressing the option by its full key so the call
# cannot become ambiguous if a similarly named option is added.
pd.set_option('display.max_info_columns', 200)

In [67]:
# Confirm the new value, using the full option key rather than the abbreviated name.
pd.get_option('display.max_info_columns')

200

In [68]:
# Restore the default. reset_option also pattern-matches its argument, so pass
# the full key to be sure exactly this one option is reset.
pd.reset_option('display.max_info_columns')

In [69]:
# Check that the option is back at its default, using the full option key.
pd.get_option('display.max_info_columns')

100

In [70]:
# options can also be read as attributes of pd.options
pd.options.display.encoding # shows the current value of the option

'UTF-8'

In [71]:
pd.options.display.memory_usage

True

In [72]:
pd.options.display.max_columns

20

In [73]:
pd.options.display.max_colwidth

50

In [74]:
pd.options.display.max_info_columns

100

In [75]:
pd.set_option('display.max_colwidth', 60)

In [76]:
pd.get_option('display.max_colwidth')

60

In [77]:
pd.reset_option('display.max_colwidth')

In [78]:
pd.get_option('display.max_colwidth')

50

In [None]:
pd.options.compute.use_numexpr