In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'],columns=['one', 'two', 'three'])

In [5]:
df['four'] = 'bar'

In [6]:
df['five'] = df['one'] > 0

In [7]:
df

Unnamed: 0,one,two,three,four,five
a,-0.432938,0.154795,-0.649918,bar,False
c,0.838026,0.71695,1.469447,bar,True
e,-1.248365,0.886771,0.550933,bar,False
f,-0.07164,0.981329,-1.599551,bar,False
h,-1.146178,-1.055358,-0.588433,bar,False


In [8]:
df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

In [9]:
df2

Unnamed: 0,one,two,three,four,five
a,-0.432938,0.154795,-0.649918,bar,False
b,,,,,
c,0.838026,0.71695,1.469447,bar,True
d,,,,,
e,-1.248365,0.886771,0.550933,bar,False
f,-0.07164,0.981329,-1.599551,bar,False
g,,,,,
h,-1.146178,-1.055358,-0.588433,bar,False


In [10]:
pd.isnull(df2['one'])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [11]:
df2['four'].notnull()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

#datetimes

In [12]:
df2 = df.copy()
df2['timestamp'] = pd.Timestamp('20120101')
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,-0.432938,0.154795,-0.649918,bar,False,2012-01-01
c,0.838026,0.71695,1.469447,bar,True,2012-01-01
e,-1.248365,0.886771,0.550933,bar,False,2012-01-01
f,-0.07164,0.981329,-1.599551,bar,False,2012-01-01
h,-1.146178,-1.055358,-0.588433,bar,False,2012-01-01


In [13]:
df2.loc[['a','c','h'],['one','timestamp']] = np.nan
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.154795,-0.649918,bar,False,NaT
c,,0.71695,1.469447,bar,True,NaT
e,-1.248365,0.886771,0.550933,bar,False,2012-01-01
f,-0.07164,0.981329,-1.599551,bar,False,2012-01-01
h,,-1.055358,-0.588433,bar,False,NaT


In [14]:
df2.get_dtype_counts()

bool              1
datetime64[ns]    1
float64           3
object            1
dtype: int64

# Inserting missing data

In [15]:
s = pd.Series([1, 2, 3])
s.loc[0] = None
s

0    NaN
1    2.0
2    3.0
dtype: float64

In [16]:
s = pd.Series(["a", "b", "c"])
s.loc[0] = None
s.loc[1] = np.nan
s

0    None
1     NaN
2       c
dtype: object

In [19]:
df2.groupby('one').mean()

Unnamed: 0_level_0,two,three,five
one,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.248365,0.886771,0.550933,False
-0.07164,0.981329,-1.599551,False


Cleaning / filling missing data

In [20]:
df2.fillna(0)

Unnamed: 0,one,two,three,four,five,timestamp
a,0.0,0.154795,-0.649918,bar,False,1970-01-01
c,0.0,0.71695,1.469447,bar,True,1970-01-01
e,-1.248365,0.886771,0.550933,bar,False,2012-01-01
f,-0.07164,0.981329,-1.599551,bar,False,2012-01-01
h,0.0,-1.055358,-0.588433,bar,False,1970-01-01


In [22]:
df2.fillna(method='pad')

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.154795,-0.649918,bar,False,NaT
c,,0.71695,1.469447,bar,True,NaT
e,-1.248365,0.886771,0.550933,bar,False,2012-01-01
f,-0.07164,0.981329,-1.599551,bar,False,2012-01-01
h,-0.07164,-1.055358,-0.588433,bar,False,2012-01-01


In [23]:
df2.fillna(method='pad', limit=1)#fill only one element

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.154795,-0.649918,bar,False,NaT
c,,0.71695,1.469447,bar,True,NaT
e,-1.248365,0.886771,0.550933,bar,False,2012-01-01
f,-0.07164,0.981329,-1.599551,bar,False,2012-01-01
h,-0.07164,-1.055358,-0.588433,bar,False,2012-01-01


Method	Action

pad / ffill	Fill values forward

bfill / backfill	Fill values backward

The ffill() function is equivalent to fillna(method='ffill') and bfill() is equivalent to fillna(method='bfill')

In [24]:
dff = pd.DataFrame(np.random.randn(10,3), columns=list('ABC'))
dff.iloc[3:5,0] = np.nan
dff.iloc[4:6,1] = np.nan
dff.iloc[5:8,2] = np.nan
dff

Unnamed: 0,A,B,C
0,1.412756,-0.135644,-1.036764
1,-0.605971,0.966201,0.809862
2,0.132554,-1.289135,-0.112851
3,,-0.862761,1.001583
4,,,-0.414388
5,-0.703988,,
6,0.405966,1.81729,
7,0.697515,-0.573162,
8,-0.469294,-0.81298,0.036271
9,1.276822,1.175196,-0.420548


In [25]:
dff.fillna(dff.mean())

Unnamed: 0,A,B,C
0,1.412756,-0.135644,-1.036764
1,-0.605971,0.966201,0.809862
2,0.132554,-1.289135,-0.112851
3,0.268295,-0.862761,1.001583
4,0.268295,0.035626,-0.414388
5,-0.703988,0.035626,-0.019548
6,0.405966,1.81729,-0.019548
7,0.697515,-0.573162,-0.019548
8,-0.469294,-0.81298,0.036271
9,1.276822,1.175196,-0.420548


In [26]:
dff.fillna(dff.mean()['B':'C'])

Unnamed: 0,A,B,C
0,1.412756,-0.135644,-1.036764
1,-0.605971,0.966201,0.809862
2,0.132554,-1.289135,-0.112851
3,,-0.862761,1.001583
4,,0.035626,-0.414388
5,-0.703988,0.035626,-0.019548
6,0.405966,1.81729,-0.019548
7,0.697515,-0.573162,-0.019548
8,-0.469294,-0.81298,0.036271
9,1.276822,1.175196,-0.420548


Dropping axis labels with missing data: dropna

In [27]:
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.154795,-0.649918,bar,False,NaT
c,,0.71695,1.469447,bar,True,NaT
e,-1.248365,0.886771,0.550933,bar,False,2012-01-01
f,-0.07164,0.981329,-1.599551,bar,False,2012-01-01
h,,-1.055358,-0.588433,bar,False,NaT


In [30]:
df2.dropna(axis=1)

Unnamed: 0,two,three,four,five
a,0.154795,-0.649918,bar,False
c,0.71695,1.469447,bar,True
e,0.886771,0.550933,bar,False
f,0.981329,-1.599551,bar,False
h,-1.055358,-0.588433,bar,False


In [32]:
df2.dropna(axis=0)

Unnamed: 0,one,two,three,four,five,timestamp
e,-1.248365,0.886771,0.550933,bar,False,2012-01-01
f,-0.07164,0.981329,-1.599551,bar,False,2012-01-01


Interpolation

In [35]:
df2.interpolate()

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.154795,-0.649918,bar,False,NaT
c,,0.71695,1.469447,bar,True,NaT
e,-1.248365,0.886771,0.550933,bar,False,2012-01-01
f,-0.07164,0.981329,-1.599551,bar,False,2012-01-01
h,-0.07164,-1.055358,-0.588433,bar,False,NaT


In [37]:
ts=pd.DataFrame(np.random.randn(5, 1),columns=['one'])
ts['timestamp'] = pd.Timestamp('20120101')

In [52]:
ts=ts[["timestamp","one"]]

In [53]:
ts

Unnamed: 0,timestamp,one
0,2012-01-01,-0.49109
1,2012-01-01,-1.195053
2,2012-01-01,-0.456527
3,2012-01-01,0.954915
4,2012-01-01,0.273422


In [55]:
ts.one[2]=np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [56]:
ts

Unnamed: 0,timestamp,one
0,2012-01-01,-0.49109
1,2012-01-01,-1.195053
2,2012-01-01,
3,2012-01-01,0.954915
4,2012-01-01,0.273422


In [57]:
ts.interpolate()

Unnamed: 0,timestamp,one
0,2012-01-01,-0.49109
1,2012-01-01,-1.195053
2,2012-01-01,-0.120069
3,2012-01-01,0.954915
4,2012-01-01,0.273422


The method argument gives access to fancier interpolation methods. If you have scipy installed, you can set pass the name of a 1-d interpolation routine to method. You’ll want to consult the full scipy interpolation documentation and reference guide for details. The appropriate interpolation method will depend on the type of data you are working with.

In [58]:
ts.interpolate(method='barycentric')

Unnamed: 0,timestamp,one
0,2012-01-01,-0.49109
1,2012-01-01,-1.195053
2,2012-01-01,-0.123814
3,2012-01-01,0.954915
4,2012-01-01,0.273422


In [60]:
ts.interpolate(method='pchip')

Unnamed: 0,timestamp,one
0,2012-01-01,-0.49109
1,2012-01-01,-1.195053
2,2012-01-01,-0.120069
3,2012-01-01,0.954915
4,2012-01-01,0.273422


In [62]:
ts.interpolate(method='akima')

Unnamed: 0,timestamp,one
0,2012-01-01,-0.49109
1,2012-01-01,-1.195053
2,2012-01-01,-0.120069
3,2012-01-01,0.954915
4,2012-01-01,0.273422


In [63]:
ts.interpolate(method='spline', order=2)#polinomial interpolation

Unnamed: 0,timestamp,one
0,2012-01-01,-0.49109
1,2012-01-01,-1.195053
2,2012-01-01,-0.123814
3,2012-01-01,0.954915
4,2012-01-01,0.273422


Replacing Generic Values

In [64]:
ser = pd.Series([0., 1., 2., 3., 4.])

In [65]:
ser.replace(0, 5)

0    5.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [66]:
ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])

0    4.0
1    3.0
2    2.0
3    1.0
4    0.0
dtype: float64

In [67]:
ser

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64