# Config

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

# Data Structures

__Series__

In [113]:
# Series from a dict: the explicit index selects and orders the labels;
# a requested label that the dict lacks ('d') comes out as NaN
d = dict(a=0., b=1., c=2.)
pd.Series(d, index=list('bcda'))
# Series from a scalar: the value is repeated for every label
pd.Series(5., index=list('abcde'))
# Series from an ndarray of five random normal draws
s = pd.Series(np.random.randn(5), index=list('abcde'))

In [114]:
# Unlike ndarray arithmetic, arithmetic between Series first aligns the
# operands on their labels. The result carries the union of both indexes,
# and a label present in only one operand is marked as missing (NaN).
s.iloc[1:] + s.iloc[:-1]

a         NaN
b    0.158443
c   -0.693313
d    0.311148
e         NaN
dtype: float64

In [86]:
# integer Series labelled by six consecutive days starting 2013-01-02
s1_index = pd.date_range('20130102', periods=6)
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=s1_index)

__DataFrame__

In [117]:
# list of dicts to DataFrame: one row per dict, the keys become columns,
# and a key absent from a dict ('c' in the first one) is left missing
data = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
pd.DataFrame(data, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [119]:
# DataFrame from a 6x4 numpy array of random normals, one row per day.
# Column E is taken from s1 and aligned on the date index, so a date that
# s1 does not cover is left as NaN.
dates = pd.date_range('20130101', periods=6)
df1 = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df1 = df1.assign(E=s1)
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.016033,1.024962,-0.065598,1.723671,
2013-01-02,-0.000118,0.059088,-1.595061,-1.99199,1.0
2013-01-03,0.719443,-1.544791,0.954769,0.36276,2.0
2013-01-04,1.892284,-0.362018,-1.432996,-1.618781,3.0
2013-01-05,-0.169691,-0.90017,0.916865,-0.344216,4.0
2013-01-06,-0.890566,-0.730159,0.07025,0.490438,5.0


In [128]:
# DataFrame from a dict of mixed values: the scalars ('A', 'B', 'F') are
# broadcast to the length given by the Series / array / Categorical entries
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
}).assign(G=['one', 'three', 'two', 'four'])
df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,one
1,1.0,2013-01-02,1.0,3,train,foo,three
2,1.0,2013-01-02,1.0,3,test,foo,two
3,1.0,2013-01-02,1.0,3,train,foo,four


__Operations__

In [131]:
# Quick tour of common DataFrame attributes and methods. Only the column
# swap near the end modifies df2; every other line returns a value that is
# discarded here.
df2.index                                   # row labels
df2.columns                                 # column labels
df2.values                                  # underlying data as a numpy array
df2.shape                                   # (n_rows, n_columns)
df2.describe()                              # summary statistics (a method, so it must be called)
df2.head()                                  # first rows (a method, so it must be called)
df2.sample(frac=0.5)                        # random half of the rows
df2.T                                       # transpose
df2.sort_index(axis=1, ascending=False)     # columns in descending label order
df2.sort_values(by='B')                     # rows ordered by column B
df2.rename(columns={'F':'H'})               # copy with column F renamed to H
df2.drop(['F'],axis=1)                      # copy without column F
# swap the contents of columns E and F in df2
df2[['F', 'E']] = df2[['E', 'F']]
print('done')

done


# Indexing, Selecting, and Subsetting

| Operation                      | Syntax        | Result    |
|--------------------------------|---------------|-----------|
| Select column                  | df[col]       | Series    |
| Select row by label            | df.loc[label] | Series    |
| Select row by integer location | df.iloc[loc]  | Series    |
| Slice rows                     | df[5:10]      | DataFrame |
| Select rows by boolean vector  | df[bool_vec]  | DataFrame |

In [94]:
# show the row labels (index) of df1
df1.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [58]:
# select rows of df2 with a boolean mask
df2.loc[df2['A'] > 0]
df2.loc[df2['G'].isin(['one', 'two'])]

Unnamed: 0,A,B,C,D,E,F,G,logic
0,1.0,2013-01-02,1.0,-1,test,foo,one,True
2,1.0,2013-01-02,1.0,-1,test,foo,two,True


In [8]:
# select rows / columns by label with .loc
df1.loc[dates[0], :]            # one row by its index label -> Series
df1.loc[:, ['A', 'B']]          # all rows, the listed columns -> DataFrame

Unnamed: 0,A,B
2013-01-01,-0.582794,1.040862
2013-01-02,0.05642,0.405451
2013-01-03,-0.194486,2.71594
2013-01-04,-0.441587,1.77948
2013-01-05,0.096748,-0.856378
2013-01-06,-0.380816,-0.293931


In [9]:
# select rows / columns by integer position with .iloc
df1.iloc[3, :]          # fourth row -> Series
df1.iloc[:, [1]]        # second column only, kept as a DataFrame

Unnamed: 0,B
2013-01-01,1.040862
2013-01-02,0.405451
2013-01-03,2.71594
2013-01-04,1.77948
2013-01-05,-0.856378
2013-01-06,-0.293931


In [97]:
# query() filters df1 with an expression string evaluated against its columns
# (original note: numexpr is faster than python). Here: rows where A is
# negative and E_cat is one of the listed values.
# NOTE(review): E_cat is only added to df1 in the "new columns from old" cell
# further down, so this cell depends on that cell having been run first.
df1.query(' A<0 & E_cat==["Alpha","Beta"] ')

Unnamed: 0,A,B,C,D,E,E_cat
2013-01-02,-0.110822,-0.298799,2.240272,-0.824189,1.0,Alpha
2013-01-03,-0.119343,0.104817,-1.120993,0.328065,2.0,Beta


In [61]:
# combine a column condition with a condition on the index labels
selected = (df2.A <= 6) & df2.index.isin([0, 2, 4])
df2[selected]
# the inverse operator (~) selects the complement of that mask
df2[~selected]

Unnamed: 0,A,B,C,D,E,F,G,logic
1,1.0,2013-01-02,1.0,-1,train,foo,three,True
3,1.0,2013-01-02,1.0,-1,train,foo,four,True


In [63]:
# if-then replacement on one column: every D >= 2 is overwritten with -1
needs_reset = df2['D'] >= 2
df2.loc[needs_reset, 'D'] = -1
# boolean flag column: True where C is positive, False otherwise
df2['logic'] = df2['C'] > 0
df2

Unnamed: 0,A,B,C,D,E,F,G,logic
0,1.0,2013-01-02,1.0,-1,test,foo,one,True
1,1.0,2013-01-02,1.0,-1,train,foo,three,True
2,1.0,2013-01-02,1.0,-1,test,foo,two,True
3,1.0,2013-01-02,1.0,-1,train,foo,four,True


# Columns with Numeric and Categorical Data

In [124]:
# 1) create a categorical Series directly through the dtype argument
s = pd.Series(["a","b","c","a"], dtype="category")
# 2) cast an existing string column to category
df = pd.DataFrame({"A":["a","b","c","a"]})
df["B"] = df["A"].astype('category')
df
# 3) bin numeric values into labelled, left-closed intervals with pd.cut
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
labels = ["{0} - {1}".format(lower, lower + 9) for lower in range(0, 100, 10)]
bin_edges = range(0, 105, 10)
df['group'] = pd.cut(df['value'], bin_edges, right=False, labels=labels)
# 4) wrap a Categorical object in a Series; values outside the declared
#    categories ("a" here) become NaN
raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], ordered=False)
s = pd.Series(raw_cat)
s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b, c, d]

In [125]:
# build a categorical from integer codes: each code is a position in `categories`
splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
split_labels = pd.Categorical.from_codes(splitter, categories=["train", "test"])
s = pd.Series(split_labels)
s

0    test
1    test
2    test
3    test
4    test
dtype: category
Categories (2, object): [train, test]

In [132]:
# categories inferred from the data: listed in sorted order, but the
# categorical itself is unordered
s = pd.Series(["a","b","c","a"]).astype("category")
s.cat.categories
s.cat.ordered

False

In [136]:
# explicit order: the categories list fixes the ranking (c < b < a)
ordered_cat = pd.Categorical(["a","b","c","a"], categories=["c","b","a"], ordered=True)
s = pd.Series(ordered_cat)
s.cat.categories
s.cat.ordered

True

In [142]:
# renaming categories
s = pd.Series(["a","b","c","a"], dtype="category")
# rename_categories returns a new Series, so rebind s. Assigning to
# s.cat.categories directly is no longer supported by pandas 2.x.
s = s.cat.rename_categories(["Group %s" % g for g in s.cat.categories])
# renaming again to integers; the result is only displayed, s is unchanged
s.cat.rename_categories([1,2,3])

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [1, 2, 3]

In [148]:
# reorder categories: the values stay the same, only the category order
# changes; the reordered Series is displayed, s itself is not modified
s = pd.Series(pd.Categorical(["a","b","c","a"]))
s.cat.reorder_categories(['c','a','b'])

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [c, a, b]

In [149]:
# add a category that no value uses yet
with_extra = s.cat.add_categories([4])
# remove that category again
s = with_extra.cat.remove_categories([4])
# drop every category that no value refers to
s.cat.remove_unused_categories()

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [150]:
# new columns from old: for each source column build a "<name>_cat" column
# holding the label that `categories` assigns to the value
source_cols = df1.columns[4:5]
new_cols = [str(x) + "_cat" for x in source_cols]
categories = {1 : 'Alpha', 2 : 'Beta', 3 : 'Charlie' }
# Element-wise lookup, column by column; a value without an entry in
# `categories` maps to None. Series.map is used because DataFrame.applymap
# is deprecated in recent pandas releases.
df1[new_cols] = df1[source_cols].apply(lambda col: col.map(categories.get))
df1

Unnamed: 0,A,B,C,D,E,E_cat
2013-01-01,-0.016033,1.024962,-0.065598,1.723671,,
2013-01-02,-0.000118,0.059088,-1.595061,-1.99199,1.0,Alpha
2013-01-03,0.719443,-1.544791,0.954769,0.36276,2.0,Beta
2013-01-04,1.892284,-0.362018,-1.432996,-1.618781,3.0,Charlie
2013-01-05,-0.169691,-0.90017,0.916865,-0.344216,4.0,
2013-01-06,-0.890566,-0.730159,0.07025,0.490438,5.0,


# Missing or Duplicated Values

In [11]:
# np.nan is the marker pandas uses for missing values
np.nan
df1.dropna()                # drop rows with any missing value (how='any' is the default)
df1.fillna(0)               # copy with missing values replaced by 0
df1.drop_duplicates()       # copy without repeated rows
df1.isnull()                # boolean mask of the missing cells

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,True
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,False
2013-01-04,False,False,False,False,False
2013-01-05,False,False,False,False,False
2013-01-06,False,False,False,False,False


In [48]:
# overwrite the missing entries of column E with 0, in place
missing_e = df1['E'].isnull()
df1.loc[missing_e, 'E'] = 0
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.582794,1.040862,-0.429571,-0.142588,0.0
2013-01-02,0.05642,0.405451,-0.594392,0.990542,1.0
2013-01-03,-0.194486,2.71594,-2.199434,1.955261,2.0
2013-01-04,-0.441587,1.77948,-0.305055,0.247999,3.0
2013-01-05,0.096748,-0.856378,-0.663669,0.006469,4.0
2013-01-06,-0.380816,-0.293931,1.046433,1.609351,5.0


# Shaping

In [43]:
# long format: move the column labels into an inner index level
tmpdf = df1.stack()
# wide format: pivot that innermost level back out into columns
tmpdf.unstack(level=-1)

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.582794,1.040862,-0.429571,-0.142588,0.0
2013-01-02,0.05642,0.405451,-0.594392,0.990542,1.0
2013-01-03,-0.194486,2.71594,-2.199434,1.955261,2.0
2013-01-04,-0.441587,1.77948,-0.305055,0.247999,3.0
2013-01-05,0.096748,-0.856378,-0.663669,0.006469,4.0
2013-01-06,-0.380816,-0.293931,1.046433,1.609351,5.0


In [92]:
# append rows: DataFrame.append was removed in pandas 2.0, so use pd.concat
tmpdf = pd.concat([df1, df1])
tmpdf
# concat vertically (axis=0, the default, stacks the rows)
tmpdf = pd.concat([df1, df1], axis=0)
# concat horizontally (axis=1 places the frames side by side)
pd.concat([df1,df1], axis=1)

Unnamed: 0,A,B,C,D,E,E_cat,A.1,B.1,C.1,D.1,E.1,E_cat.1
2013-01-01,-1.33105,-1.495858,-1.094061,1.629035,,,-1.33105,-1.495858,-1.094061,1.629035,,
2013-01-02,-0.110822,-0.298799,2.240272,-0.824189,1.0,Alpha,-0.110822,-0.298799,2.240272,-0.824189,1.0,Alpha
2013-01-03,-0.119343,0.104817,-1.120993,0.328065,2.0,Beta,-0.119343,0.104817,-1.120993,0.328065,2.0,Beta
2013-01-04,-0.340357,-0.272103,0.761724,0.75124,3.0,Charlie,-0.340357,-0.272103,0.761724,0.75124,3.0,Charlie
2013-01-05,-0.353342,0.207484,1.302108,-0.554947,4.0,,-0.353342,0.207484,1.302108,-0.554947,4.0,
2013-01-06,0.632763,0.462542,0.360557,1.189863,5.0,,0.632763,0.462542,0.360557,1.189863,5.0,


In [111]:
# merge on the shared 'key' column (not on the index): with duplicate keys
# on both sides, every left row is paired with every matching right row
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left.merge(right, how='left')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [109]:
# join two frames whose indexes only partly overlap
df = pd.DataFrame(np.random.randn(8, 4), columns=list('ABCD'))
df1 = df[['A', 'B']].loc[1:]      # rows labelled 1..7
df2 = df[['C', 'D']].loc[:5]      # rows labelled 0..5 (label slices include the end)
# the inner join keeps only the index labels present in both frames
df1.join(df2, how='inner')

Unnamed: 0,A,B,C,D
1,0.47144,-0.723579,0.274226,-0.497839
2,2.001739,-1.168126,-1.747391,-1.393374
3,-1.239961,0.324315,-0.115042,-2.416667
4,-0.587547,-0.618463,-0.911924,1.538828
5,0.209617,-0.525029,-0.878166,0.882554
