In [44]:
#McKinney Conv.
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [45]:
#Series are a one-dim array-like object containing one dim of data + labels
obj = Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [46]:
obj.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [47]:
obj2 = Series([4,7,-5,3], index = ['d', 'b', 'a', 'c'])
obj2['a'] = 99
obj2['a']

99

In [48]:
obj2*2

d      8
b     14
a    198
c      6
dtype: int64

In [49]:
# can also be thought of as a dictionary.
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)

In [50]:
new_states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=new_states)
obj4

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [51]:
#default for missing values is NaN.
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [52]:
#automatic index alignment
print(obj3, '\\n', obj4, '\\n', obj3+obj4)

(Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64, '\\n', California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64, '\\n', California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64)


In [53]:
# also has a useful 'name' attribute
obj4.name = 'population'; obj4.index.name = 'state'
obj4

state
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
Name: population, dtype: float64

In [54]:
#DataFrames
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data, columns=['year', 'state', 'pop'])
frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [55]:
frame2 = DataFrame(data, columns =['year', 'state', 'pop', 'debt'],
                   index = ['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [56]:
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [57]:
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [58]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [59]:
frame2.ix['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [60]:
frame2.debt = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [61]:
frame2.debt = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [62]:
val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame2.debt = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [63]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [64]:
del frame2['eastern']

In [65]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
        'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [66]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [67]:
DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [68]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [69]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [70]:
#multitype accomodation
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

### 8-27-15

In [73]:
obj = Series(range(3), index=['a', 'b', 'c'])
obj.index[1:]

Index([u'b', u'c'], dtype='object')

In [74]:
obj2 = obj.reindex(['c', 'b', 'a'])
obj2

c    2
b    1
a    0
dtype: int64

In [75]:
obj.reindex(['c', 'b', 'a', 'z'], fill_value = -99)

c     2
b     1
a     0
z   -99
dtype: int64

In [77]:
obj.drop('b')

a    0
c    2
dtype: int64

In [78]:
data = DataFrame(np.arange(16).reshape(4,4), index= ['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns= ['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [79]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [80]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [81]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [90]:
[sum(data[col]) < 30 for col in data]

[True, True, False, False]

In [91]:
data.ix['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [94]:
df = DataFrame(np.arange(12.).reshape((4,3)), columns = list('bde'),
               index= ['Utah', 'Ohio', 'Texas', 'Oregon'])
ser = df.ix[0]

df

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [95]:
df - ser

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [96]:
ser2 = df['d']
df.sub(ser2, axis=0)

Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1


### 8-28-15

In [98]:
obj= Series(range(4), index = ['d', 'a', 'c', 'b'])
obj

d    0
a    1
c    2
b    3
dtype: int64

In [100]:
obj.sort_index()

a    1
b    3
c    2
d    0
dtype: int64

In [101]:
frame = DataFrame(np.arange(8).reshape((2,4)), index = ['three', 'one'],
                  columns = obj.index)
frame

Unnamed: 0,d,a,c,b
three,0,1,2,3
one,4,5,6,7


In [102]:
frame.sort_index()

Unnamed: 0,d,a,c,b
one,4,5,6,7
three,0,1,2,3


In [103]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,3,2,0
one,5,7,6,4


In [104]:
frame.sort_index(by='b')

Unnamed: 0,d,a,c,b
three,0,1,2,3
one,4,5,6,7


In [105]:
frame.sort_index(by='b', ascending=False)

Unnamed: 0,d,a,c,b
one,4,5,6,7
three,0,1,2,3


In [106]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [107]:
help(obj.rank)

Help on method rank in module pandas.core.series:

rank(self, method='average', na_option='keep', ascending=True, pct=False) method of pandas.core.series.Series instance
    Compute data ranks (1 through n). Equal values are assigned a rank that
    is the average of the ranks of those values
    
    Parameters
    ----------
    method : {'average', 'min', 'max', 'first', 'dense'}
        * average: average rank of group
        * min: lowest rank in group
        * max: highest rank in group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups
    na_option : {'keep'}
        keep: leave NA values where they are
    ascending : boolean, default True
        False for ranks by high (1) to low (N)
    pct : boolean, default False
        Computes percentage rank of data
    
    Returns
    -------
    ranks : Series



In [108]:
obj = Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])
obj.index.is_unique

False

In [109]:
obj['a']

a    0
a    1
dtype: int64

In [110]:
df

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [111]:
df.sum()


b    18
d    22
e    26
dtype: float64

In [112]:
df.sum(axis=1)

Utah       3
Ohio      12
Texas     21
Oregon    30
dtype: float64

In [113]:
df.mean(axis=1, skipna = False)

Utah       1
Ohio       4
Texas      7
Oregon    10
dtype: float64

In [114]:
df.cumsum()


Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,5,7
Texas,9,12,15
Oregon,18,22,26


In [116]:
df.describe()

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,4.5,5.5,6.5
std,3.872983,3.872983,3.872983
min,0.0,1.0,2.0
25%,2.25,3.25,4.25
50%,4.5,5.5,6.5
75%,6.75,7.75,8.75
max,9.0,10.0,11.0


### Stock data example

In [118]:
import pandas.io.data as web

all_data = {}
for ticker in ['QQQ', 'SPY', 'IWM', 'TSLA', 'GOOGL']:
    all_data[ticker] = web.get_data_yahoo(ticker)

price = DataFrame({tic: data['Adj Close'] for tic,data in all_data.iteritems()})
volume = DataFrame({tic: data['Volume'] for tic,data in all_data.iteritems()})

In [123]:
returns = price.pct_change()

In [124]:
returns.tail()

Unnamed: 0_level_0,GOOGL,IWM,QQQ,SPY,TSLA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-08-21,-0.052172,-0.013044,-0.043706,-0.031273,-0.047114
2015-08-24,-0.040247,-0.038779,-0.038477,-0.040884,-0.051567
2015-08-25,-0.009125,-0.008141,-0.003758,-0.01224,0.0053
2015-08-26,0.077179,0.025809,0.050362,0.039791,0.021861
2015-08-27,0.01246,0.01787,0.025332,0.023012,0.080724


In [126]:
returns.TSLA.corr(returns.QQQ)

0.39320297172530205

In [127]:
all_data['TSLA'].head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-06-29,19.0,25.0,17.540001,23.889999,18766300,23.889999
2010-06-30,25.790001,30.42,23.299999,23.83,17187100,23.83
2010-07-01,25.0,25.92,20.27,21.959999,8218800,21.959999
2010-07-02,23.0,23.1,18.709999,19.200001,5139800,19.200001
2010-07-06,20.0,20.0,15.83,16.110001,6866900,16.110001


In [128]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [129]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [130]:
mask = obj.isin(['b', 'c'])
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

### 8-29-15

In [132]:
string_data = Series(['aard', np.nan, 'vark'])
string_data.isnull()

0    False
1     True
2    False
dtype: bool

In [135]:
df.ix['Ohio', 'e'] = np.nan
df

Unnamed: 0,b,d,e
Utah,0,1,2.0
Ohio,3,4,
Texas,6,7,8.0
Oregon,9,10,11.0


In [137]:
df.dropna()

Unnamed: 0,b,d,e
Utah,0,1,2
Texas,6,7,8
Oregon,9,10,11


In [139]:
df.dropna(how='all') # all elems of row must be null

Unnamed: 0,b,d,e
Utah,0,1,2.0
Ohio,3,4,
Texas,6,7,8.0
Oregon,9,10,11.0


In [141]:
df.dropna(thresh=3)

Unnamed: 0,b,d,e
Utah,0,1,2
Texas,6,7,8
Oregon,9,10,11


In [145]:
df.fillna(-999)

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,-999
Texas,6,7,8
Oregon,9,10,11


In [147]:
df.fillna({'e': -99})

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,-99
Texas,6,7,8
Oregon,9,10,11


In [149]:
df.fillna(method='ffill', limit=1)

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,2
Texas,6,7,8
Oregon,9,10,11


In [154]:
data = Series(np.random.randn(10), index=[['a','a','a','b','b','b','c','c','d','d'],
              [1,2,3,1,2,3,1,2,2,3]])
data

a  1    0.594781
   2   -0.805142
   3   -0.222744
b  1   -0.037135
   2    0.129402
   3   -0.696443
c  1   -0.604512
   2    1.659121
d  2   -0.249860
   3   -0.325105
dtype: float64

In [157]:
data['a', 2]

-0.80514215049034721

In [158]:
data[:,2]

a   -0.805142
b    0.129402
c    1.659121
d   -0.249860
dtype: float64

In [159]:
data.unstack()

Unnamed: 0,1,2,3
a,0.594781,-0.805142,-0.222744
b,-0.037135,0.129402,-0.696443
c,-0.604512,1.659121,
d,,-0.24986,-0.325105


In [160]:
frame = DataFrame(np.arange(12).reshape((4,3)),
                  index=[['a','a','b','b'], [1,2,1,2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [161]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [162]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [163]:
frame.sortlevel(1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [165]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [167]:
df

Unnamed: 0,b,d,e
Utah,0,1,2.0
Ohio,3,4,
Texas,6,7,8.0
Oregon,9,10,11.0


In [169]:
df.set_index(['b', 'd'])

Unnamed: 0_level_0,Unnamed: 1_level_0,e
b,d,Unnamed: 2_level_1
0,1,2.0
3,4,
6,7,8.0
9,10,11.0
