In [2]:
import numpy as np
import pandas as pd

## Series

In [3]:
# pd.Series(data, index=index)
data = pd.Series([0.25,0.5,0.75,1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [5]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
data.index = ['a','b','c','d']
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

## DataFrame

In [48]:
# pd.DataFrame()
#From a single Series object
population_dict = {'California': 3800,
                    'Texas': 4000,
                    'New York': 5000,
                    'Florida': 390}
population = pd.Series(population_dict)
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,3800
Texas,4000
New York,5000
Florida,390


In [49]:
#From a list of dicts
data = [{'a':i, 'b': 2*i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [50]:
pd.DataFrame([{'a':1, 'b':2}, {'b':3,'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [51]:
#From a dictionary of Series objects
area_dict = {'California': 100,
             'Texas': 300,
             'New York': 200,
             'Florida': 90}
area = pd.Series(area_dict)
states = pd.DataFrame({'population': population, 'area': area})
print(states.index)
states

Index(['California', 'Texas', 'New York', 'Florida'], dtype='object')


Unnamed: 0,population,area
California,3800,100
Texas,4000,300
New York,5000,200
Florida,390,90


In [52]:
#From a two dimensional numpy array
pd.DataFrame(np.random.rand(3,2),
             columns=['foo','bar'],
             index=['a','b','c']
            )

Unnamed: 0,foo,bar
a,0.6649,0.835071
b,0.176314,0.114313
c,0.222802,0.31713


In [53]:
A = np.zeros(3,dtype=[('A','i8'),('B','f8')])
print(A)
pd.DataFrame(A)

[(0, 0.) (0, 0.) (0, 0.)]


Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


## Indexing

In [54]:
states = states.rename(columns = {'population':'pop'})

In [57]:
states['density'] = states['pop'] / states['area']

In [58]:
states.T

Unnamed: 0,California,Texas,New York,Florida
pop,3800.0,4000.0,5000.0,390.0
area,100.0,300.0,200.0,90.0
density,38.0,13.333333,25.0,4.333333


In [59]:
#Array style indexing loc, iloc, and ix
states.iloc[:3,:2]

Unnamed: 0,pop,area
California,3800,100
Texas,4000,300
New York,5000,200


In [61]:
states.loc[:'Texas',:'pop']

Unnamed: 0,pop
California,3800
Texas,4000


In [62]:
states.ix[:3,:'pop'] #Mixed approach

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,pop
California,3800
Texas,4000
New York,5000


## Manipulation

In [3]:
A = pd.Series([2,4,5], index=[0,1,2])
B = pd.Series([1,3,5], index=[1,2,3])
A+B

0    NaN
1    5.0
2    8.0
3    NaN
dtype: float64

In [4]:
#If you want another value to complete the missing indexes 
A.add(B,fill_value=0)

0    2.0
1    5.0
2    8.0
3    5.0
dtype: float64

In [6]:
rng = np.random.RandomState(42)
A=pd.DataFrame(rng.randint(0,20,(2,2)),columns=list('AB'))
B=pd.DataFrame(rng.randint(0,10,(3,3)),columns=list('BAC'))
A+B

Unnamed: 0,A,B,C
0,10.0,26.0,
1,16.0,19.0,
2,,,


In [9]:
fill = A.stack().mean() #first stack all the entries in A to get a single #
fill

12.25

In [11]:
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,10.0,26.0,18.25
1,16.0,19.0,18.25
2,16.25,19.25,15.25


In [12]:
#Operations Between DataFrame and Series
A = rng.randint(10,size=(3,4))
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-3,-6,5,0
2,-6,-3,-2,4


In [13]:
#Operate Column-wise
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,0,0,-5,-2
1,3,0,6,4
2,-3,0,-4,5


In [14]:
#Operations automatically align indices
halfrow = df.iloc[0,::2]
halfrow

Q    7
S    2
Name: 0, dtype: int32