In [96]:
# Data Structures - 1. Series , 2. DataFrame
import pandas as pd
from pandas import Series,DataFrame

# Series
obj = Series([4,-8,-12,0,8])
obj 

0     4
1    -8
2   -12
3     0
4     8
dtype: int64

In [97]:
obj.index

RangeIndex(start=0, stop=5, step=1)

In [98]:
obj.values

array([  4,  -8, -12,   0,   8])

In [99]:
obj2 = Series([3,4,5,2,1],index = ['r','e','f','a','b']) # Assigning Index to each value

In [100]:
obj2

r    3
e    4
f    5
a    2
b    1
dtype: int64

In [101]:
obj2.index

Index(['r', 'e', 'f', 'a', 'b'], dtype='object')

In [102]:
obj2['a']

2

In [103]:
obj2[['r','f','b']]

r    3
f    5
b    1
dtype: int64

In [104]:
obj2[obj2>2]

r    3
e    4
f    5
dtype: int64

In [105]:
obj2 * 2

r     6
e     8
f    10
a     4
b     2
dtype: int64

In [106]:
import numpy as np
np.exp(obj2)

r     20.085537
e     54.598150
f    148.413159
a      7.389056
b      2.718282
dtype: float64

In [107]:
'r' in obj2

True

In [108]:
# Dict to Series
sdata = {'Chennai': 5000,'Mumbai':9800,'Delhi':7500,'Kolkata':3099}
obj3 = Series(sdata)
obj3

Chennai    5000
Delhi      7500
Kolkata    3099
Mumbai     9800
dtype: int64

In [109]:
sdata['Bangalore'] = 6500

In [110]:
obj3

Chennai    5000
Delhi      7500
Kolkata    3099
Mumbai     9800
dtype: int64

In [111]:
obj3 = Series(sdata)
obj3

Bangalore    6500
Chennai      5000
Delhi        7500
Kolkata      3099
Mumbai       9800
dtype: int64

In [112]:
city = ['Jodhpur','Chennai','Mumbai','Delhi']

In [113]:
obj4 = Series(sdata,index=city)
obj4

Jodhpur       NaN
Chennai    5000.0
Mumbai     9800.0
Delhi      7500.0
dtype: float64

In [114]:
pd.isnull(obj4)

Jodhpur     True
Chennai    False
Mumbai     False
Delhi      False
dtype: bool

In [115]:
pd.notnull(obj4)

Jodhpur    False
Chennai     True
Mumbai      True
Delhi       True
dtype: bool

In [116]:
obj3

Bangalore    6500
Chennai      5000
Delhi        7500
Kolkata      3099
Mumbai       9800
dtype: int64

In [117]:
obj4

Jodhpur       NaN
Chennai    5000.0
Mumbai     9800.0
Delhi      7500.0
dtype: float64

In [118]:
obj3 + obj4

Bangalore        NaN
Chennai      10000.0
Delhi        15000.0
Jodhpur          NaN
Kolkata          NaN
Mumbai       19600.0
dtype: float64

In [119]:
obj4.index.name = 'City'

In [120]:
obj4.name = 'Land Value per sq.feet'

In [121]:
obj4

City
Jodhpur       NaN
Chennai    5000.0
Mumbai     9800.0
Delhi      7500.0
Name: Land Value per sq.feet, dtype: float64

In [122]:
# DataFrame
data = {'state':['TN','UP','MP'],'year':['1890','1898','1889'],'pop':['1.4','2.3','1.6']}
frame = DataFrame(data)

In [123]:
frame # It will sort the columns 

Unnamed: 0,pop,state,year
0,1.4,TN,1890
1,2.3,UP,1898
2,1.6,MP,1889


In [124]:
DataFrame(data,columns=['year','state','pop']) # Customize the columns as required !

Unnamed: 0,year,state,pop
0,1890,TN,1.4
1,1898,UP,2.3
2,1889,MP,1.6


In [125]:
frame2 = DataFrame(data,columns=['year','state','pop','debt'],index=['one','two','three'])
frame2

Unnamed: 0,year,state,pop,debt
one,1890,TN,1.4,
two,1898,UP,2.3,
three,1889,MP,1.6,


In [126]:
frame2.columns # Viewing Columns in a Table or dataframe

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [127]:
frame2['state']

one      TN
two      UP
three    MP
Name: state, dtype: object

In [128]:
frame2['year'] # getting out column value 

one      1890
two      1898
three    1889
Name: year, dtype: object

In [129]:
frame2.ix['one'] # retrieving one record

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.


year     1890
state      TN
pop       1.4
debt      NaN
Name: one, dtype: object

In [130]:
# modifying columns value 
frame2['debt']=3.56
frame2

Unnamed: 0,year,state,pop,debt
one,1890,TN,1.4,3.56
two,1898,UP,2.3,3.56
three,1889,MP,1.6,3.56


In [131]:
frame2['debt'][2] = 4.5
frame2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,year,state,pop,debt
one,1890,TN,1.4,3.56
two,1898,UP,2.3,3.56
three,1889,MP,1.6,4.5


In [132]:
frame2

Unnamed: 0,year,state,pop,debt
one,1890,TN,1.4,3.56
two,1898,UP,2.3,3.56
three,1889,MP,1.6,4.5


In [133]:
import numpy as np
frame2['debt'] = np.arange(3.)

In [134]:
frame2

Unnamed: 0,year,state,pop,debt
one,1890,TN,1.4,0.0
two,1898,UP,2.3,1.0
three,1889,MP,1.6,2.0


In [135]:
val = Series([2.3,4.5,6.7],index=['one','two','three'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,1890,TN,1.4,2.3
two,1898,UP,2.3,4.5
three,1889,MP,1.6,6.7


In [136]:
frame2['eastern'] = frame2.state == 'TN'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,1890,TN,1.4,2.3,True
two,1898,UP,2.3,4.5,False
three,1889,MP,1.6,6.7,False


In [137]:
del frame2['eastern']

In [138]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [139]:
pop = {'Goa':{1889:4.5,1912:5.6},'HP':{1910:'2.8',1900:'1.0'}}

In [140]:
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Goa,HP
1889,4.5,
1900,,1.0
1910,,2.8
1912,5.6,


In [141]:
frame3.T

Unnamed: 0,1889,1900,1910,1912
Goa,4.5,,,5.6
HP,,1.0,2.8,


In [142]:
DataFrame(pop,index=[1889,1910,1900])

Unnamed: 0,Goa,HP
1889,4.5,
1910,,2.8
1900,,1.0


In [143]:
pdata = {'Goa':frame3['Goa'][:-1],'HP':frame3['HP'][:2]}

In [144]:
DataFrame(pdata)

Unnamed: 0,Goa,HP
1889,4.5,
1900,,1.0
1910,,


In [145]:
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Goa,HP
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1889,4.5,
1900,,1.0
1910,,2.8
1912,5.6,


In [146]:
frame3.values

array([[4.5, nan],
       [nan, '1.0'],
       [nan, '2.8'],
       [5.6, nan]], dtype=object)

In [147]:
frame2.values

array([['1890', 'TN', '1.4', 2.3],
       ['1898', 'UP', '2.3', 4.5],
       ['1889', 'MP', '1.6', 6.7]], dtype=object)

In [148]:
iObj = Series(range(3),index=['a','b','c'])
index = iObj.index
index
index[1:]

Index(['b', 'c'], dtype='object')

In [149]:
index[1] = 'f' # index cannot be changed !!!

TypeError: Index does not support mutable operations

In [150]:
index = pd.Index(np.arange(3))
iObj2 = Series([3.6,9.0,2.9],index=index)
iObj2.index is index

True

In [151]:
frame3

state,Goa,HP
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1889,4.5,
1900,,1.0
1910,,2.8
1912,5.6,


In [152]:
'Goa' in frame3.columns

True

In [153]:
'1921' in frame3.index

False

In [154]:
# Reindexing
nObj = Series([4.6,7.0,4.1,3.7],index=['d','v','f','g'])
nObj

d    4.6
v    7.0
f    4.1
g    3.7
dtype: float64

In [155]:
nIndex = nObj.reindex(['d','f','g','v'])
nIndex

d    4.6
f    4.1
g    3.7
v    7.0
dtype: float64

In [156]:
allElements = nObj + nIndex

In [157]:
allElements.reindex(['a','b','c','d','f','g','v'],fill_value=0)

a     0.0
b     0.0
c     0.0
d     9.2
f     8.2
g     7.4
v    14.0
dtype: float64

In [158]:
sIndex = Series(['Blue','Orange','Yellow'],index = [0,4,5] )
sIndex.reindex(range(7),method='ffill')


0      Blue
1      Blue
2      Blue
3      Blue
4    Orange
5    Yellow
6    Yellow
dtype: object

In [159]:
frame = DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['Goa','Chennai','Srilanka'])
frame


Unnamed: 0,Goa,Chennai,Srilanka
a,0,1,2
c,3,4,5
d,6,7,8


In [160]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Goa,Chennai,Srilanka
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [167]:
# Columns reindex 
states = ['Goa','Chennai','Mumbai']
frame.reindex(columns=states)

Unnamed: 0,Goa,Chennai,Mumbai
a,0,1,
c,3,4,
d,6,7,


In [169]:
frame.reindex(index=['a','b','c','d'],method='ffill',columns=states) # change in pandas expression method

NameError: name 'self' is not defined

In [172]:
frame.loc[['a','b','c','d'],states]

Unnamed: 0,Goa,Chennai,Mumbai
a,0.0,1.0,
b,,,
c,3.0,4.0,
d,6.0,7.0,


In [174]:
# Dropping entries from the axis
obj = Series(np.arange(5.),index=['a','b','c','d','e'])
nobj = obj.drop('c')
nobj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [175]:
obj.drop(['b','c'])
data = DataFrame(np.arange(16).reshape((4,4)),index = ['A','B','C','D'],columns=['One','Two','Three','Four'])
data.drop(['C','A'])

Unnamed: 0,One,Two,Three,Four
B,4,5,6,7
D,12,13,14,15


In [176]:
data.drop('Two',axis=1)

Unnamed: 0,One,Three,Four
A,0,2,3
B,4,6,7
C,8,10,11
D,12,14,15


In [177]:
data.drop(['Two','Three'],axis=1)

Unnamed: 0,One,Four
A,0,3
B,4,7
C,8,11
D,12,15


In [179]:
# Indexing - Selection - Filtering
obj = Series(np.arange(4.0),index=['a','b','c','d'])
print(obj)
print(obj['c'])
print(obj[1])

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
2.0
1.0


In [180]:
print(obj[2:3])

c    2.0
dtype: float64


In [181]:
obj[['b','c','d']]

b    1.0
c    2.0
d    3.0
dtype: float64

In [182]:
print(obj[[1,3]])
print(obj[obj>2])
print(obj['b':'c'])
obj['b':'c'] = 5
print(obj)

b    1.0
d    3.0
dtype: float64
d    3.0
dtype: float64
b    1.0
c    2.0
dtype: float64
a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64


In [183]:
data

Unnamed: 0,One,Two,Three,Four
A,0,1,2,3
B,4,5,6,7
C,8,9,10,11
D,12,13,14,15


In [186]:
print(data['Two'])
print(data[['Three','Four']])
print(data[:2])
print(data[data['Three']>5])

A     1
B     5
C     9
D    13
Name: Two, dtype: int64
   Three  Four
A      2     3
B      6     7
C     10    11
D     14    15
   One  Two  Three  Four
A    0    1      2     3
B    4    5      6     7
   One  Two  Three  Four
B    4    5      6     7
C    8    9     10    11
D   12   13     14    15


In [192]:
print(data < 5)
data[data<5]=0
print(data)
print(data.loc['B',['Two','Four']])
print(data.ix[['A','B'],[3,0,1]])


     One    Two  Three   Four
A   True   True   True   True
B   True  False  False  False
C  False  False  False  False
D  False  False  False  False
   One  Two  Three  Four
A    0    0      0     0
B    0    5      6     7
C    8    9     10    11
D   12   13     14    15
Two     5
Four    7
Name: B, dtype: int64
   Four  One  Two
A     0    0    0
B     7    0    5


In [194]:
data.iloc[2]

One       8
Two       9
Three    10
Four     11
Name: C, dtype: int64

In [195]:
data.loc[:'C','Two']

A    0
B    5
C    9
Name: Two, dtype: int64

In [198]:
data.ix[data.Three > 5,:3]

Unnamed: 0,One,Two,Three
B,0,5,6
C,8,9,10
D,12,13,14


In [200]:
# Arithmetic and Data Alignment

s1 = Series([2.5,-.4,6.0,-9.1],index=['w','x','y','z'])
s2 = Series([4.5,-9.2,2.0,-4.1,8.3],index=['a','b','x','y','z'])
print(s1)
print(s2)
print(s1+s2)
df1 = DataFrame(np.arange(9.0).reshape((3,3)),columns=list('bde'),index=['F','U','C'])
df2 = DataFrame(np.arange(12.0).reshape((4,3)),columns=list('bcd'),index=['F','C','K','U'])
print(df1)
print(df2)
print(df1 + df2)

w    2.5
x   -0.4
y    6.0
z   -9.1
dtype: float64
a    4.5
b   -9.2
x    2.0
y   -4.1
z    8.3
dtype: float64
a    NaN
b    NaN
w    NaN
x    1.6
y    1.9
z   -0.8
dtype: float64
     b    d    e
F  0.0  1.0  2.0
U  3.0  4.0  5.0
C  6.0  7.0  8.0
     b     c     d
F  0.0   1.0   2.0
C  3.0   4.0   5.0
K  6.0   7.0   8.0
U  9.0  10.0  11.0
      b   c     d   e
C   9.0 NaN  12.0 NaN
F   0.0 NaN   3.0 NaN
K   NaN NaN   NaN NaN
U  12.0 NaN  15.0 NaN


In [201]:
df1 = DataFrame(np.arange(12.0).reshape((3,4)),columns=list('abcd'))
df2 = DataFrame(np.arange(20.0).reshape((4,5)),columns=list('abcde'))
print(df1)
print(df2)
print(df1 + df2)


     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0  11.0  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


In [202]:
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [203]:
df1.reindex(columns=df2.columns,fill_value=0)


Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [204]:
# Operations between Series and DataFrame

arr = np.arange(12.).reshape((3,4))
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [205]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [206]:
print(arr - arr[0])
frame = DataFrame(np.arange(16).reshape((4,4)),columns=list('abcd'),index=['W','X','Y','Z'])
series = frame.iloc[0]
print(frame)
print(series)

[[ 0.  0.  0.  0.]
 [ 4.  4.  4.  4.]
 [ 8.  8.  8.  8.]]
    a   b   c   d
W   0   1   2   3
X   4   5   6   7
Y   8   9  10  11
Z  12  13  14  15
a    0
b    1
c    2
d    3
Name: W, dtype: int64


In [207]:
print(frame - series)

    a   b   c   d
W   0   0   0   0
X   4   4   4   4
Y   8   8   8   8
Z  12  12  12  12


In [210]:
series2 = Series(range(3),index=['a','b','c'])
frame + series2

Unnamed: 0,a,b,c,d
W,0.0,2.0,4.0,
X,4.0,6.0,8.0,
Y,8.0,10.0,12.0,
Z,12.0,14.0,16.0,


In [211]:
series3 = frame['d']
print(series3)
print(frame)

W     3
X     7
Y    11
Z    15
Name: d, dtype: int64
    a   b   c   d
W   0   1   2   3
X   4   5   6   7
Y   8   9  10  11
Z  12  13  14  15


In [212]:
frame.sub(series3,axis=0)

Unnamed: 0,a,b,c,d
W,-3,-2,-1,0
X,-3,-2,-1,0
Y,-3,-2,-1,0
Z,-3,-2,-1,0


In [213]:
# Function Application and Mapping

frame = DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
print(frame)
print(np.abs(frame))

               b         d         e
Utah    1.414486 -2.446000  1.467773
Ohio   -0.348590 -0.975864 -1.630207
Texas   1.234975 -0.249116 -0.064379
Oregon -1.266157  0.184996 -0.781492
               b         d         e
Utah    1.414486  2.446000  1.467773
Ohio    0.348590  0.975864  1.630207
Texas   1.234975  0.249116  0.064379
Oregon  1.266157  0.184996  0.781492


In [214]:
f = lambda x: x.max() - x.min()
print(frame.apply(f))
print(frame.apply(f,axis=1))

b    2.680643
d    2.630997
e    3.097980
dtype: float64
Utah      3.913773
Ohio      1.281617
Texas     1.484091
Oregon    1.451153
dtype: float64


In [215]:
def f(x):
    return Series([x.min(),x.max()],index=['min','max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,-1.266157,-2.446,-1.630207
max,1.414486,0.184996,1.467773


In [216]:
format = lambda x: '%.2f' %x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,1.41,-2.45,1.47
Ohio,-0.35,-0.98,-1.63
Texas,1.23,-0.25,-0.06
Oregon,-1.27,0.18,-0.78


In [217]:
frame['e'].map(format)

Utah       1.47
Ohio      -1.63
Texas     -0.06
Oregon    -0.78
Name: e, dtype: object

In [226]:
# Sorting and Ranking

obj = Series(range(4),index=['d','a','b','c'])
print(obj.sort_index())

frame  = DataFrame(np.arange(8).reshape((2,4)),index=['Three','Two'],columns=['d','a','b','c'])
print(frame.sort_index())
print(frame.sort_index(axis=1))
print(frame.sort_index(axis=1,ascending=False))

#obj2 = Series([4,7,-3,2])
#print(obj2.order)

#obj3 = Series([4,np.nan,7,np.nan,-3,2])
#print(obj3.order)


frame = DataFrame({'b':[4,6,8,-4,-7],'a':[0,5,-4,-2,-5]})
print(frame)

print(frame.sort_index(by='b'))

a    1
b    2
c    3
d    0
dtype: int64
       d  a  b  c
Three  0  1  2  3
Two    4  5  6  7
       a  b  c  d
Three  1  2  3  0
Two    5  6  7  4
       d  c  b  a
Three  0  3  2  1
Two    4  7  6  5
   a  b
0  0  4
1  5  6
2 -4  8
3 -2 -4
4 -5 -7
   a  b
4 -5 -7
3 -2 -4
0  0  4
1  5  6
2 -4  8




In [230]:
print(frame.sort_index(by=['a','b']))

   a  b
4 -5 -7
2 -4  8
3 -2 -4
0  0  4
1  5  6


  """Entry point for launching an IPython kernel.


In [231]:
obj = Series([3.5,-0.5,-.8,1.6,5.8,6.1])
obj.rank()
print(obj.rank(method='first'))
print(obj.rank(ascending=False,method='max'))
frame = DataFrame({'b':[4.3,4.2,5.8,-5.5],'a':[2.5,4.6,-5.8,9.2],'c':[5.6,8.3,0.8,-9.1]})

print(frame)
print(frame.rank(axis=1))

0    4.0
1    2.0
2    1.0
3    3.0
4    5.0
5    6.0
dtype: float64
0    3.0
1    5.0
2    6.0
3    4.0
4    2.0
5    1.0
dtype: float64
     a    b    c
0  2.5  4.3  5.6
1  4.6  4.2  8.3
2 -5.8  5.8  0.8
3  9.2 -5.5 -9.1
     a    b    c
0  1.0  2.0  3.0
1  2.0  1.0  3.0
2  1.0  3.0  2.0
3  3.0  2.0  1.0


In [232]:
obj = Series(range(5),index=['a','a','b','b','c'])
print(obj)
print(obj.index.is_unique)
print(obj['a'])
print(obj['c'])
df = DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
print(df)
print(df.ix['b'])

a    0
a    1
b    2
b    3
c    4
dtype: int64
False
a    0
a    1
dtype: int64
4
          0         1         2
a  0.973285  0.527943  0.246640
a -0.197069  0.627520  0.428421
b -0.045173 -1.192452 -0.233081
b  0.001128  0.328617  0.233689
          0         1         2
b -0.045173 -1.192452 -0.233081
b  0.001128  0.328617  0.233689


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  
