# Introducing pandas objects

In [2]:
import numpy as np
import pandas as pd

In [4]:
# With no explicit index, a Series built from a list is labelled 0..n-1.
data = pd.Series(data=[0.25, 0.50, 0.75, 1.00])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [10]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [12]:
data[1]

0.5

In [14]:
data[1:3]

1    0.50
2    0.75
dtype: float64

# Series as generalized NumPy array

In [5]:
# Same values as before, now labelled with strings instead of positions.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
data


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [19]:
data['b']

0.5

In [6]:
# Index labels need not be contiguous or start at zero.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=pd.Index([2, 3, 5, 7]),
)
data

2    0.25
3    0.50
5    0.75
7    1.00
dtype: float64

In [23]:
data[5]

0.75

# Series as specialized dictionary

In [7]:
# State -> population mapping. Passing a dict to pd.Series turns the keys
# into the index and the values into the data, preserving insertion order.
population_dict = dict([
    ('California', 388233457),
    ('Texas', 45783920),
    ('New York', 23553759),
    ('Florida', 345627956),
    ('Illinois', 1234356738),
])
population = pd.Series(data=population_dict)
population

California     388233457
Texas           45783920
New York        23553759
Florida        345627956
Illinois      1234356738
dtype: int64

In [44]:
population['California']

388233457

In [45]:
population['California' : 'Florida']

California    388233457
Texas          45783920
New York       23553759
Florida       345627956
dtype: int64

# Constructing Series objects

In [8]:
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [47]:
pd.Series(5, index = [100,200,300])

100    5
200    5
300    5
dtype: int64

In [48]:
pd.Series({2 : 'a', 3 : 'b', 1 : 'c'})

2    a
3    b
1    c
dtype: object

In [49]:
pd.Series({2 : 'a', 3 : 'b', 4 : 'c'}, index = [3,2])

3    b
2    a
dtype: object

# Pandas dataframe object

# Dataframe as generalized numpy array

In [9]:
# State -> area mapping, keyed by the same states as the population data
# so the two Series can be aligned on their index.
area_dict = dict([
    ('California', 425275),
    ('Texas', 233478),
    ('New York', 126748),
    ('Florida', 456270),
    ('Illinois', 132546),
])
area = pd.Series(data=area_dict)
area

California    425275
Texas         233478
New York      126748
Florida       456270
Illinois      132546
dtype: int64

In [10]:
# Each Series becomes one column; rows are aligned on the shared state index.
status = pd.DataFrame(dict(Population=population, Area=area))
status

Unnamed: 0,Population,Area
California,388233457,425275
Texas,45783920,233478
New York,23553759,126748
Florida,345627956,456270
Illinois,1234356738,132546


In [55]:
status.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [57]:
status.columns

Index(['Population', 'Area'], dtype='object')

# DataFrame as specialized dictionary

In [60]:
status['Area']

California    425275
Texas         233478
New York      126748
Florida       456270
Illinois      132546
Name: Area, dtype: int64

# Constructing DataFrame objects

# From a single series object

In [11]:
# Wrap the single `population` Series in a one-column DataFrame; `columns`
# supplies the label shown for that column in the output.
pd.DataFrame(population, columns = ['Population'])

Unnamed: 0,Population
California,388233457
Texas,45783920
New York,23553759
Florida,345627956
Illinois,1234356738


# From a list of dictionaries


In [64]:
# One dict per row: the keys become columns, the list position becomes the index.
data = [dict(a=i, b=2 * i) for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [67]:
pd.DataFrame([{'a' : 1, 'b' : 2}, {'b' : 3, 'c' : 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


# From a two dimensional numpy array

In [12]:
# Wrap a 3x2 array of random draws in a DataFrame with explicit column and
# row labels.
# NOTE(review): no seed is set in the visible cells, so the displayed values
# will differ on every run.
pd.DataFrame(np.random.rand(3,2),
            columns = ['foo','bar'],
            index = ['a','b','c'])

Unnamed: 0,foo,bar
a,0.01406,0.812355
b,0.820021,0.949485
c,0.487859,0.964397


# From a numpy structured array

In [71]:
# Structured array with an 8-byte integer field 'A' and an 8-byte float
# field 'B', zero-initialised; the field names become DataFrame columns below.
A = np.zeros(shape=3, dtype=np.dtype([('A', 'i8'), ('B', 'f8')]))
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [74]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


# The dataframe index object

In [76]:
# A standalone Index object, built the same way as a Series from a list.
ind = pd.Index(data=[2, 3, 5, 7])
ind

Int64Index([2, 3, 5, 7], dtype='int64')

In [78]:
ind[1]

3

In [80]:
ind[::2]

Int64Index([2, 5], dtype='int64')

In [82]:
print(ind.size,ind.shape,ind.ndim,ind.dtype)

4 (4,) 1 int64


In [84]:
# An Index is immutable, so item assignment raises TypeError. The error is
# the point of this cell: catch it and print its message so the notebook
# still survives "Restart & Run All".
try:
    ind[1] = 0
except TypeError as err:
    print(f"{type(err).__name__}: {err}")
# The index is unchanged after the failed assignment.
ind

TypeError: Index does not support mutable operations

# Index as ordered set

In [86]:
# Two overlapping indexes used to demonstrate set-style operations.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
# Use the named method rather than `&`: the operator form is deprecated as a
# set operation (it emitted a FutureWarning here) and is treated as an
# element-wise logical operation by current pandas.
indA.intersection(indB)


Int64Index([3, 5, 7], dtype='int64')

In [88]:
# Union of the two indexes. The `|` operator is deprecated as a set
# operation on Index objects, so call the method explicitly.
indA.union(indB)


Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [90]:
# Labels present in exactly one of the two indexes. The `^` operator is
# deprecated as a set operation on Index objects, so call the method explicitly.
indA.symmetric_difference(indB)


Int64Index([1, 2, 9, 11], dtype='int64')

In [92]:
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

# Data indexing and selection

# Data selection in series

In [13]:
import pandas as pd
# Letter-labelled Series reused throughout the selection examples below.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
data


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [96]:
data['b']

0.5

In [98]:
'a' in data

True

In [100]:
# `keys` is a method: it must be called to get the index labels. Without the
# parentheses the cell only displays the bound-method object.
data.keys()

In [102]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [104]:
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

# Series as one-dimensional array

In [106]:
data['a' : 'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [108]:
data[0:2]

a    0.25
b    0.50
dtype: float64

In [110]:
data[(data >0.3) & (data<0.8)]

b    0.50
c    0.75
dtype: float64

# Indexers: loc, iloc, and ix (ix has been removed from pandas)

In [14]:
import pandas as pd
import numpy as np
# Integer labels that do not match positions (1, 3, 5 vs. 0, 1, 2) make the
# difference between label-based and position-based access visible.
data = pd.Series(list('abc'), index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [15]:
data[1:3]

3    b
5    c
dtype: object

In [16]:
data.loc[1]

'a'

In [17]:
data.loc[1:3]

1    a
3    b
dtype: object

In [18]:
# Six letters labelled 1..6, so every label is one greater than its position.
data = pd.Series(list('abcdef'), index=list(range(1, 7)))
data

1    a
2    b
3    c
4    d
5    e
6    f
dtype: object

In [19]:
data.loc[1:5] # Explicit index

1    a
2    b
3    c
4    d
5    e
dtype: object

In [20]:
data.iloc[:5] # implicit index

1    a
2    b
3    c
4    d
5    e
dtype: object

In [21]:
data.iloc[::2]

1    a
3    c
5    e
dtype: object

# Data selection in dataframe

In [22]:
# Combine the two state-indexed Series into one frame, Area column first.
data = pd.DataFrame(dict(Area=area, Population=population))
data

Unnamed: 0,Area,Population
California,425275,388233457
Texas,233478,45783920
New York,126748,23553759
Florida,456270,345627956
Illinois,132546,1234356738


In [131]:
data['Area']

California    425275
Texas         233478
New York      126748
Florida       456270
Illinois      132546
Name: Area, dtype: int64

In [134]:
data.Area

California    425275
Texas         233478
New York      126748
Florida       456270
Illinois      132546
Name: Area, dtype: int64

In [136]:
data.Population

California     388233457
Texas           45783920
New York        23553759
Florida        345627956
Illinois      1234356738
Name: Population, dtype: int64

In [138]:
# Compare attribute-style and dictionary-style column access by identity.
# NOTE(review): `is` tests whether both accesses return the very same object,
# which presumably depends on how the installed pandas caches columns --
# confirm the result on the pandas version in use.
data.Area is data['Area']

True

In [140]:
# Same identity comparison for the Population column.
# NOTE(review): the result presumably depends on pandas' internal column
# caching -- confirm on the pandas version in use.
data.Population is data['Population']

True

In [23]:
# Derive a new column element-wise from two existing ones (people per unit area).
data['density'] = data['Population'].div(data['Area'])
data

Unnamed: 0,Area,Population,density
California,425275,388233457,912.899787
Texas,233478,45783920,196.095221
New York,126748,23553759,185.831406
Florida,456270,345627956,757.50752
Illinois,132546,1234356738,9312.666833


# Dataframe as two dimensional array

In [24]:
data.values

array([[4.25275000e+05, 3.88233457e+08, 9.12899787e+02],
       [2.33478000e+05, 4.57839200e+07, 1.96095221e+02],
       [1.26748000e+05, 2.35537590e+07, 1.85831406e+02],
       [4.56270000e+05, 3.45627956e+08, 7.57507520e+02],
       [1.32546000e+05, 1.23435674e+09, 9.31266683e+03]])

In [25]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
Area,425275.0,233478.0,126748.0,456270.0,132546.0
Population,388233500.0,45783920.0,23553760.0,345628000.0,1234357000.0
density,912.8998,196.0952,185.8314,757.5075,9312.667


In [26]:
data.iloc[:3,:2]

Unnamed: 0,Area,Population
California,425275,388233457
Texas,233478,45783920
New York,126748,23553759


In [27]:
data.loc[:'Illinois',:'Population']

Unnamed: 0,Area,Population
California,425275,388233457
Texas,233478,45783920
New York,126748,23553759
Florida,456270,345627956
Illinois,132546,1234356738


In [32]:
# `.ix` has been removed from pandas (the original cell raised AttributeError).
# To mix positional rows with a labelled column, translate the column label
# to its integer position and select purely by position with `.iloc`.
data.iloc[:3, data.columns.get_loc('Population')]

In [34]:
data.loc[data.density > 100, ['Population','density']]

Unnamed: 0,Population,density
California,388233457,912.899787
Texas,45783920,196.095221
New York,23553759,185.831406
Florida,345627956,757.50752
Illinois,1234356738,9312.666833


In [37]:
data.iloc[0,2] = 90
data

Unnamed: 0,Area,Population,density
California,425275,388233457,90.0
Texas,233478,45783920,196.095221
New York,126748,23553759,185.831406
Florida,456270,345627956,757.50752
Illinois,132546,1234356738,9312.666833


# Additional indexing conventions

In [39]:
data['Florida' : 'Illinois']

Unnamed: 0,Area,Population,density
Florida,456270,345627956,757.50752
Illinois,132546,1234356738,9312.666833


In [41]:
data[1:3]

Unnamed: 0,Area,Population,density
Texas,233478,45783920,196.095221
New York,126748,23553759,185.831406


In [43]:
data[data.density >100]

Unnamed: 0,Area,Population,density
Texas,233478,45783920,196.095221
New York,126748,23553759,185.831406
Florida,456270,345627956,757.50752
Illinois,132546,1234356738,9312.666833


# Handling missing data

# Missing data in pandas

In [45]:
import numpy as np
import pandas as pd
# Mixing None with integers leaves NumPy with no native numeric dtype to
# use, so the array is stored with dtype=object (see the output below).
vals1 = np.array([1,None,3,4])
vals1

array([1, None, 3, 4], dtype=object)

In [47]:
# Print the name of each dtype under comparison.
for dtype in ('object', 'int'):
    print(f'dtype : {dtype}')

dtype : object
dtype : int


# NaN : Missing numerical data

In [50]:
vals2 = np.array([1,np.nan,3,4])
vals2

array([ 1., nan,  3.,  4.])

In [52]:
vals2.sum(),vals2.min(),vals2.max()

(nan, nan, nan)

# NaN and None in pandas

In [54]:
pd.Series([1,np.nan,2,None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [56]:
x = pd.Series(range(2), dtype = int)
x

0    0
1    1
dtype: int32

In [58]:
x[0]

0

# Operating on null values

# isnull()
# notnull()
# dropna()
# fillna()

# Detecting null values

In [60]:
# Mixed-type Series containing both kinds of missing marker (NaN and None);
# the boolean mask flags both of them as null.
data = pd.Series(data=[1, np.nan, 'hello', None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [62]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [64]:
data.dropna()

0        1
2    hello
dtype: object

In [66]:
# 3x3 frame with one NaN in the first row and one in the last row; only the
# middle row and the last column are complete.
rows_with_gaps = [
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
]
df = pd.DataFrame(rows_with_gaps)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [70]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [72]:
df.dropna(axis = 'columns')

Unnamed: 0,2
0,2
1,5
2,6


In [74]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [77]:
df.dropna(axis = 'columns', how = 'all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [79]:
df.dropna(axis = 'rows', thresh = 3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


# Filling null values

In [80]:
# Numeric Series with gaps at 'b' and 'd'; both NaN and None are stored as NaN.
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [82]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [84]:
# Forward-fill: propagate the previous valid value into each gap.
# `fillna(method=...)` is deprecated in pandas; `ffill()` is the direct
# equivalent and returns the same result.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [None]:
# Back-fill: propagate the next valid value backwards into each gap.
# The original cell was left unfinished (`method = ` with no value is a
# syntax error); this completes it as the counterpart of the forward-fill cell.
data.bfill()