# Pandas Tutorial

In [1]:
# Importing pandas

import pandas as pd
import numpy as np

In [2]:
# Checking pandas version
pd.__version__

'1.1.3'

In [3]:
# Package description
pd?

[0;31mType:[0m        module
[0;31mString form:[0m <module 'pandas' from '/home/luan/anaconda3/lib/python3.8/site-packages/pandas/__init__.py'>
[0;31mFile:[0m        ~/anaconda3/lib/python3.8/site-packages/pandas/__init__.py
[0;31mDocstring:[0m  
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data

# Introducing Pandas Objects

## Series

In [4]:
data = pd.Series([1,2,3,4,5])
data

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
# Obtaining Series values
print(data.values)

[1 2 3 4 5]


In [6]:
# Obtaining the index
data.index


RangeIndex(start=0, stop=5, step=1)

In [7]:
# Accessing a single element of the Series
print('Element {} from Series: {}\n'.format(0,data[0]))
# Using slices: data[0:4] returns the first four elements (the end is exclusive)
print('Series Elements:\n')
print(data[0:4])

Element 0 from Series: 1

Series Elements:

0    1
1    2
2    3
3    4
dtype: int64


## Series as a generalization of numpy array

In [8]:
data = pd.Series([5,10,15,20,25],index = ['a','b','c','d','e'])
data

a     5
b    10
c    15
d    20
e    25
dtype: int64

In [9]:
# Accessing value by index
data['a']

5

In [10]:
# The indices can be non sequential
data.index = ['a',1,'b',2,'c']
print(data['a'],data[1],data['b'],data[2],data['c'])

5 10 15 20 25


### Series as specialized dictionary

In [11]:
population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
population = pd.Series(population_dict)

In [12]:
population['California']

38332521

In [13]:
# A Series also supports slicing by non-numeric labels; note that the end
# label ('Florida') is included in the result
population['California':'Florida']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64

In [14]:
# By default, index is a integer sequence
pd.Series(range(2,7,2))

0    2
1    4
2    6
dtype: int64

In [15]:
# If data is a single value and multiple indices are given, the value is repeated to fill the indexes
pd.Series(10,index = range(3))

0    10
1    10
2    10
dtype: int64

In [16]:
# As seen before, data can be a dictionary; the index argument then selects
# which keys are kept and in which order they appear
pd.Series({2:'a',1:'b',3:'c'},index = [3,2])


3    c
2    a
dtype: object

## The Pandas DataFrame Object

In [17]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
'Florida': 170312, 'Illinois': 149995}

In [18]:
area = pd.Series(area_dict)

In [19]:
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [20]:
states = pd.DataFrame({'population':population,'area':area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [21]:
#accessing the indices:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [22]:
# accessing the columns
states.columns

Index(['population', 'area'], dtype='object')

### DataFrame as specialized dictionary.

In [23]:
print('state área: \n')
states['area']

state área: 



California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### Constructing DataFrame objects

In [24]:
# From a single series
pd.DataFrame(population,columns = ['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [25]:
# From a list of dicts

data = [{'a':i,'b':2*i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [26]:
# If some keys in the dictionary are missing, Pandas will fill them with NaN

pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [27]:
# From a dictionary of Series objects
pd.DataFrame({'population': population,
'area': area})

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [28]:
# From a two-dimensional NumPy array, with explicit column and index labels
# (the values are random and unseeded, so they differ on every run)
pd.DataFrame(np.random.rand(3,2),columns = ['foo','bar'],index = ['a','b','c'])

Unnamed: 0,foo,bar
a,0.744678,0.623582
b,0.092961,0.532343
c,0.029457,0.884964


In [29]:
# From a NumPy structured array
A = np.zeros(3,dtype = [('A','i8'),('B','f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [30]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


## Pandas Index Object

In [31]:
# Index => immutable array or ordered set

In [32]:
ind = pd.Index([2,3,4,5,7,11])

### Index as immutable array

In [33]:
# its access is equal to ordinary lists
ind[1]

3

In [34]:
# Is also possible to perform array slicing
ind[::-1]

Int64Index([11, 7, 5, 4, 3, 2], dtype='int64')

In [35]:
# The main difference is that Index objects are immutable: item assignment
# raises TypeError. The error is caught and printed so that this deliberate
# failure does not abort a "Restart & Run All" of the notebook.
try:
    ind[1]=0
except TypeError as e:
    print("TypeError:", e)

TypeError: Index does not support mutable operations

### Index as ordered set

In [72]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])
# Set operations spelled with Python operators.
# NOTE(review): newer pandas releases deprecate using &, | and ^ as set
# operations on Index objects -- confirm against the installed version and
# prefer the intersection()/union()/symmetric_difference() methods.
print('Intersection: ',indA & indB) # intersection
print('Union: ',indA | indB) # union
print('Symmetric Difference: ',indA^indB) # symmetric difference

Intersection:  Int64Index([3, 5, 7], dtype='int64')
Union:  Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Symmetric Difference:  Int64Index([1, 2, 9, 11], dtype='int64')


In [73]:
#Using methods
print('Intersection:    ',indA.intersection(indB))
print('Union:   ',indA.union(indB))
print('Symmetric Difference:    ',indA.symmetric_difference(indB))

Intersection:     Int64Index([3, 5, 7], dtype='int64')
Union:    Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Symmetric Difference:     Int64Index([1, 2, 9, 11], dtype='int64')


# Data Indexing and Selection

In [74]:
data = pd.Series([0.25,0.5,0.75,1],index = ['a','b','c','d'])

In [75]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [76]:
# is possible to check indices using dictionary like expressions
print('a' in data)
print(data.keys())
print(list(data.items()))

True
Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]


## Series as one-dimensional array

In [77]:
# slicing is possible by explicit index(inclusive)
print(data['a':'c'])
# is also possible to slice by implicit integer index(exclusive)
print(data[0:2])
# masking
print(data[(data>0.3)&(data<0.6)])

a    0.25
b    0.50
c    0.75
dtype: float64
a    0.25
b    0.50
dtype: float64
b    0.5
dtype: float64


## Indexers: loc and iloc (ix was removed in pandas 1.0)


In [78]:
# To avoid confusion between implicit and explicit indexing, pandas provides
# indexer attributes that expose each indexing scheme explicitly
data = pd.Series(['a','b','c'],index = [1,3,5])
# plain indexing uses the explicit index (the label 1)
print('indexing:\n',data[1])
# plain slicing uses the implicit positional index (positions 1 and 2)
print('slicing:\n',data[1:3])

indexing:
 a
slicing:
 3    b
5    c
dtype: object


In [79]:
#.loc: explicit indexing
# iloc: implicit indexing
print(data.loc[1])
print(data.loc[1:5])

a
1    a
3    b
5    c
dtype: object


In [80]:
print(data.iloc[1])
print(data.iloc[1:5])

b
3    b
5    c
dtype: object


# Data Selection in DataFrame

## DataFrame as a dictionary

In [95]:
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [96]:
# Access like dictionary
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [97]:
# Access like class
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [98]:
# attribute-style and dictionary-style access return the very same object
data.area is data['area']

True

In [99]:
# NOTE(review): this evaluates to False -- `pop` appears to collide with the
# DataFrame.pop method, so attribute access does not reach the 'pop' column;
# prefer data['pop'] for columns whose names clash with DataFrame attributes
data.pop is data['pop']

False

In [102]:
# is possible to add another column by using a
# dictionary-like structure
data['density'] = data['pop']/data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


## DataFrame as two-dimensional array

In [105]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [106]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [107]:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [111]:
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [112]:
data.loc[:'Illinois',:'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [115]:
data.loc[data.density>100,['pop','density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [120]:
print(data)
data.iloc[0,2] = 90
print('\n')
print('After the indexing assignment: \n')
print(data)

              area       pop     density
California  423967  38332521   90.000000
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


After the indexing assignment: 

              area       pop     density
California  423967  38332521   90.000000
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


## Additional indexing conventions

In [129]:
print('slicing refers to rows: \n')
print(data['Florida':'Illinois'])
print('\n')
print(data[3:])
print('\n')
print(data[data.density>100])

slicing refers to rows: 

            area       pop     density
Florida   170312  19552860  114.806121
Illinois  149995  12882135   85.883763


            area       pop     density
Florida   170312  19552860  114.806121
Illinois  149995  12882135   85.883763


            area       pop     density
New York  141297  19651127  139.076746
Florida   170312  19552860  114.806121


# Operating on Data in Pandas

## Ufuncs: Index Preservation

In [37]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [38]:
df = pd.DataFrame(rng.randint(0,10,(3,4)),columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [41]:
#applying ufuncs to series also returns a series object
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [42]:
# The same holds to dataframes
np.sin(df*np.pi/4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


## UFuncs: Index Alignment

### Index alignment in Series

In [45]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127}, name='population')

In [49]:
# Calculating the density.
# The result's index is the union of both indices; any label that is missing
# from either Series yields NaN.
population/area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [48]:
# Checking the union set. Index.union() is used instead of the `|` operator,
# whose use as a set operation on Index objects is deprecated in newer pandas.
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [51]:
# Another example
A = pd.Series(np.random.randint(1,10,3),index = [0,1,2])
B = pd.Series(np.random.randint(1,10,3),index = [1,2,3])
A+B

0     NaN
1     8.0
2    14.0
3     NaN
dtype: float64

In [64]:
#To fill NaN values, one should use:
print('A = \n')
print(A)
print('\n')
print('B = \n')
print(B)
print('\n')
print('A+B = \n')
A.add(B,fill_value = 0)

A = 

0    6
1    3
2    7
dtype: int64


B = 

1    5
2    7
3    4
dtype: int64


A+B = 



0     6.0
1     8.0
2    14.0
3     4.0
dtype: float64

## Index Alignment in DataFrame

In [38]:
A = pd.DataFrame(np.random.randint(0, 20, (2, 2)),
columns=list('AB'))

In [40]:
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)),
columns=list('BAC'))

In [41]:
A+B

Unnamed: 0,A,B,C
0,20.0,21.0,
1,8.0,14.0,
2,,,


In [46]:
fill = A.stack().mean()
A.add(B,fill_value = fill)

Unnamed: 0,A,B,C
0,20.0,21.0,14.0
1,8.0,14.0,16.0
2,20.0,19.0,15.0


### Ufuncs: Operations Between DataFrames and Series

In [48]:
A = np.random.randint(10,size = (3,4))
df = pd.DataFrame(A,columns = list('QRST'))

In [51]:
#subtracting the dataframe by the 1st row
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-6,-1,-1,0
2,-2,4,4,-2


In [52]:
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,4,0,3,4
1,-1,0,3,5
2,-2,0,3,-2


In [53]:
halfrow = df.iloc[0, ::2]
halfrow

Q    6
S    5
Name: 0, dtype: int64

In [54]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-6.0,,-1.0,
2,-2.0,,4.0,


## Missing Data in Pandas

In [55]:
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [56]:
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

dtype = object
68.9 ms ± 3.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
2.5 ms ± 181 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)



In [57]:
# Summing an object array that contains None fails because int + None is
# undefined. The error is caught and printed so that this deliberate failure
# does not abort a "Restart & Run All" of the notebook.
try:
    vals1.sum()
except TypeError as e:
    print("TypeError:", e)

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [60]:
vals2 = np.array([1,np.nan,3,4])
vals2.dtype

dtype('float64')

In [63]:
# Nan  with other operations
print(1+np.nan)
print(0*np.nan)
print(vals2.sum())
print(vals2.max())
print(vals2.min())

nan
nan
nan
nan
nan


In [64]:
# Nan safe operations
np.nansum(vals2),np.nanmin(vals2),np.nanmax(vals2)


(8.0, 1.0, 4.0)

### NaN and None in Pandas

In [65]:
pd.Series([1,np.nan,2,None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [66]:
x = pd.Series(range(2), dtype=int)
x

0    0
1    1
dtype: int64

In [67]:
x[0] = None
x

0    NaN
1    1.0
dtype: float64

In [69]:
'''
Pandas handling of NAs by type

Typeclass |  Conversion  |    sentinel value
---------------------------------------------
floating  |  No change   |    np.nan
object    |  No change   |    None or np.nan
integer   |  float64     |    np.nan
boolean   |  object      |    None or np.nan

'''

'\nPandas handling of NAs by type\n\nTypeclass |  Conversion  |    sentinel value\n---------------------------------------------\nfloating  |  No change   |    np.nan\nobject    |  No change   |    None or np.nan\ninteger   |  float64     |    np.nan\nboolean   |  object      |    None or np.nan\n\n'

In [None]:
'''
isnull(): Generate a Boolean mask indicating missing values

notnull(): Opposite of isnull()

fillna():  Return a copy of the data with missing
values filled or imputed

dropna():  Return a filtered version of the data



'''

### Detecting null values

In [46]:
# A Series mixing a number, a string and both missing-value sentinels
data = pd.Series([1,np.nan,'hello',None])
# isnull() returns a boolean mask that is True where the value is NaN or None
print('Using isnull():\n')

print(data.isnull())

Using isnull():

0    False
1     True
2    False
3     True
dtype: bool


In [43]:
# Masking with isnull()
print("Null data :\n")
data[data.isnull()]

Null data :



1     NaN
3    None
dtype: object

In [44]:
# Masking with notnull()
print("Not Null data: \n")
print(data[data.notnull()])

Not Null data: 

0        1
2    hello
dtype: object


### Dropping null values

In [47]:
data.dropna()

0        1
2    hello
dtype: object

In [49]:
# DataFrame example
df = pd.DataFrame([[1,
np.nan, 2],
[2,
3,
5],
[np.nan, 4,
6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [52]:
# By default dropna removes every row that contains at least one NaN;
# with axis='columns' it removes every column that contains one instead
print("Row case: \n")
print(df.dropna())
print("\n")
print("Column case: \n")
print(df.dropna(axis = 'columns'))

Row case: 

     0    1  2
1  2.0  3.0  5


Column case: 

   2
0  2
1  5
2  6


In [78]:
# However, is possible to tune dropna parameters
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [79]:
#Ex1
df.dropna(axis = 'columns',how = 'all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [80]:
#Ex2
df.dropna(axis = 'rows',thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


### Filling null values

In [81]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [82]:
# Method 1
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [83]:
# Method 2: forward fill -- propagate the previous valid value forward.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in newer pandas releases.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [84]:
# Method 3: backward fill -- propagate the next valid value backward.
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in newer pandas releases.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [86]:
# DataFrame examples
# Ex1: forward fill along each row (axis=1). Remember to specify the axis;
# a leading NaN in a row stays NaN because there is no earlier value to copy.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in newer pandas releases.
df.ffill(axis = 1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


## Hierarchical Indexing

In [89]:
index = [('California', 2000), ('California', 2010),
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
18976457, 19378102,
20851820, 25145561]
# the bad way....
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [90]:
# Slicing......
pop[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [91]:
# Selecting 2010 data (compromises readability)
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

## The better way: Pandas MultiIndex

In [93]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [97]:
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [98]:
# Accessing populations in 2010
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [99]:
# Converting a MultiIndex Series into a DataFrame: the inner level (the
# year) becomes the columns
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [155]:
#stack is the inverse operation
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [157]:
pop_df = pd.DataFrame({'total': pop,
'under18': [9267089, 9284094,
4687374, 4318033,
5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [159]:
f_u18 = pop_df['under18']/pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


## Methods of MultiIndex Creation

In [166]:
# Method 1
df = pd.DataFrame(np.random.rand(4, 2),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.582391,0.657958
a,2,0.420886,0.787493
b,1,0.339047,0.404865
b,2,0.620584,0.379435


In [167]:
# Method 2: with dictionaries
data = {('California', 2000): 33871648,
('California', 2010): 37253956,
('Texas', 2000): 20851820,
('Texas', 2010): 25145561,
('New York', 2000): 18976457,
('New York', 2010): 19378102}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

### Explicit MultiIndex Constructors

In [170]:
# Using Arrays
pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [171]:
# Using Tuples
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [172]:
# Using cartesian product
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [177]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [189]:
# From DataFrame
df = pd.DataFrame([['a', '1'], ['a','2'],['b', '1'], ['b', '2']],columns=['a', 'b'])
pd.MultiIndex.from_frame(df)

MultiIndex([('a', '1'),
            ('a', '2'),
            ('b', '1'),
            ('b', '2')],
           names=['a', 'b'])

### MultiIndex level names

In [193]:
pop.index.names = ['state','year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

### MultiIndex for columns

In [194]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
names=['subject', 'type'])

In [197]:
index,columns

(MultiIndex([(2013, 1),
             (2013, 2),
             (2014, 1),
             (2014, 2)],
            names=['year', 'visit']),
 MultiIndex([(  'Bob',   'HR'),
             (  'Bob', 'Temp'),
             ('Guido',   'HR'),
             ('Guido', 'Temp'),
             (  'Sue',   'HR'),
             (  'Sue', 'Temp')],
            names=['subject', 'type']))

In [198]:
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

In [199]:
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,54.0,35.2,37.0,37.9,33.0,35.5
2013,2,50.0,37.2,38.0,38.0,32.0,38.8
2014,1,42.0,37.9,38.0,37.8,28.0,39.1
2014,2,53.0,36.7,7.0,36.3,40.0,37.1


In [201]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,37.0,37.9
2013,2,38.0,38.0
2014,1,38.0,37.8
2014,2,7.0,36.3


## Indexing and Slicing a MultiIndex

### Multiply Indexed Series

In [202]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [206]:
# Population in California, year 2000
pop['California',2000]

33871648

In [210]:
#Partial slicing - California and New York in 2000
pop.loc['California':'New York',2000]

state       year
California  2000    33871648
New York    2000    18976457
dtype: int64

In [211]:
# All states in 2000:
pop[:,2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [212]:
pop[pop>22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [216]:
# Fancy indexing
pop[['California','Texas']]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

### Multiple indexed DataFrames

In [220]:
health_data['Guido','HR']

year  visit
2013  1        37.0
      2        38.0
2014  1        38.0
      2         7.0
Name: (Guido, HR), dtype: float64

In [230]:
health_data.iloc[:2,:2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,54.0,35.2
2013,2,50.0,37.2


In [232]:
health_data.loc[:, ('Bob', 'Temp')]

year  visit
2013  1        35.2
      2        37.2
2014  1        37.9
      2        36.7
Name: (Bob, Temp), dtype: float64

In [239]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,54.0,37.0,33.0
2014,1,42.0,38.0,28.0


### Sorted and unsorted indices

In [250]:
# Many MultiIndex slicing operations fail if the index is not sorted;
# here the outer level is deliberately built out of order ('a', 'c', 'b')
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.585533
      2      0.588832
c     1      0.565968
      2      0.743862
b     1      0.735340
      2      0.579784
dtype: float64

In [245]:
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [254]:
data = data.sort_index()
data

char  int
a     1      0.585533
      2      0.588832
b     1      0.735340
      2      0.579784
c     1      0.565968
      2      0.743862
dtype: float64

In [255]:
# With sorted data, slicing works
data['a':'b']

char  int
a     1      0.585533
      2      0.588832
b     1      0.735340
      2      0.579784
dtype: float64

### Stacking and Unstacking indices

In [275]:
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [264]:
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


### Index setting and resetting

In [267]:
pop_flat = pop.reset_index(name = 'population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [268]:
pop_flat.set_index(['state','year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


## Data Aggregations on Multi-Indices

In [272]:
# Average the visits within each year. groupby(level=...) replaces the
# `level` argument of mean(), which is deprecated in newer pandas releases.
data_mean = health_data.groupby(level = 'year').mean()
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,52.0,36.2,37.5,37.95,32.5,37.15
2014,47.5,37.3,22.5,37.05,34.0,38.1


In [279]:
# Average the HR and Temp measurements of each subject (a column-wise
# aggregation). The frame is transposed so that the column level can be
# grouped with groupby(level=...), then transposed back; this replaces
# mean(axis=1, level=...), whose `level` argument is deprecated in newer
# pandas releases.
data_mean = health_data.T.groupby(level = 'subject').mean().T
data_mean

Unnamed: 0_level_0,subject,Bob,Guido,Sue
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,1,44.6,37.45,34.25
2013,2,43.6,38.0,35.4
2014,1,39.95,37.9,33.55
2014,2,44.85,21.65,38.55


## Combining Datasets: Concat and Append

In [281]:
def make_df(cols,ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Every element ``c`` of ``cols`` becomes a column, and the cell in the row
    labelled ``i`` holds the string ``str(c) + str(i)`` (for example ``'A0'``).
    ``ind`` supplies the row labels.
    """
    columns = {}
    for label in cols:
        # one list of "<column><row>" strings per column
        columns[label] = [str(label) + str(row) for row in ind]
    return pd.DataFrame(columns, ind)

In [286]:
ser1 = pd.Series(['A','B','C'],index = [1,2,3])
ser2 = pd.Series(['D','E','F'],index = [4,5,6])
pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [287]:
# This also works with dataframes
df1 = make_df('AB',[1,2])
df2 = make_df('AB',[3,4])
print(df1);print(df2);print(pd.concat([df1,df2]))

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [289]:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3); print(df4); print(pd.concat([df3, df4], axis=1))

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1


In [298]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index

In [299]:
pd.concat([x,y])

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


In [301]:
#Catching the repeats as an error
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)

ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [303]:
# We can also skip the repeated index 
print(x); print(y); print(pd.concat([x, y], ignore_index=True))

    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3


In [307]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5); 
print(df6);
print(pd.concat([df5, df6]))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4
