# MultiIndex
- A MultiIndex, also known as a multi-level index or hierarchical index, allows you to have multiple columns
acting as a row identifier, with each index column related to the next through a parent/child relationship.
- MultiIndex is an array of tuples where each tuple is unique.
- You can create a MultiIndex from a list of arrays, an array of tuples, a DataFrame, etc.
- The Index constructor will attempt to return a MultiIndex when it is passed a list of tuples.
- You can have Multi-level for both Index and Column labels.
- Multi-level columns are used when you want to group columns together.
- The reason that the MultiIndex matters is that it allows you to do grouping, selection, and reshaping operations.

## Creating Multi-index df's

#### from_tuples

In [2]:
import pandas as pd
import numpy as np
import statistics as st

In [98]:
# Two parallel label lists: position i of each list forms the i-th index key.
arrays = [
    ["A", "A", "B", "B", "C", "C", "D", "D"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]

# Pair the lists element-wise into (category, type) tuples, one per row.
tuples = [pair for pair in zip(arrays[0], arrays[1])]
index = pd.MultiIndex.from_tuples(tuples, names=["category", "type"])

# Eight random draws labelled by the two-level index built above.
values = pd.Series(data=np.random.randn(len(tuples)), index=index)
values

category  type
A         one     2.525004
          two    -0.787322
B         one     0.375261
          two    -0.170478
C         one     2.189596
          two     0.115648
D         one    -1.560539
          two    -0.681614
dtype: float64

In [163]:
# As a convenience, you can pass a list of arrays directly into Series or DataFrame to construct a MultiIndex automatically:
arrays = [
    np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]),
    np.array(["one", "two", "one", "two", "one", "two", "one", "two"]),
]

dd = pd.DataFrame(np.random.randn(8, 4), index=arrays)
dd

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-2.944273,-0.846939,0.93937,1.561242
bar,two,-0.635602,-0.339582,0.252754,-0.150529
baz,one,-0.35896,0.056918,0.604827,-0.753004
baz,two,-0.169853,-1.040823,1.155767,-0.149891
foo,one,-0.542666,-0.935615,-1.156693,0.390834
foo,two,-0.755098,0.253837,-0.711928,-1.170645
qux,one,0.585233,-0.61671,1.141497,-0.200298
qux,two,-0.971352,0.469846,1.088747,-0.375689


#### from_product()

In [100]:
# When you want every pairing of the elements in two iterables
iterables = [["A", "B", "C", "D"], ["one", "two"]]
pd.MultiIndex.from_product(iterables, names=["category", "type"])

MultiIndex([('A', 'one'),
            ('A', 'two'),
            ('B', 'one'),
            ('B', 'two'),
            ('C', 'one'),
            ('C', 'two'),
            ('D', 'one'),
            ('D', 'two')],
           names=['category', 'type'])

#### from_frame
- construct a MultiIndex from a DataFrame directly, using the method MultiIndex.from_frame(). 
- This is a complementary method to MultiIndex.to_frame().

In [3]:
idx = pd.DataFrame([["A", "one"], ["A", "two"], ["B", "one"], ["B", "two"]], columns=["category", "type"])
idx = pd.MultiIndex.from_frame(idx)
df1= pd.DataFrame(np.random.randn(4, 4), index=idx)
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
category,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,one,0.962902,-0.782445,1.305668,-0.43495
A,two,1.625498,1.954267,-1.053287,-0.168843
B,one,0.890441,-0.689902,-2.204539,-0.718295
B,two,1.779873,-0.114838,-0.210834,1.207135


In [102]:
#  the names argument stores string names for the levels themselves
df1.index.names

FrozenList(['category', 'type'])

In [103]:
# The Multi-index can back any axis of a pandas object, and the number of levels of the index is up to you:
index2 = pd.MultiIndex.from_product([["X", "Y", "Z"], ["men", "women", "divers"]], names=["bloodtype", "sex"])
df2 = pd.DataFrame(np.random.randn(9, 8), index=index2, columns=index)
df2

Unnamed: 0_level_0,category,A,A,B,B,C,C,D,D
Unnamed: 0_level_1,type,one,two,one,two,one,two,one,two
bloodtype,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
X,men,-1.319026,-1.555636,-1.638529,1.222743,0.562824,1.24117,-1.327589,-0.97168
X,women,0.113257,0.33548,-1.151749,0.414416,-0.21489,-0.088367,-0.42618,-1.476091
X,divers,-1.584692,0.080635,-0.236892,-0.343871,-2.102431,-1.561049,0.459635,-1.393493
Y,men,0.345616,1.088824,-0.967043,0.491785,-2.310619,0.224751,2.509271,1.339698
Y,women,-0.452497,-0.13374,0.665746,1.411656,-1.442076,-0.717992,1.550954,-2.255222
Y,divers,0.351113,-0.520398,-1.068307,-1.775115,-0.740112,1.031474,1.013971,0.719363
Z,men,-0.195505,1.032582,-0.681419,1.191336,0.943443,-0.484341,-2.109193,1.3851
Z,women,-0.576821,0.221417,-2.184739,0.611495,0.524643,0.607175,-0.504465,-2.362965
Z,divers,0.204473,-1.519467,1.760388,0.305826,-0.692298,0.762941,-0.976918,0.049021


### sort_index
- To be indexed and sliced effectively, MultiIndex objects need to be sorted.

In [174]:
# sort_index() returns a NEW, sorted object and leaves the original untouched,
# so the calls below only preview a result; nothing here is assigned.
dd
dd.sort_index()  # no level given: sort on all index levels, outermost first
dd.sort_index(level=1)  # sort rows on the second index level
df2.sort_index(level="sex")  # a level can also be addressed by name
df2.sort_index(level=1, axis=1)  # sort the column MultiIndex on its second level
df2.sort_index(level=1, axis=0)
# Check sortedness on the sorted result. df2 itself is built in
# ("men", "women", "divers") order and is never reassigned, so
# df2.index.is_monotonic_increasing is False after Restart & Run All.
df2.sort_index().index.is_monotonic_increasing

True

## Indexing & labels

In [104]:
# We’ve “sparsified” the higher levels of the indexes to make the console output a bit easier on the eyes. 
# How the index is displayed is controlled by the "display.multi_sparse" option, set globally with
# pandas.set_option() or, as here, temporarily with pandas.option_context().
# print() is needed because the frame is not the last expression of the cell (it sits inside the with-block).
with pd.option_context("display.multi_sparse", False): 
        print(df1)

                      0         1         2         3
category type                                        
A        one   1.101429 -0.447184  0.657339 -0.924720
A        two  -2.031742 -0.662862  0.826294  0.279883
B        one   0.581264 -1.325073  1.828181 -0.127016
B        two   0.046914  0.241372  0.787327  1.079222


In [105]:
movies = pd.read_csv("../data/movies_clean.csv")
movies_s = movies[['title', 'original_title', 'year', 'genre', 'duration', 'country', 'language',
                                    'director', 'writer', 'description', 'avg_vote', 'budget', 'metascore']]

# To create a MultiIndex with our original DataFrame, we can pass a list of columns into the .set_index() fct.
movies_mi = movies_s.set_index(['genre', 'director', 'year', 'avg_vote']).sort_index() # sorts by index - outer to inner
# .reset_index() to remove the multiindex


In [106]:
movies_mi.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,title,original_title,duration,country,language,writer,description,budget,metascore
genre,director,year,avg_vote,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Action,"A. Mahadev, Gururaj M. Desai",2016,5.5,Jaguar,Jaguar,153,India,"Kannada, Telugu","Gururaj M. Desai, A. Mahadev","Krishna, a young medical student, strikes agai...",0,0.0
Action,A. Rajdeep,2020,3.1,Asura Guru,Asura Guru,120,India,Tamil,,"Shakti, who becomes obsessed with money from c...",0,0.0


In [107]:
movies_mi.index.names  # four columns now make up the index

FrozenList(['genre', 'director', 'year', 'avg_vote'])

#### get_level_values() 
- will return a vector of the labels for each location at a particular level:

In [108]:
# returns a vector of the labels for each location at a particular level
print(movies_mi.index.get_level_values(0))
movies_mi.index.get_level_values("director")

Index(['Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action',
       'Action', 'Action', 'Action',
       ...
       'Western, Comedy', 'Western, Comedy', 'Western, Comedy',
       'Western, Comedy', 'Western, Comedy, Drama', 'Western, Drama',
       'Western, Drama', 'Western, Drama', 'Western, Family',
       'Western, Horror'],
      dtype='object', name='genre', length=85855)


Index([   'A. Mahadev, Gururaj M. Desai',                      'A. Rajdeep',
                       'A.R. Murugadoss',        'A.S. Ravi Kumar Chowdary',
                            'Aash Aaron',                    'Ackyl Anwari',
            'Adam Collins, Luke Radford',                      'Adam Dasan',
                     'Adamo P. Cultraro',                 'Addison Randall',
       ...
             'Jack Arnold, Earl Bellamy',                    'Mario Caiano',
                       'Mario Siciliano', 'Vesa-Matti Loiri, Spede Pasanen',
                        'Edwin L. Marin',                  'Konrad Petzold',
                        'Richard Pearce',              'Werner W. Wallroth',
                             'Ron Kelly',                   'Fredric Hobbs'],
      dtype='object', name='director', length=85855)

#### select data - by a “partial” label identifying a subgroup in the data.

In [109]:
df2

Unnamed: 0_level_0,category,A,A,B,B,C,C,D,D
Unnamed: 0_level_1,type,one,two,one,two,one,two,one,two
bloodtype,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
X,men,-1.319026,-1.555636,-1.638529,1.222743,0.562824,1.24117,-1.327589,-0.97168
X,women,0.113257,0.33548,-1.151749,0.414416,-0.21489,-0.088367,-0.42618,-1.476091
X,divers,-1.584692,0.080635,-0.236892,-0.343871,-2.102431,-1.561049,0.459635,-1.393493
Y,men,0.345616,1.088824,-0.967043,0.491785,-2.310619,0.224751,2.509271,1.339698
Y,women,-0.452497,-0.13374,0.665746,1.411656,-1.442076,-0.717992,1.550954,-2.255222
Y,divers,0.351113,-0.520398,-1.068307,-1.775115,-0.740112,1.031474,1.013971,0.719363
Z,men,-0.195505,1.032582,-0.681419,1.191336,0.943443,-0.484341,-2.109193,1.3851
Z,women,-0.576821,0.221417,-2.184739,0.611495,0.524643,0.607175,-0.504465,-2.362965
Z,divers,0.204473,-1.519467,1.760388,0.305826,-0.692298,0.762941,-0.976918,0.049021


In [110]:
# columnwise from outer to inner label
df2["A", "one"]

bloodtype  sex   
X          men      -1.319026
           women     0.113257
           divers   -1.584692
Y          men       0.345616
           women    -0.452497
           divers    0.351113
Z          men      -0.195505
           women    -0.576821
           divers    0.204473
Name: (A, one), dtype: float64

In [111]:
# rowwise
df2.loc["X", "men"]

category  type
A         one    -1.319026
          two    -1.555636
B         one    -1.638529
          two     1.222743
C         one     0.562824
          two     1.241170
D         one    -1.327589
          two    -0.971680
Name: (X, men), dtype: float64

In [112]:
# cross-section: selecting data at a particular level of a MultiIndex
df2.xs("men", level="sex")  

category,A,A,B,B,C,C,D,D
type,one,two,one,two,one,two,one,two
bloodtype,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
X,-1.319026,-1.555636,-1.638529,1.222743,0.562824,1.24117,-1.327589,-0.97168
Y,0.345616,1.088824,-0.967043,0.491785,-2.310619,0.224751,2.509271,1.339698
Z,-0.195505,1.032582,-0.681419,1.191336,0.943443,-0.484341,-2.109193,1.3851


In [113]:
# pass drop_level=False to xs to retain the selected level visible
df2.xs("men", level="sex", drop_level=False)

Unnamed: 0_level_0,category,A,A,B,B,C,C,D,D
Unnamed: 0_level_1,type,one,two,one,two,one,two,one,two
bloodtype,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
X,men,-1.319026,-1.555636,-1.638529,1.222743,0.562824,1.24117,-1.327589,-0.97168
Y,men,0.345616,1.088824,-0.967043,0.491785,-2.310619,0.224751,2.509271,1.339698
Z,men,-0.195505,1.032582,-0.681419,1.191336,0.943443,-0.484341,-2.109193,1.3851


In [114]:
# xs also allows selection with multiple keys
df2.xs(("X", "men"), level=("bloodtype", "sex"))


Unnamed: 0_level_0,category,A,A,B,B,C,C,D,D
Unnamed: 0_level_1,type,one,two,one,two,one,two,one,two
bloodtype,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
X,men,-1.319026,-1.555636,-1.638529,1.222743,0.562824,1.24117,-1.327589,-0.97168


In [115]:
# show all levels
df2.columns.levels

FrozenList([['A', 'B', 'C', 'D'], ['one', 'two']])

In [116]:
# show specific level
df2.columns.get_level_values(0)

Index(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], dtype='object', name='category')

In [117]:
df2

Unnamed: 0_level_0,category,A,A,B,B,C,C,D,D
Unnamed: 0_level_1,type,one,two,one,two,one,two,one,two
bloodtype,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
X,men,-1.319026,-1.555636,-1.638529,1.222743,0.562824,1.24117,-1.327589,-0.97168
X,women,0.113257,0.33548,-1.151749,0.414416,-0.21489,-0.088367,-0.42618,-1.476091
X,divers,-1.584692,0.080635,-0.236892,-0.343871,-2.102431,-1.561049,0.459635,-1.393493
Y,men,0.345616,1.088824,-0.967043,0.491785,-2.310619,0.224751,2.509271,1.339698
Y,women,-0.452497,-0.13374,0.665746,1.411656,-1.442076,-0.717992,1.550954,-2.255222
Y,divers,0.351113,-0.520398,-1.068307,-1.775115,-0.740112,1.031474,1.013971,0.719363
Z,men,-0.195505,1.032582,-0.681419,1.191336,0.943443,-0.484341,-2.109193,1.3851
Z,women,-0.576821,0.221417,-2.184739,0.611495,0.524643,0.607175,-0.504465,-2.362965
Z,divers,0.204473,-1.519467,1.760388,0.305826,-0.692298,0.762941,-0.976918,0.049021


In [118]:
df2["A"].columns # show remaining levels

Index(['one', 'two'], dtype='object', name='type')

In [119]:
df2.loc["X"].columns

MultiIndex([('A', 'one'),
            ('A', 'two'),
            ('B', 'one'),
            ('B', 'two'),
            ('C', 'one'),
            ('C', 'two'),
            ('D', 'one'),
            ('D', 'two')],
           names=['category', 'type'])

In [120]:
# remove_unused_levels() rebuilds a MultiIndex so that its levels contain only values that are still used.
# NOTE(review): here it is called on the *columns*, whose levels (category / type) are all still in use,
# so the result equals df2.columns (see output). "X" and "men" are row-index labels and were never
# part of the column levels.
df2.xs(("X", "men"), level=("bloodtype", "sex")).columns.remove_unused_levels()

MultiIndex([('A', 'one'),
            ('A', 'two'),
            ('B', 'one'),
            ('B', 'two'),
            ('C', 'one'),
            ('C', 'two'),
            ('D', 'one'),
            ('D', 'two')],
           names=['category', 'type'])

### reindex

In [121]:
df2
df2.reindex(df2.index[::-1]) # rearrange index by some pattern
df2.reindex([("Y", "women"), ("Z", "women"), ("X", "women"), ("Y", "men"), ("Z", "men"), ("X", "men")]) # rearrange index by hand


Unnamed: 0_level_0,category,A,A,B,B,C,C,D,D
Unnamed: 0_level_1,type,one,two,one,two,one,two,one,two
bloodtype,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
X,men,-1.319026,-1.555636,-1.638529,1.222743,0.562824,1.24117,-1.327589,-0.97168
X,women,0.113257,0.33548,-1.151749,0.414416,-0.21489,-0.088367,-0.42618,-1.476091
X,divers,-1.584692,0.080635,-0.236892,-0.343871,-2.102431,-1.561049,0.459635,-1.393493
Y,men,0.345616,1.088824,-0.967043,0.491785,-2.310619,0.224751,2.509271,1.339698
Y,women,-0.452497,-0.13374,0.665746,1.411656,-1.442076,-0.717992,1.550954,-2.255222
Y,divers,0.351113,-0.520398,-1.068307,-1.775115,-0.740112,1.031474,1.013971,0.719363
Z,men,-0.195505,1.032582,-0.681419,1.191336,0.943443,-0.484341,-2.109193,1.3851
Z,women,-0.576821,0.221417,-2.184739,0.611495,0.524643,0.607175,-0.504465,-2.362965
Z,divers,0.204473,-1.519467,1.760388,0.305826,-0.692298,0.762941,-0.976918,0.049021


## selecting 

In [186]:
# Only the last expression of the cell is displayed; the others are shown for their syntax.
df2.T.loc[("A", "one")]  # transpose, then select a row by its full (category, type) key
df2.loc[("Y", "men"), ("A", "one")]  # specific row ( , ) and column ( , ) -> a single value
df2.A  # attribute access: all columns under the outer label "A"
df2.A.two  # ... narrowed to the inner label "two"
# NOTE: on a DataFrame a bare tuple passed to .loc is unpacked as (row indexer, column indexer),
# so this selects rows X and Z and columns A and B. To address several values *within* the row
# levels, spell out both axes, e.g. df2.loc[(["X", "Z"], ["men"]), :].
df2.loc[(["X", "Z"], ["A", "B"])]
df2
df2[("A", "one")]  # column selection by full key
df2.loc[[("X", "women"), ("Y", "men"), ("Z", "men")]]  # list of tuples - several complete MultiIndex keys
# Label-range slicing requires a lexsorted MultiIndex. df2 is built in (men, women, divers)
# order, so slicing it directly raises UnsortedIndexError; slice the sorted frame instead.
df2.sort_index().loc[("X", "women"):"Y"]  # range: from (X, women) through the end of Y

Unnamed: 0_level_0,category,A,A,B,B,C,C,D,D
Unnamed: 0_level_1,type,one,two,one,two,one,two,one,two
bloodtype,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
X,women,0.113257,0.33548,-1.151749,0.414416,-0.21489,-0.088367,-0.42618,-1.476091
Y,divers,0.351113,-0.520398,-1.068307,-1.775115,-0.740112,1.031474,1.013971,0.719363
Y,men,0.345616,1.088824,-0.967043,0.491785,-2.310619,0.224751,2.509271,1.339698
Y,women,-0.452497,-0.13374,0.665746,1.411656,-1.442076,-0.717992,1.550954,-2.255222


In [132]:
def mklbl(prefix, n):
    """Return the ``n`` labels ``<prefix>0`` ... ``<prefix><n-1>`` as a list of strings."""
    labels = []
    for position in range(n):
        labels.append(str(prefix) + str(position))
    return labels


# Row index: every combination of 4 "A", 2 "B", 4 "C" and 2 "D" labels (unnamed levels).
row_level_labels = [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)]
miindex = pd.MultiIndex.from_product(row_level_labels)

# Column index: four explicit (lvl0, lvl1) keys, deliberately not in sorted order.
column_keys = [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")]
micolumns = pd.MultiIndex.from_tuples(column_keys, names=["lvl0", "lvl1"])

# Fill the frame with a running counter, then sort both axes so that
# label-based slicing works on rows and columns alike.
n_rows, n_cols = len(miindex), len(micolumns)
counter_grid = np.arange(n_rows * n_cols).reshape((n_rows, n_cols))
dfmi = pd.DataFrame(counter_grid, index=miindex, columns=micolumns)
dfmi = dfmi.sort_index()
dfmi = dfmi.sort_index(axis=1)

dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [133]:
# MultiIndex slicing using slices, lists, and labels
dfmi.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78
A1,B0,C3,D0,89,88,91,90
A1,B0,C3,D1,93,92,95,94
A1,B1,C1,D0,105,104,107,106
A1,B1,C1,D1,109,108,111,110
A1,B1,C3,D0,121,120,123,122
A1,B1,C3,D1,125,124,127,126
A2,B0,C1,D0,137,136,139,138
A2,B0,C1,D1,141,140,143,142


In [134]:
# pd.IndexSlice allows the ":" slice syntax inside .loc for the levels of a MultiIndex.
# NOTE(review): this rebinds `idx`, which the from_frame cell earlier used for a MultiIndex.
idx = pd.IndexSlice
dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]] # first the row indexer, then the column indexer

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [135]:
mask = dfmi[("a", "foo")] >= 210

dfmi.loc[idx[mask, :, ["C1", "C3"]], idx[:, "foo"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,foo,foo
A3,B0,C3,D0,216,218
A3,B0,C3,D1,220,222
A3,B1,C1,D0,232,234
A3,B1,C1,D1,236,238
A3,B1,C3,D0,248,250
A3,B1,C3,D1,252,254


In [136]:
dfmi.loc(axis=0)[:, :, ["C1"]] # specify the axis argument to .loc to interpret the passed slicers on a single axis.

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B1,C1,D0,41,40,43,42
A0,B1,C1,D1,45,44,47,46
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78
A1,B1,C1,D0,105,104,107,106
A1,B1,C1,D1,109,108,111,110
A2,B0,C1,D0,137,136,139,138
A2,B0,C1,D1,141,140,143,142


In [137]:
#  set the values
dfmi2 = dfmi.copy()
dfmi2.loc(axis=0)[:, :, ["C1", "C3"]] = -10
dfmi2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,-10,-10,-10,-10
A0,B0,C1,D1,-10,-10,-10,-10
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,-10,-10,-10,-10
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,-10,-10,-10,-10


In [138]:
dfmi2.loc[idx[:, :, ["C1", "C3"]], :] = dfmi2 * 1000
dfmi2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,-10000,-10000,-10000,-10000
A0,B0,C1,D1,-10000,-10000,-10000,-10000
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,-10000,-10000,-10000,-10000
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,-10000,-10000,-10000,-10000


In [139]:
dfmi2.xs("C3", level=2)  # axis =0 

Unnamed: 0_level_0,Unnamed: 1_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,lvl1,bar,foo,bah,foo
A0,B0,D0,-10000,-10000,-10000,-10000
A0,B0,D1,-10000,-10000,-10000,-10000
A0,B1,D0,-10000,-10000,-10000,-10000
A0,B1,D1,-10000,-10000,-10000,-10000
A1,B0,D0,-10000,-10000,-10000,-10000
A1,B0,D1,-10000,-10000,-10000,-10000
A1,B1,D0,-10000,-10000,-10000,-10000
A1,B1,D1,-10000,-10000,-10000,-10000
A2,B0,D0,-10000,-10000,-10000,-10000
A2,B0,D1,-10000,-10000,-10000,-10000


In [140]:
dfmi.loc[:, (slice(None), "foo")]
dfmi.xs("foo", level="lvl1", axis=1, drop_level=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,foo,foo
A0,B0,C0,D0,0,2
A0,B0,C0,D1,4,6
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C2,D0,16,18
...,...,...,...,...,...
A3,B1,C1,D1,236,238
A3,B1,C2,D0,240,242
A3,B1,C2,D1,244,246
A3,B1,C3,D0,248,250


In [141]:
dfmi.xs(("a", "foo"), level=("lvl0", "lvl1"), axis=1)
pd.DataFrame(dfmi.loc[:, ("a", "foo")])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,a
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,foo
A0,B0,C0,D0,0
A0,B0,C0,D1,4
A0,B0,C1,D0,8
A0,B0,C1,D1,12
A0,B0,C2,D0,16
...,...,...,...,...
A3,B1,C1,D1,236
A3,B1,C2,D0,240
A3,B1,C2,D1,244
A3,B1,C3,D0,248


## reindexing and alignment

In [142]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [143]:
# Only the last expression (dfmi3) is displayed; the first two lines preview other groupings.
dfmi.groupby(level=2).max()  # max per label of the third row-index level
# Column-wise grouping: groupby(..., axis=1) is deprecated since pandas 2.1,
# so group the transposed frame and transpose the result back.
dfmi.T.groupby(level="lvl1").mean().T
dfmi3 = dfmi.groupby(level=0).mean()  # mean per outermost row label
dfmi3

lvl0,a,a,b,b
lvl1,bar,foo,bah,foo
A0,31.0,30.0,33.0,32.0
A1,95.0,94.0,97.0,96.0
A2,159.0,158.0,161.0,160.0
A3,223.0,222.0,225.0,224.0


In [144]:
# dfmi3 holds one row per outermost label. Reindexing it against dfmi's full index with level=0
# repeats each group's values on every original row that carries that outer label (see output).
# By default values in the new index that do not have corresponding records in the dataframe are assigned NaN.
dfmi3.reindex(dfmi.index, level=0) # old index, but summarized values

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,31.0,30.0,33.0,32.0
A0,B0,C0,D1,31.0,30.0,33.0,32.0
A0,B0,C1,D0,31.0,30.0,33.0,32.0
A0,B0,C1,D1,31.0,30.0,33.0,32.0
A0,B0,C2,D0,31.0,30.0,33.0,32.0
...,...,...,...,...,...,...,...
A3,B1,C1,D1,223.0,222.0,225.0,224.0
A3,B1,C2,D0,223.0,222.0,225.0,224.0
A3,B1,C2,D1,223.0,222.0,225.0,224.0
A3,B1,C3,D0,223.0,222.0,225.0,224.0


In [145]:
# Two small frames whose row labels and column labels only partly overlap.
df = pd.DataFrame(
    {"D": [1, 6], "B": [2, 7], "E": [3, 8], "A": [4, 9]},
    index=[1, 2],
)

other = pd.DataFrame(
    {
        "A": [10, 60, 600],
        "B": [20, 70, 700],
        "C": [30, 80, 800],
        "D": [40, 90, 900],
    },
    index=[2, 3, 4],
)

# Align on the columns only (axis=1), keeping the union of column labels;
# each frame keeps its own rows and gains NaN columns for labels it lacks.
left, right = df.align(other, axis=1, join="outer")
left

Unnamed: 0,A,B,C,D,E
1,4,2,,1,3
2,9,7,,6,8


In [146]:
dfmi3["a", "new"] = [0,0,0,0]
dfmi3.sort_index(axis=1)
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [147]:
# Align two objects on their axes with the specified join method.
left, right = dfmi3.align(dfmi, axis=1)
left
right

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,new,bah,foo
A0,B0,C0,D0,1,0,,3,2
A0,B0,C0,D1,5,4,,7,6
A0,B0,C1,D0,9,8,,11,10
A0,B0,C1,D1,13,12,,15,14
A0,B0,C2,D0,17,16,,19,18
...,...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,,239,238
A3,B1,C2,D0,241,240,,243,242
A3,B1,C2,D1,245,244,,247,246
A3,B1,C3,D0,249,248,,251,250


## swaplevel & reorder_levels


In [148]:
# The swaplevel() method can switch the order of two levels
dfmi[:5].swaplevel(0, 1, axis=0)

# permute the hierarchical index levels
dfmi[:5].reorder_levels([3, 1, 2, 0], axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
B0,A0,C0,D0,1,0,3,2
B0,A0,C0,D1,5,4,7,6
B0,A0,C1,D0,9,8,11,10
B0,A0,C1,D1,13,12,15,14
B0,A0,C2,D0,17,16,19,18


## rename index & columns

In [187]:
dfmi.rename(columns={"a": "X", "b": "Y"})
dfmi.rename(index={"A0": "AA", "A2": "AB", "A3":"AC"}, level=0) # along the index,  outermost level
dfmi.rename_axis(index=["A", "B", "C", "D"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lvl0,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lvl1,bar,foo,bah,foo
A,B,C,D,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


## take()
- pandas provides the take() method that retrieves elements along a given axis at the given indices. 
-  because the take method handles a narrower range of inputs, it can offer higher performance

In [202]:
positions = [6, 0, 6]
df2.index[positions]
df2.index.take(positions)
df2.take(positions)
df2.iloc[positions]
df2.take(positions, axis=1)

Unnamed: 0_level_0,category,D,A,D
Unnamed: 0_level_1,type,one,one,one
bloodtype,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
X,divers,0.459635,-1.584692,0.459635
X,men,-1.327589,-1.319026,-1.327589
X,women,-0.42618,0.113257,-0.42618
Y,divers,1.013971,0.351113,1.013971
Y,men,2.509271,0.345616,2.509271
Y,women,1.550954,-0.452497,1.550954
Z,divers,-0.976918,0.204473,-0.976918
Z,men,-2.109193,-0.195505,-2.109193
Z,women,-0.504465,-0.576821,-0.504465


### stack - unstack
- `.stack()` rotates the lowest level of the column `MultiIndex` to the row index 
- `.unstack()` works in the opposite direction: it rotates the innermost level of the row index into the columns.

In [218]:
df2.stack()
df2.unstack()
df2.unstack(0)
pd.DataFrame(df2.stack([0, 1]))
df2.stack(0)
df2


Unnamed: 0_level_0,category,A,A,B,B,C,C,D,D
Unnamed: 0_level_1,type,one,two,one,two,one,two,one,two
bloodtype,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
X,divers,-1.584692,0.080635,-0.236892,-0.343871,-2.102431,-1.561049,0.459635,-1.393493
X,men,-1.319026,-1.555636,-1.638529,1.222743,0.562824,1.24117,-1.327589,-0.97168
X,women,0.113257,0.33548,-1.151749,0.414416,-0.21489,-0.088367,-0.42618,-1.476091
Y,divers,0.351113,-0.520398,-1.068307,-1.775115,-0.740112,1.031474,1.013971,0.719363
Y,men,0.345616,1.088824,-0.967043,0.491785,-2.310619,0.224751,2.509271,1.339698
Y,women,-0.452497,-0.13374,0.665746,1.411656,-1.442076,-0.717992,1.550954,-2.255222
Z,divers,0.204473,-1.519467,1.760388,0.305826,-0.692298,0.762941,-0.976918,0.049021
Z,men,-0.195505,1.032582,-0.681419,1.191336,0.943443,-0.484341,-2.109193,1.3851
Z,women,-0.576821,0.221417,-2.184739,0.611495,0.524643,0.607175,-0.504465,-2.362965


# Pivot Tables
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot_table.html#pandas.DataFrame.pivot_table


In [4]:
df = pd.DataFrame({
    'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
    'baz': [1, 2, 3, 4, 5, 6],
    'zoo': ['x', 'y', 'z', 'q', 'w', 't']
})
df


Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w
5,two,C,6,t


In [5]:
df.pivot(index='foo', columns='bar', values='baz')

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [6]:
df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])

Unnamed: 0_level_0,baz,baz,baz,zoo,zoo,zoo
bar,A,B,C,A,B,C
foo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
one,1,2,3,x,y,z
two,4,5,6,q,w,t


In [8]:
df = pd.DataFrame({
    "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
    "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
    "C": [ "small", "large", "large", "small", "small", "large", "small", "small", "large" ],
    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]
})

df 


Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [32]:
# Sum D and E for every value of A (rows), split by the values of C (columns).
table = pd.pivot_table(
    df,
    values=["D", "E"],
    index=["A"],
    columns=["C"],
    # String alias instead of np.sum: same result, without the FutureWarning
    # that pandas >= 2.1 emits for NumPy reduction callables.
    aggfunc="sum",
    fill_value=0,  # fills NaN with 0
)

table

Unnamed: 0_level_0,D,D,E,E
C,large,small,large,small
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,11,11,15,17
foo,4,7,9,13


In [11]:
# Mean of D and E for every (A, C) combination; a dict picks the aggregation per value column.
table = pd.pivot_table(
    df,
    values=["D", "E"],
    index=["A", "C"],
    # String aliases instead of np.mean: same result, without the FutureWarning
    # that pandas >= 2.1 emits for NumPy reduction callables.
    aggfunc={"D": "mean", "E": "mean"},
)

table

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E
A,C,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,large,5.5,7.5
bar,small,5.5,8.5
foo,large,2.0,4.5
foo,small,2.333333,4.333333


In [37]:
# Several aggregations for one value column: a list in the aggfunc dict adds
# a second column level holding the aggregation name.
table = pd.pivot_table(
    df,
    values=["D", "E"],
    index=["A", "C"],
    # String aliases instead of np.mean / builtin min, max: same column names and
    # values, without the FutureWarning that pandas >= 2.1 emits for callables.
    aggfunc={
        "D": "mean",
        "E": ["min", "max", "mean"],
    },
)

table

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E,E,E
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,min
A,C,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,large,5.5,9,7.5,6
bar,small,5.5,9,8.5,8
foo,large,2.0,5,4.5,4
foo,small,2.333333,6,4.333333,2


# openpyxl
- export database info in a spreadsheet
- manipulate an existing spreadsheet

In [141]:
from openpyxl.workbook import Workbook
from openpyxl import load_workbook

# Create a workbook in memory, add sheets, write two cells and save it to disk.
# A workbook is always created with at least one worksheet.
wb = Workbook()
ws = wb.active  # selects the active sheet (the only one in a new workbook)
ws.title = 'MySheet'

ws1=wb.create_sheet('NewSheet')  # no position given: ends up after the existing sheet (see sheetnames output)
ws2=wb.create_sheet('AnotherSheet', 0) # position of worksheet: 0 puts it first

ws["A1"] = "hello" # insert info
ws["B1"] = "world!"
# NOTE(review): path is relative to the working directory; the movies cell reads from "../data/" - confirm which is intended.
wb.save('data/test_wb.xlsx')

In [57]:
print(wb.sheetnames)  # see all the sheets you have available
print(ws.title)
print(ws['a1'].value)
print(ws.cell(row=1,column=1).value)
print(ws.cell(1,1).value)
print(ws.cell(1,2).value)


['AnotherSheet', 'MySheet', 'NewSheet']
MySheet
hello
hello
hello
world!


In [153]:
cell_range = ws['A1':'b1']
[x.value for x in cell_range[0]]

['hello', 'world!']

# open external spreadsheet
arguments you can pass to load_workbook()
  1. **read_only** loads a spreadsheet in read-only mode allowing you to open very large Excel files.
  2.  **data_only** ignores loading formulas and instead loads only the resulting values.


In [4]:
wb2 = load_workbook('data/fao.xlsx') # load external file

In [31]:
fao = wb2.active
headers = fao[1]

[cell.value for cell in headers]

['Date', 'Food Price Index', 'Meat', 'Dairy', 'Cereals', 'Oils', 'Sugar']

In [137]:
for row in fao.iter_rows(min_row=1, max_row=14, min_col=1, max_col=7, values_only=True):
    print(row)

('Date', 'Food Price Index', 'Meat', 'Dairy', 'Cereals', 'Oils', 'Sugar')
(datetime.datetime(1990, 1, 1, 0, 0), 64.1, 73.4, 53.5, 64.1, 44.59, 87.9)
(datetime.datetime(1990, 2, 1, 0, 0), 64.5, 76, 52.2, 62.2, 44.5, 90.7)
(datetime.datetime(1990, 3, 1, 0, 0), 63.8, 77.8, 41.4, 61.3, 45.75, 95.1)
(datetime.datetime(1990, 4, 1, 0, 0), 65.8, 80.4, 48.4, 62.8, 44.02, 94.3)
(datetime.datetime(1990, 5, 1, 0, 0), 64.4, 81, 39.2, 62, 45.5, 90.4)
(datetime.datetime(1990, 6, 1, 0, 0), 63.7, 83.1, 39.2, 60.7, 43.8, 80.3)
(datetime.datetime(1990, 7, 1, 0, 0), 62.5, 83.4, 39.2, 57.9, 43.72, 74.2)
(datetime.datetime(1990, 8, 1, 0, 0), 61.5, 83.7, 36.8, 55.7, 45.37, 67.6)
(datetime.datetime(1990, 9, 1, 0, 0), 61, 84.5, 38.1, 52.5, 44.79, 68.5)
(datetime.datetime(1990, 10, 1, 0, 0), 61.1, 85.4, 38.9, 52.8, 45.72, 60.8)
(datetime.datetime(1990, 11, 1, 0, 0), 61.9, 86.2, 39.4, 52.4, 48.3, 62.3)
(datetime.datetime(1990, 12, 1, 0, 0), 61.9, 83.7, 45.1, 52.6, 49.6, 60.3)
(datetime.datetime(1991, 1, 1, 0, 0)

In [142]:
for col in fao.iter_cols(min_row=1, max_row=5, min_col=1, max_col=7, values_only=True):
    print(col)

('Date', datetime.datetime(1990, 1, 1, 0, 0), datetime.datetime(1990, 2, 1, 0, 0), datetime.datetime(1990, 3, 1, 0, 0), datetime.datetime(1990, 4, 1, 0, 0))
('Food Price Index', 64.1, 64.5, 63.8, 65.8)
('Meat', 73.4, 76, 77.8, 80.4)
('Dairy', 53.5, 52.2, 41.4, 48.4)
('Cereals', 64.1, 62.2, 61.3, 62.8)
('Oils', 44.59, 44.5, 45.75, 44.02)
('Sugar', 87.9, 90.7, 95.1, 94.3)


In [None]:
# go through all rows
for row in fao.rows:
    print(row)

In [None]:
# go through all columns
for col in fao.columns:
    print(col)

In [None]:
# whole column
for cell in fao['B']:
    print(cell.value)

In [97]:
# one row 
for date in fao[3]:
    print(date.value)

1990-02-01 00:00:00
64.5
76
52.2
62.2
44.5
90.7


In [134]:
# one row as a list
[date.value for date in fao[2]]

[datetime.datetime(1990, 1, 1, 0, 0), 64.1, 73.4, 53.5, 64.1, 44.59, 87.9]

In [140]:
import json
from openpyxl import load_workbook

# Read the FAO sheet and build {year: {"fpi", "meat", "sugar"}} for display as JSON.
workbook = load_workbook(filename='data/fao.xlsx')
sheet = workbook.active

food_price = {}

# Using the values_only because you want to return the cells' values
# min_row=2 skips the header row; columns 1-7 are Date ... Sugar.
for row in sheet.iter_rows(min_row=2, min_col=1, max_col=7, values_only=True):
    # NOTE(review): the key is the year only, but the sheet has one row per month, so every
    # row overwrites the previous row of the same year and only the last row per year survives
    # (the printed 1990 entry equals the 1990-12 row). Key on year and month, or aggregate,
    # if all months are wanted.
    date= row[0].year
    prices = {"fpi": row[1], "meat": row[2], "sugar": row[6]}
    food_price[date] = prices

# Using json here to be able to format the output for displaying later
# (json.dumps writes the integer year keys as strings)
print(json.dumps(food_price))

{"1990": {"fpi": 61.9, "meat": 83.7, "sugar": 60.3}, "1991": {"fpi": 64.1, "meat": 78.3, "sugar": 55.7}, "1992": {"fpi": 61, "meat": 72.8, "sugar": 50.4}, "1993": {"fpi": 64.3, "meat": 70.1, "sugar": 65.1}, "1994": {"fpi": 73.1, "meat": 78.2, "sugar": 90.8}, "1995": {"fpi": 78, "meat": 81.2, "sugar": 76.2}, "1996": {"fpi": 71.4, "meat": 81, "sugar": 66.5}, "1997": {"fpi": 68, "meat": 69.5, "sugar": 76.3}, "1998": {"fpi": 62.1, "meat": 63.3, "sugar": 50}, "1999": {"fpi": 51.9, "meat": 58.9, "sugar": 37.2}, "2000": {"fpi": 54.4, "meat": 59.3, "sugar": 62}, "2001": {"fpi": 54.3, "meat": 58.2, "sugar": 48.5}, "2002": {"fpi": 55.6, "meat": 51.1, "sugar": 49.2}, "2003": {"fpi": 62.4, "meat": 61.2, "sugar": 38.9}, "2004": {"fpi": 65.6, "meat": 68.6, "sugar": 51}, "2005": {"fpi": 69.5, "meat": 73.6, "sugar": 82.6}, "2006": {"fpi": 78.6, "meat": 70.6, "sugar": 71.6}, "2007": {"fpi": 114.4, "meat": 81, "sugar": 66.3}, "2008": {"fpi": 86, "meat": 80.8, "sugar": 72.7}, "2009": {"fpi": 100.7, "meat