### Intro to the MultiIndex Module

In [1]:
import pandas as pd

In [27]:
## MultiIndex: Multiple Layers/Levels of Index
bigmac = pd.read_csv('bigmac.csv', parse_dates = ['Date'])
bigmac.head(3)

Unnamed: 0,Date,Country,Price in US Dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35


In [9]:
bigmac.dtypes
bigmac.info()  ## There is no null values here

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 3 columns):
Date                   652 non-null datetime64[ns]
Country                652 non-null object
Price in US Dollars    652 non-null float64
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 15.4+ KB


### Create a MultiIndex with the set_index() Method

In [28]:
bigmac.set_index(keys = 'Country').head(3)   ## Single Index Layer

Unnamed: 0_level_0,Date,Price in US Dollars
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentina,2016-01-01,2.39
Australia,2016-01-01,3.74
Brazil,2016-01-01,3.35


In [29]:
bigmac.set_index(keys = ['Date', 'Country'], inplace = True)  ## MultiLayers, specify that in a Python List
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35


In [17]:
bigmac.set_index(keys = ['Country', 'Date']).head(3)    ## reverse the order, not as good as the previous order
## Usually, the former layer should have less unique values than the next layer

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Country,Date,Unnamed: 2_level_1
Argentina,2016-01-01,2.39
Australia,2016-01-01,3.74
Brazil,2016-01-01,3.35


In [31]:
bigmac.sort_index(inplace = True)    ## Sort the multiIndex in ascending order by date and then by country
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [36]:
bigmac.index   ## Index list consists of two lists
bigmac.index.names

FrozenList(['Date', 'Country'])

In [37]:
type(bigmac.index)

pandas.core.indexes.multi.MultiIndex

In [44]:
bigmac.index[0]   ## gives me the very first row value of this multiIndex -- tuple (outlayer, innerlayer)
## we use this to extract row values, but bigmac[bigmac.index[0]] does not work, we need to use .get_level_values()

(Timestamp('2010-01-01 00:00:00'), 'Argentina')

### The .get_level_values() Method

In [58]:
## .get_index_values() function is called on the multiIndex of a dataframe to extract one of the levels more specifically
## by the values of the specific layer
## Another way to set multiple index instead of using .set_index() function
bigmac = pd.read_csv('bigmac.csv', parse_dates = ['Date'], index_col = ['Date', 'Country'])
bigmac.sort_index(inplace = True)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [51]:
bigmac.index  ## This is a pandas multiIndex object which also has the functions or attributes to apply on
bigmac.index.get_level_values(0)   ## The argument of .get_level_values() can be number of name of the multiIndex
bigmac.index.get_level_values(1)  ## This is not deduped which including all of the values within the country layer

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)

In [53]:
bigmac.index.get_level_values('Date')    ## We can also put name to this function
bigmac.index.get_level_values('Country')

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

### The .set_names() Method

In [57]:
## The .set_name() function is called on the index attribute of our dataframe(multiIndex)
## Change the name of the indexes
bigmac.index.set_names(['Day', 'Location'], inplace = True)  ## provide a full list of names(including new/old)
bigmac.head(3)
## Another way to specific rename an index

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Day,Location,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


### The .sort_index() Mehod on a MultiIndex DataFrame

In [60]:
bigmac.sort_index(ascending = [True, False], inplace = True)
## or we can use the level= argument to target the levels we want to order/sor
## .sort_index() function sort occurs on every layer, if we want to sort layers differently, we provide a boolean list
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


### Extract Rows from a MultiIndex DataFrame

In [65]:
bigmac.loc[bigmac.index.get_level_values(1)=='China']   ## My stupid approach!

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,China,1.83
2010-07-01,China,1.95
2011-07-01,China,2.27
2012-01-01,China,2.44
2012-07-01,China,2.45
2013-01-01,China,2.57
2013-07-01,China,2.61
2014-01-01,China,2.74
2014-07-01,China,2.73
2015-01-01,China,2.77


In [70]:
## use the tuple instead of python list in the .loc[],  due to pandas design, list here does not work
## bigmac.loc[['2011-07-01]]
bigmac.loc[('2011-07-01', 'China')]  ## returns an array/series
## bigmac.loc[('2011-07-01', 'China'), 'Vertical Label we want to extract']
bigmac.loc[('2015-07-01', 'Chile'), 'Price in US Dollars']

Date        Country
2015-07-01  Chile      3.27
Name: Price in US Dollars, dtype: float64

In [69]:
bigmac.loc[('2011-07-01', 'China'), 'Price in US Dollars']

Date        Country
2011-07-01  China      2.27
Name: Price in US Dollars, dtype: float64

In [72]:
bigmac.ix[('2016-01-01')].head(3)  # It will cut out the outlayer, different from .loc[] function

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


Unnamed: 0_level_0,Price in US Dollars
Country,Unnamed: 1_level_1
Argentina,2.39
Australia,3.74
Austria,3.76


In [74]:
bigmac.ix[('2016-01-01', 'China'), 'Price in US Dollars']
## bigmac.ix[('2016-01-01', 'China'), 0]   alternative way

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


2.68

### The .transpose() Method and MultiIndex on Column Level

In [78]:
## MultiIndex can on vertical axis among columns
bigmac = bigmac.transpose()  
## Now we have multiIdex columns, .transpose() function does not affect the original dataframe, we need to reassign
bigmac.head(1)

Date,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,...,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01
Country,Argentina,Australia,Brazil,Britain,Canada,Chile,China,Colombia,Costa Rica,Czech Republic,...,Switzerland,Taiwan,Thailand,Turkey,UAE,Ukraine,United States,Uruguay,Venezuela,Vietnam
Price in US Dollars,1.84,3.98,4.76,3.67,3.97,3.18,1.83,3.91,3.52,3.71,...,6.44,2.08,3.09,3.41,3.54,1.54,4.93,3.74,0.66,2.67


In [82]:
## Extract from every layer of my column based MultiIndex
bigmac.ix['Price in US Dollars', ('2016-01-01', 'Denmark') ]
## More: we can have Multi-Indexes on both columns and indexs(rows), .loc[(), ()] -- provide two tuples in ,loc[]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


4.32

### The .swaplevel( ) Method

In [3]:
## .swaplevel() fnction swap the levels within MultiIndex
bigmac = pd.read_csv('bigmac.csv', index_col = ['Date', 'Country'], parse_dates = ['Date'])
bigmac.sort_index(inplace = True)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [6]:
## Look at how to swap levels when you have more than 2 index levels, .swaplevel() does not have inplace = prameter
bigmac = bigmac.swaplevel()
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Country,Date,Unnamed: 2_level_1
Argentina,2010-01-01,1.84
Australia,2010-01-01,3.98
Brazil,2010-01-01,4.76


### The .stack() Method

In [48]:
## .stack() function take the columns index and moves that index to the main index the one on the left(the horizontal one)
## the usecols =  and index_col = can not be used at the same time
## Create a multiIndex DataFrame
world = pd.read_csv('worldstats.csv', index_col = ['country', 'year'])
world.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015,392022276.0,2530102000000.0
Arab World,2014,384222592.0,2873600000000.0
Arab World,2013,376504253.0,2846994000000.0


In [50]:
## .stack() basically takes the last(default) column based index and moves it to be on the row based index
## this function double the size of the data from a row perspective
world.stack().head(4)  ## Now it becomes one dimension(previous two dimensions) pandas series

country     year            
Arab World  2015  Population    3.920223e+08
                  GDP           2.530102e+12
            2014  Population    3.842226e+08
                  GDP           2.873600e+12
dtype: float64

In [51]:
type(world.stack())

pandas.core.series.Series

In [67]:
## .to_frame() converts a pandas series to a dataframe which will keep the multiIndex 
world.stack().to_frame().head(3)  ### the column name can be changed by using rename

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
country,year,Stat,Unnamed: 3_level_1
Arab World,2015,Population,392022300.0
Arab World,2015,GDP,2530102000000.0
Arab World,2014,Population,384222600.0


### The .unstack() Method, Part 1

In [68]:
## .unstack() function taks an outer layer of a multiIndex and moves it to serve as columns.
world  = pd.read_csv('worldstats.csv', index_col = ['country', 'year'])
world.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015,392022276.0,2530102000000.0
Arab World,2014,384222592.0,2873600000000.0
Arab World,2013,376504253.0,2846994000000.0


In [71]:
s = world.stack()  ## Returns a three-layer multiIndex series
s.head(3)

country     year            
Arab World  2015  Population    3.920223e+08
                  GDP           2.530102e+12
            2014  Population    3.842226e+08
dtype: float64

In [73]:
s.unstack().head(3)  ## The .unstack() function always works on the most inner layer first

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1960,8994793.0,537777800.0
Afghanistan,1961,9164945.0,548888900.0
Afghanistan,1962,9343772.0,546666700.0


In [75]:
s.unstack().unstack().head(3)  ## Now returns a multiIndex columns structure

Unnamed: 0_level_0,Population,Population,Population,Population,Population,Population,Population,Population,Population,Population,...,GDP,GDP,GDP,GDP,GDP,GDP,GDP,GDP,GDP,GDP
year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Afghanistan,8994793.0,9164945.0,9343772.0,9531555.0,9728645.0,9935358.0,10148841.0,10368600.0,10599790.0,10849510.0,...,7057598000.0,9843842000.0,10190530000.0,12486940000.0,15936800000.0,17930240000.0,20536540000.0,20046330000.0,20050190000.0,19199440000.0
Albania,,,,,,,,,,,...,8992642000.0,10701010000.0,12881350000.0,12044210000.0,11926950000.0,12890870000.0,12319780000.0,12781030000.0,13277960000.0,11455600000.0
Algeria,11124892.0,11404859.0,11690152.0,11985130.0,12295973.0,12626953.0,12980269.0,13354197.0,13744383.0,14144437.0,...,117027300000.0,134977100000.0,171000700000.0,137211000000.0,161207300000.0,200013100000.0,209047400000.0,209703500000.0,213518500000.0,166838600000.0


In [77]:
cols_s = s.unstack().unstack().unstack()
cols_s.head(3)

            year  country    
Population  1960  Afghanistan     8994793.0
                  Albania               NaN
                  Algeria        11124892.0
dtype: float64

In [78]:
type(cols_s)  ## .stack() does not work on series, .unstack() can work on series

pandas.core.series.Series

### The .unstack() Method, Part 2

In [84]:
## .unstack() function can be provided the index that we want to move from our row multiIndex, eg: country is layer 0
s.unstack(2).head(3)   ## in our example is the same by s.unstack()  default

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1960,8994793.0,537777800.0
Afghanistan,1961,9164945.0,548888900.0
Afghanistan,1962,9343772.0,546666700.0


In [87]:
s.unstack(level = 0).head(3)

Unnamed: 0_level_0,country,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,Armenia,Aruba,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1960,Population,8994793.0,,11124890.0,,,,,,,,...,,,8146845.0,,32000.0,,3035056000.0,,3049586.0,3752390.0
1960,GDP,537777800.0,,2723638000.0,,,,,,,,...,,,8607600000.0,,24200000.0,,1364643000000.0,,698739700.0,1052990000.0
1961,Population,9164945.0,,11404860.0,,,,,,,,...,,,8461684.0,,34100.0,,3076121000.0,,3142848.0,3876638.0


In [89]:
s.unstack(level=-1).head(3) ## select the backwards last one to be unstacked in the column layer, same as s.unstack(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1960,8994793.0,537777800.0
Afghanistan,1961,9164945.0,548888900.0
Afghanistan,1962,9343772.0,546666700.0


In [91]:
s.unstack(-2)  ## is the same as s.unstack(1)
s.unstack(level='year').head(3)   ## is the same as s.unstack(1)
## In order to maintain the data composition in terms of its column names, pandas will give NaN value if does not exist

Unnamed: 0_level_0,year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,Population,8994793.0,9164945.0,9343772.0,9531555.0,9728645.0,9935358.0,10148840.0,10368600.0,10599790.0,10849510.0,...,25183620.0,25877540.0,26528740.0,27207290.0,27962210.0,28809170.0,29726800.0,30682500.0,31627510.0,32526560.0
Afghanistan,GDP,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,1400000000.0,1673333000.0,1373333000.0,1408889000.0,...,7057598000.0,9843842000.0,10190530000.0,12486940000.0,15936800000.0,17930240000.0,20536540000.0,20046330000.0,20050190000.0,19199440000.0
Albania,Population,,,,,,,,,,,...,2992547.0,2970017.0,2947314.0,2927519.0,2913021.0,2904780.0,2900247.0,2896652.0,2893654.0,2889167.0


### The .unstack() Method, Part 3

In [95]:
## unstack multiple layers, migrating more than one of the layers in a multiIndex to the columns axis
## We cna provide a list in the .unstack() function which include the index positions of the layers/names
s.unstack(level = ['year', 'country'])  ## for columns axis, the outer layer will be year first and then country
## The sames as s.unstack(level = [1, 0])

year,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,...,1969,1968,1967,1966,1965,1964,1963,1962,1961,1960
country,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,...,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe
Population,392022300.0,384222600.0,376504300.0,368802600.0,361031800.0,353112200.0,345054200.0,336886500.0,328766600.0,320906700.0,...,5036321.0,4874113.0,4718612.0,4568320.0,4422132.0,4279561.0,4140804.0,4006262.0,3876638.0,3752390.0
GDP,2530102000000.0,2873600000000.0,2846994000000.0,2773270000000.0,2497945000000.0,2103825000000.0,1798878000000.0,2081343000000.0,1641666000000.0,1404190000000.0,...,1747999000.0,1479600000.0,1397002000.0,1281750000.0,1311436000.0,1217138000.0,1159512000.0,1117602000.0,1096647000.0,1052990000.0


In [97]:
s.unstack(level = ['country', 'year'])
## s.unstack(level = [0, 1])

country,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,...,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe
year,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,...,1969,1968,1967,1966,1965,1964,1963,1962,1961,1960
Population,392022300.0,384222600.0,376504300.0,368802600.0,361031800.0,353112200.0,345054200.0,336886500.0,328766600.0,320906700.0,...,5036321.0,4874113.0,4718612.0,4568320.0,4422132.0,4279561.0,4140804.0,4006262.0,3876638.0,3752390.0
GDP,2530102000000.0,2873600000000.0,2846994000000.0,2773270000000.0,2497945000000.0,2103825000000.0,1798878000000.0,2081343000000.0,1641666000000.0,1404190000000.0,...,1747999000.0,1479600000.0,1397002000.0,1281750000.0,1311436000.0,1217138000.0,1159512000.0,1117602000.0,1096647000.0,1052990000.0


In [98]:
s.unstack(level = [0, 'year'])  ## This combination also works

country,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,...,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe
year,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,...,1969,1968,1967,1966,1965,1964,1963,1962,1961,1960
Population,392022300.0,384222600.0,376504300.0,368802600.0,361031800.0,353112200.0,345054200.0,336886500.0,328766600.0,320906700.0,...,5036321.0,4874113.0,4718612.0,4568320.0,4422132.0,4279561.0,4140804.0,4006262.0,3876638.0,3752390.0
GDP,2530102000000.0,2873600000000.0,2846994000000.0,2773270000000.0,2497945000000.0,2103825000000.0,1798878000000.0,2081343000000.0,1641666000000.0,1404190000000.0,...,1747999000.0,1479600000.0,1397002000.0,1281750000.0,1311436000.0,1217138000.0,1159512000.0,1117602000.0,1096647000.0,1052990000.0


In [105]:
## Fill in the inexist values, .stack() and .unstack() function do not have inplace =  parameter, we need to reassign to overwrite
ss = s.unstack('year', fill_value = 0)
ss.head(3)

Unnamed: 0_level_0,year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,Population,8994793.0,9164945.0,9343772.0,9531555.0,9728645.0,9935358.0,10148840.0,10368600.0,10599790.0,10849510.0,...,25183620.0,25877540.0,26528740.0,27207290.0,27962210.0,28809170.0,29726800.0,30682500.0,31627510.0,32526560.0
Afghanistan,GDP,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,1400000000.0,1673333000.0,1373333000.0,1408889000.0,...,7057598000.0,9843842000.0,10190530000.0,12486940000.0,15936800000.0,17930240000.0,20536540000.0,20046330000.0,20050190000.0,19199440000.0
Albania,Population,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2992547.0,2970017.0,2947314.0,2927519.0,2913021.0,2904780.0,2900247.0,2896652.0,2893654.0,2889167.0


### The .pivot() Method

In [113]:
## .pivot() function is used to reorient a dataset, take the values in a column to the column headers
sales = pd.read_csv('salesmen.csv', parse_dates = ['Date'])
sales['Salesman'] = sales['Salesman'].astype('category')
sales.head(3)

Unnamed: 0,Date,Salesman,Revenue
0,2016-01-01,Bob,7172
1,2016-01-02,Bob,6362
2,2016-01-03,Bob,5982


In [111]:
len(sales)

1830

In [112]:
sales['Salesman'].value_counts()   ## five salesman

Ronald    366
Oscar     366
Jeb       366
Bob       366
Dave      366
Name: Salesman, dtype: int64

In [120]:
## arguments/parameters in the .pivot() function:
## 1. (index = ) this asks the thing we want to put on the left, eg: date in sales
## 2. (columns = ) asks what columns from the original dataframe would like to use as the values that will comprise the new columns
## 3. (values = )  asks the values which is the intersect of date and salesman, eg: revenue
sales.pivot(index = 'Date', columns = 'Salesman', values = 'Revenue').head(3)

Salesman,Bob,Dave,Jeb,Oscar,Ronald
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01,7172,1864,4430,5250,2639
2016-01-02,6362,8278,8026,8661,4951
2016-01-03,5982,4226,5188,7075,2703


In [129]:
sales.groupby('Salesman')['Revenue'].sum()  ## An example not for this module

Salesman
Bob       1827179
Dave      1859063
Jeb       1918418
Oscar     1777779
Ronald    1827112
Name: Revenue, dtype: int64

### The .pivot_table() Method

In [130]:
## .pivot_table() function is for taking a dataframe and aggregating it to look at the values(groupings, sum, mean)
foods = pd.read_csv('foods.csv')
foods.head(3)

Unnamed: 0,First Name,Gender,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15.66
1,Eric,Male,Stamford,Daily,Chalupa,10.56
2,Charles,Male,New York,Never,Sushi,42.14


In [139]:
## parameters in .pivot_table() function: 1. values = put the things we want to aggregate here
## 2. index = will be variable we want to group by, we will have an output that have number for each group
## 3. aggfunc = 'mean' is the default
## 4. columns = add new columns in the output
foods.pivot_table(values = 'Spend', index = 'Gender', aggfunc = 'mean')
foods.pivot_table(values = 'Spend', index = 'Gender', aggfunc = 'sum')

Unnamed: 0_level_0,Spend
Gender,Unnamed: 1_level_1
Female,25963.33
Male,24106.04


In [133]:
foods.pivot_table(values = 'Spend', index = 'Item', aggfunc = 'sum')

Unnamed: 0_level_0,Spend
Item,Unnamed: 1_level_1
Burger,7765.73
Burrito,8270.44
Chalupa,7644.52
Donut,8758.76
Ice Cream,8886.99
Sushi,8742.93


In [137]:
## We can also provide a list to the index parameter, then it will create a multiIndex series for us
foods.pivot_table(values = 'Spend', index = ['Gender', 'Item'], aggfunc = 'mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,Spend
Gender,Item,Unnamed: 2_level_1
Female,Burger,49.930488
Female,Burrito,50.092
Female,Chalupa,54.635
Female,Donut,49.926316
Female,Ice Cream,49.788519
Female,Sushi,50.355699
Male,Burger,49.613919
Male,Burrito,48.344819
Male,Chalupa,49.186761
Male,Donut,43.649565


In [140]:
## We can also create additional columns if we want to separate the data(not add more layer, put it to the columns)
foods.pivot_table(values = 'Spend', index = 'Item', columns = 'City', aggfunc = 'sum')

City,New York,Philadelphia,Stamford
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Burger,2533.13,2577.42,2655.18
Burrito,2378.35,2771.69,3120.4
Chalupa,2104.35,2787.56,2752.61
Donut,2792.05,2888.62,3078.09
Ice Cream,3125.25,3670.49,2091.25
Sushi,2876.44,3138.76,2727.73


In [141]:
foods.pivot_table(values = 'Spend', index = 'Item', columns = ['Frequency', 'City'], aggfunc = 'sum')

Frequency,Daily,Daily,Daily,Monthly,Monthly,Monthly,Never,Never,Never,Often,...,Once,Seldom,Seldom,Seldom,Weekly,Weekly,Weekly,Yearly,Yearly,Yearly
City,New York,Philadelphia,Stamford,New York,Philadelphia,Stamford,New York,Philadelphia,Stamford,New York,...,Stamford,New York,Philadelphia,Stamford,New York,Philadelphia,Stamford,New York,Philadelphia,Stamford
Item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Burger,582.13,344.38,342.38,359.15,428.19,265.65,188.21,236.28,90.97,150.43,...,421.69,288.84,597.11,257.28,254.04,131.89,465.23,308.91,320.53,532.1
Burrito,460.66,487.33,471.86,270.28,123.86,187.54,276.51,760.62,718.18,293.52,...,198.16,369.94,225.82,244.33,296.43,352.07,392.76,291.96,235.26,435.62
Chalupa,97.28,298.3,288.34,492.13,380.66,505.74,234.24,364.95,295.06,291.25,...,40.59,171.69,405.26,349.27,334.72,220.09,249.06,325.56,573.25,464.35
Donut,708.1,438.11,253.77,468.26,309.54,348.83,275.7,487.23,376.97,270.43,...,298.13,107.06,282.26,267.97,360.0,324.6,751.33,206.05,219.84,241.0
Ice Cream,440.41,326.38,108.05,244.73,461.06,200.21,263.69,477.97,197.43,579.81,...,462.04,512.77,341.54,158.56,451.48,506.52,364.45,311.37,399.8,111.56
Sushi,306.64,629.46,393.27,524.02,234.3,201.42,233.34,436.35,224.46,353.25,...,509.4,248.19,405.5,251.97,561.04,594.33,653.52,351.85,179.65,411.45


In [144]:
foods.pivot_table(values = 'Spend', index = ['Gender', 'Item'], columns = 'City', aggfunc = 'count')  ## gives the number of records in that group
foods.pivot_table(values = 'Spend', index = ['Gender', 'Item'], columns = 'City', aggfunc = 'max')
foods.pivot_table(values = 'Spend', index = ['Gender', 'Item'], columns = 'City', aggfunc = 'min')


Unnamed: 0_level_0,City,New York,Philadelphia,Stamford
Gender,Item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,Burger,2.25,1.97,6.24
Female,Burrito,1.02,1.04,1.18
Female,Chalupa,1.96,9.35,9.09
Female,Donut,3.15,2.13,1.68
Female,Ice Cream,13.39,7.61,8.8
Female,Sushi,2.52,11.68,8.2
Male,Burger,5.43,1.71,2.83
Male,Burrito,15.9,8.58,3.64
Male,Chalupa,11.61,1.94,10.56
Male,Donut,1.49,1.26,6.63


In [None]:
## Another way to provide the summary pivot table: directly in pandas
pd.pivot_table(data = df, index = , columns = , values =, aggfunc = 'mean')

In [146]:
## call the .pivot_table() function directly on the dataframe or directly on pd library and then provide the dataframe will have same results
pd.pivot_table(data = foods, values = 'Spend', index = ['Gender', 'Item'], columns = 'City', aggfunc = 'count').head(3)

Unnamed: 0_level_0,City,New York,Philadelphia,Stamford
Gender,Item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,Burger,24,31,27
Female,Burrito,23,28,34
Female,Chalupa,19,32,25


### The pd.melt() Method

In [147]:
## pd.melt() function is reverse operation to a .pivot_table() and takes an aggregated dataset then converts it to tabular format
sales = pd.read_csv('quarters.csv')
sales.head(3)  ## This is a pivot table

Unnamed: 0,Salesman,Q1,Q2,Q3,Q4
0,Boris,602908,233879,354479,32704
1,Bob,43790,514863,297151,544493
2,Tommy,392668,113579,430882,247231


In [155]:
## Now we want the tabular format of sales and four size of original datasets
## parameters: 1. frame = dataset  2. id_vars =  put the columns that we want to maintain
## 3. var_name = new generated column header  4. value_name = 'value' is default
pd.melt(frame = sales, id_vars = 'Salesman', var_name = 'Quarter', value_name = 'Revenue' ).sort_values(by='Salesman').head()

Unnamed: 0,Salesman,Quarter,Revenue
1,Bob,Q1,43790
19,Bob,Q3,297151
10,Bob,Q2,514863
28,Bob,Q4,544493
0,Boris,Q1,602908


In [156]:
tf = pd.melt(frame = sales, id_vars = 'Salesman', var_name = 'Quarter', value_name = 'Revenue' ).sort_values(by='Salesman')

In [157]:
tf.pivot_table(index = 'Salesman', values = 'Revenue', aggfunc = 'sum')

Unnamed: 0_level_0,Revenue
Salesman,Unnamed: 1_level_1
Bob,1400297
Boris,1223970
Donald,1754276
Jeb,2234605
Morgan,2203511
Stacy,1995681
Ted,1424783
Tommy,1184360
Travis,2421210
