In [2]:
import pandas as pd
import numpy as np

In [13]:
# Load rock.csv into a DataFrame, then report its shape, the dtype of each
# column, and the first five rows (one item per line group).
df = pd.read_csv("../../assets/datasets/rock.csv")
print(df.shape, df.dtypes, df.head(), sep="\n")

(2230, 8)
Song Clean      object
ARTIST CLEAN    object
Release Year    object
COMBINED        object
First?           int64
Year?            int64
PlayCount        int64
F*G              int64
dtype: object
               Song Clean ARTIST CLEAN Release Year  \
0        Caught Up in You  .38 Special         1982   
1            Fantasy Girl  .38 Special          NaN   
2         Hold On Loosely  .38 Special         1981   
3  Rockin' Into the Night  .38 Special         1980   
4       Art For Arts Sake         10cc         1975   

                                COMBINED  First?  Year?  PlayCount  F*G  
0        Caught Up in You by .38 Special       1      1         82   82  
1            Fantasy Girl by .38 Special       1      0          3    0  
2         Hold On Loosely by .38 Special       1      1         85   85  
3  Rockin' Into the Night by .38 Special       1      1         18   18  
4              Art For Arts Sake by 10cc       1      1          1    1  


In [7]:
# The simplest pivot table needs a DataFrame and an index; here the index is
# ARTIST CLEAN. With no aggfunc given, pivot_table averages every value
# column within each artist.
#
# The numeric columns are named explicitly in `values`: a mean cannot be
# computed for the text columns (Song Clean, Release Year, COMBINED). Older
# pandas dropped such columns silently, but pandas >= 2.0 raises a TypeError
# instead, so leaving `values` out breaks this cell on a current install.
pd.pivot_table(
    df,
    index="ARTIST CLEAN",
    values=["F*G", "First?", "PlayCount", "Year?"],
)

Unnamed: 0_level_0,F*G,First?,PlayCount,Year?
ARTIST CLEAN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
.38 Special,46.250000,1.0,47.000000,0.750000
10cc,1.000000,1.0,1.000000,1.000000
3 Doors Down,6.666667,1.0,6.666667,1.000000
4 Non Blondes,3.000000,1.0,3.000000,1.000000
AC/DC,26.275862,1.0,29.862069,0.689655
Ace,1.000000,1.0,1.000000,1.000000
Adelitas Way,4.000000,1.0,4.000000,1.000000
Aerosmith,23.645161,1.0,26.225806,0.806452
Alanis Morissette,3.500000,1.0,3.500000,1.000000
Alannah Myles,1.000000,1.0,1.000000,1.000000


In [8]:
# Index on several keys at once: rows are grouped by ARTIST CLEAN first and
# by Release Year within each artist, and each value column is averaged.
#
# As the remaining text columns (Song Clean, COMBINED) cannot be averaged,
# the numeric columns are listed explicitly in `values`; pandas >= 2.0 raises
# a TypeError rather than silently dropping non-numeric columns.
pd.pivot_table(
    df,
    index=["ARTIST CLEAN", "Release Year"],
    values=["F*G", "First?", "PlayCount", "Year?"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,F*G,First?,PlayCount,Year?
ARTIST CLEAN,Release Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
.38 Special,1980,18.000000,1.0,18.000000,1.0
.38 Special,1981,85.000000,1.0,85.000000,1.0
.38 Special,1982,82.000000,1.0,82.000000,1.0
10cc,1975,1.000000,1.0,1.000000,1.0
3 Doors Down,2000,7.000000,1.0,7.000000,1.0
3 Doors Down,2002,6.000000,1.0,6.000000,1.0
4 Non Blondes,1992,3.000000,1.0,3.000000,1.0
AC/DC,1975,52.500000,1.0,52.500000,1.0
AC/DC,1976,85.000000,1.0,85.000000,1.0
AC/DC,1977,3.000000,1.0,3.000000,1.0


In [9]:
# PlayCount is the measure we actually care about; the F*G, First? and Year?
# columns add nothing to this view. Naming PlayCount in the `values` field
# restricts the table to that single column (still averaged by default).
df.pivot_table(
    index=["ARTIST CLEAN", "Release Year"],
    values=["PlayCount"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,PlayCount
ARTIST CLEAN,Release Year,Unnamed: 2_level_1
.38 Special,1980,18.000000
.38 Special,1981,85.000000
.38 Special,1982,82.000000
10cc,1975,1.000000
3 Doors Down,2000,7.000000
3 Doors Down,2002,6.000000
4 Non Blondes,1992,3.000000
AC/DC,1975,52.500000
AC/DC,1976,85.000000
AC/DC,1977,3.000000


In [15]:
# By default pivot_table averages the PlayCount values in each group.
# Pass `aggfunc` to aggregate differently, e.g. "sum" for totals or "count"
# for the number of rows. The string alias is used instead of np.sum because
# passing NumPy callables as aggfunc is deprecated in pandas >= 2.1 and
# emits a FutureWarning.
pd.pivot_table(
    df,
    index=["ARTIST CLEAN", "Release Year"],
    values=["PlayCount"],
    aggfunc="sum",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,PlayCount
ARTIST CLEAN,Release Year,Unnamed: 2_level_1
.38 Special,1980,18
.38 Special,1981,85
.38 Special,1982,82
10cc,1975,1
3 Doors Down,2000,14
3 Doors Down,2002,6
4 Non Blondes,1992,3
AC/DC,1975,105
AC/DC,1976,85
AC/DC,1977,6
