# Pivot tables

- A pivot table is the multi-dimensional version of groupby
- It means splitting and combining happen across a two-dimensional grid

In [1]:
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns

# Load the 'titanic' example dataset through seaborn; the later cells all work on this frame.
titanic = sns.load_dataset('titanic')
# Bare last expression -> rich display; pandas elides the middle rows of the frame.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# Groupby

In [5]:
# Survival rate by sex: group the rows on 'sex', then average the 0/1
# 'survived' column within each group.

titanic.groupby('sex')['survived'].mean()
# Equivalent spelling through the generic aggregation entry point:
# titanic.groupby('sex')['survived'].agg('mean')

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [6]:
# Group on two keys at once; the result is a Series indexed by (sex, class).
titanic.groupby(['sex', 'class'])['survived'].agg('mean')

sex     class 
female  First     0.968085
        Second    0.921053
        Third     0.500000
male    First     0.368852
        Second    0.157407
        Third     0.135447
Name: survived, dtype: float64

In [64]:
# unstack() moves the inner index level ('class') into the columns, turning the
# two-level Series into a sex x class table.
titanic.groupby(['sex', 'class'])['survived'].agg('mean').unstack()

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [69]:
# pivot_table builds the same sex x class table as the groupby/unstack chain above.
# No aggfunc is passed, so the default ('mean') is used.

titanic.pivot_table('survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [71]:
# margins=True appends an 'All' row and column holding the aggregate over each
# whole row/column, e.g. 0.742038 is the mean survival of all female passengers.
# The aggregation is named by string rather than by NumPy callable: the result is
# the same, and newer pandas releases warn when np.mean is passed here.
titanic.pivot_table('survived', index='sex', columns='class', aggfunc='mean', margins=True)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


In [72]:
# Same table with counts instead of rates: summing the 0/1 'survived' flag gives
# the number of survivors per cell, with totals in the 'All' margins.
# 'sum' is passed as a string; newer pandas releases warn when np.sum is passed here.
titanic.pivot_table('survived', index='sex', columns='class', aggfunc='sum', margins=True)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,91,70,72,233
male,45,17,47,109
All,136,87,119,342


In [73]:
# margins_name replaces the default 'All' label; the one label is used for both
# the margin row and the margin column.
# NOTE(review): the margins here are mean survival over every passenger in that
# row/column, so a label that mentions only "female" is misleading - consider a
# neutral label.
titanic.pivot_table('survived', index='sex', columns='class', margins=True, margins_name='Survived total female')

class,First,Second,Third,Survived total female
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
Survived total female,0.62963,0.472826,0.242363,0.383838


# Multi-level pivot tables

In [6]:
# Add 'age' as a third dimension by binning it first.

# We want two groups: (0, 18] and (18, 80].
# In the output, a row such as "0      (18.0, 80.0]" means the value at index 0
# falls in the 18-80 bin; a missing age stays NaN.
age = pd.cut(titanic['age'], [0, 18, 80])
age

0      (18.0, 80.0]
1      (18.0, 80.0]
2      (18.0, 80.0]
3      (18.0, 80.0]
4      (18.0, 80.0]
           ...     
886    (18.0, 80.0]
887    (18.0, 80.0]
888             NaN
889    (18.0, 80.0]
890    (18.0, 80.0]
Name: age, Length: 891, dtype: category
Categories (2, interval[int64, right]): [(0, 18] < (18, 80]]

In [15]:
# The binned `age` Series is passed directly as a second row key next to 'sex',
# giving a two-level row index.
titanic.pivot_table('survived', index=['sex', age], columns='class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


In [19]:
# cut bins by value ranges; qcut bins by quantiles (e.g. quartiles).
# Asking qcut for 2 bins splits the fares into a lower and an upper half.
fare = pd.qcut(titanic['fare'], 2)
fare

0       (-0.001, 14.454]
1      (14.454, 512.329]
2       (-0.001, 14.454]
3      (14.454, 512.329]
4       (-0.001, 14.454]
             ...        
886     (-0.001, 14.454]
887    (14.454, 512.329]
888    (14.454, 512.329]
889    (14.454, 512.329]
890     (-0.001, 14.454]
Name: fare, Length: 891, dtype: category
Categories (2, interval[float64, right]): [(-0.001, 14.454] < (14.454, 512.329]]

In [21]:
# Columns can be multi-level too: the binned `fare` Series becomes the outer column
# level and 'class' the inner one. Cells with no data are left empty (NaN).
titanic.pivot_table('survived', index=['sex', age], columns=[fare, 'class'])

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",,1.0,0.714286,0.909091,1.0,0.318182
female,"(18, 80]",,0.88,0.444444,0.972973,0.914286,0.391304
male,"(0, 18]",,0.0,0.26087,0.8,0.818182,0.178571
male,"(18, 80]",0.0,0.098039,0.125,0.391304,0.030303,0.192308


In [22]:
# Evaluating the bare function object displays its signature, including the
# defaults (aggfunc='mean', margins=False, margins_name='All', ...).
pd.pivot_table

<function pandas.core.reshape.pivot.pivot_table(data: 'DataFrame', values=None, index=None, columns=None, aggfunc: 'AggFuncType' = 'mean', fill_value=None, margins: 'bool' = False, dropna: 'bool' = True, margins_name: 'str' = 'All', observed: 'bool' = False, sort: 'bool' = True) -> 'DataFrame'>

In [9]:
# aggfunc may be a dict mapping column name -> aggregation. The keys must be
# existing columns of the frame ('survived' and 'fare' here); they are not
# free-form labels for the output. No `values` argument is given; the output
# contains exactly the columns named in the dict.
titanic.pivot_table(index='sex', columns='class', aggfunc={'survived': 'sum', 'fare': 'mean'})

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


# Vectorized string operations

In [18]:
# Plain-Python baseline: capitalize each name one element at a time.
names = ['peter', 'Paul', 'MARY', 'GuiDO']
list(map(str.capitalize, names))

['Peter', 'Paul', 'Mary', 'Guido']

In [19]:
# The plain-Python approach breaks as soon as a value is missing.
# The list is rebuilt (rather than appended to) so that re-running this cell
# does not keep adding extra None entries.
names = ['peter', 'Paul', 'MARY', 'GuiDO', None]
print(names)
# The failure is the point of this cell, so catch and show it instead of letting
# the exception abort a "Restart & Run All".
try:
    [name.capitalize() for name in names]
except AttributeError as err:
    print(f"AttributeError: {err}")

['peter', 'Paul', 'MARY', 'GuiDO', None]


AttributeError: 'NoneType' object has no attribute 'capitalize'

In [20]:
# The pandas .str accessor, by contrast, passes missing values through instead of failing.
# Wrapping the list in a Series makes the vectorized string methods available.

names = pd.Series(names)
names

0    peter
1     Paul
2     MARY
3    GuiDO
4     None
dtype: object

In [21]:
# Vectorized capitalize: every string is capitalized and the missing entry stays None.
names.str.capitalize()

0    Peter
1     Paul
2     Mary
3    Guido
4     None
dtype: object

# Pandas string methods

In [22]:
# Small example Series of full names used by the string-method cells that follow.
people = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam'])
people

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
dtype: object

In [23]:
# Element-wise lower-casing; the result is a new Series of strings.
people.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
dtype: object

In [24]:
# Element-wise string length (the space counts); the result is an integer Series.
people.str.len()

0    14
1    11
2    13
dtype: int64

# Time series

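- Python's built-in `datetime` module and the third-party `dateutil` package provide flexible, easy-to-use date/time objects and parsing.
- NumPy's `datetime64` dtype provides efficient storage and a vectorized interface.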
- Pandas can construct a DatetimeIndex that can be used to index data in a series or dataframe.
- pd.to_datetime()

In [25]:
# pd.to_datetime parses a free-form date string into a pandas Timestamp.
date = pd.to_datetime('4th July, 2015')
date

Timestamp('2015-07-04 00:00:00')

In [26]:
# strftime formats the Timestamp according to a format code; '%A' is the full weekday name.
date.strftime('%A')

'Saturday'

In [27]:
# Adding a TimedeltaIndex to a single Timestamp broadcasts the addition:
# np.arange(12) is 0..11, and the 'D' unit turns those into offsets of 0-11 days,
# so the result is a DatetimeIndex of 12 consecutive days starting at `date`
# (a sequence of timestamps, not a Period).

date + pd.to_timedelta(np.arange(12), 'D')

DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
               '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
               '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
              dtype='datetime64[ns]', freq=None)

# Pandas time series: indexing by time

In [28]:
# Four integers labelled by timestamps rather than by position.
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04', '2015-07-04', '2015-08-04'])
date = pd.Series(range(4), index=index)
date

2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64

In [42]:
# Slicing by label.
# Note: unlike positional (integer) slicing, a label slice includes its end point.
date['2014-07-04': '2015-07-04']

2014-07-04    0
2014-08-04    1
2015-07-04    2
dtype: int64

In [43]:
# Partial-string indexing: passing just a year selects every entry in that year.
date['2015']

2015-07-04    2
2015-08-04    3
dtype: int64

- `Timestamp` can be built from a variety of inputs such as strings, floats, and ints
- `Timedelta` is simply the difference between two points in time
- `Period` represents a specific span of time between a start and an end timestamp; consecutive periods do not overlap

In [44]:
# to_datetime accepts a mix of datetime objects and differently formatted strings
# and returns a single DatetimeIndex. (`datetime` is imported in the notebook's
# top import cell.)
# NOTE(review): '07-07-2015' has the same day and month, so it does not reveal
# whether month-first or day-first parsing is applied - confirm before reusing
# this pattern on ambiguous dates.
dates = pd.to_datetime([datetime(2015, 7, 3), '4th July, 2015', '2015-jul-4', '07-07-2015', '20150708'])
dates

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-04', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)

In [45]:
# Difference in days between each date and the first one.
dates - dates[0]

TimedeltaIndex(['0 days', '1 days', '1 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)

# Regular sequence: pd.date_range()

- pd.date_range() is for timestamp and takes a start date, end date and an optional frequency code
- pd.period_range() is for period
- pd.timedelta_range() is for time deltas

In [47]:
# No freq is given, so the default daily frequency applies; both end points are included.
pd.date_range('2015-07-03', '2015-07-10')

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [49]:
# With periods= instead of an end date: 8 timestamps spaced one hour apart.
# NOTE(review): recent pandas releases (2.2+) deprecate the upper-case 'H' alias
# in favour of 'h' - switch when the environment is upgraded.
pd.date_range('2015-07-03', periods=8, freq='H')

DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00',
               '2015-07-03 02:00:00', '2015-07-03 03:00:00',
               '2015-07-03 04:00:00', '2015-07-03 05:00:00',
               '2015-07-03 06:00:00', '2015-07-03 07:00:00'],
              dtype='datetime64[ns]', freq='H')