## Сводные таблицы

In [8]:
import numpy as np
import pandas as pd
import seaborn as sns

In [9]:
# Load seaborn's 'titanic' example dataset; every later cell reads from this frame.
titanic = sns.load_dataset('titanic')

In [10]:
# Show the loaded table as the cell output (the repr elides the middle rows).
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


### Сводные таблицы "вручную"

In [11]:
# Total of 'survived' per sex. The double brackets select a list of columns,
# so the result is a one-column DataFrame.
titanic.groupby('sex')[['survived']].sum()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,233
male,109


In [12]:
# Single brackets select one column, so the grouped sum comes back as a
# Series named 'survived' rather than a DataFrame.
titanic.groupby('sex')['survived'].sum()

sex
female    233
male      109
Name: survived, dtype: int64

In [13]:
# Mean of 'survived' for every (sex, class) pair; the result is a Series
# indexed by the two grouping keys.
titanic.groupby(['sex', 'class'])['survived'].mean()

sex     class 
female  First     0.968085
        Second    0.921053
        Third     0.500000
male    First     0.368852
        Second    0.157407
        Third     0.135447
Name: survived, dtype: float64

In [15]:
# Grouped mean of 'survived', then move the inner index level ('class')
# into the columns to get a sex-by-class table.
(
    titanic
    .groupby(['sex', 'class'])['survived']
    .mean()
    .unstack()
)

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


### Синтаксис сводных таблиц

In [16]:
# pivot_table builds the sex-by-class table of 'survived' in a single call;
# with no aggfunc given, the default (mean) aggregation is used.
titanic.pivot_table(values='survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [17]:
# titanic.pivot_table?
# values : column to aggregate, optional
# index : column, Grouper, array, or list of the previous

#     If an array is passed, it must be the same length as the data. The
#     list can contain any of the other types (except list).
#     Keys to group by on the pivot table index.  If an array is passed,
#     it is being used as the same manner as column values.

# columns : column, Grouper, array, or list of the previous

#     If an array is passed, it must be the same length as the data. The
#     list can contain any of the other types (except list).
#     Keys to group by on the pivot table column.  If an array is passed,
#     it is being used as the same manner as column values.

# aggfunc : function, list of functions, dict, default numpy.mean

#     If list of functions passed, the resulting pivot table will have
#     hierarchical columns whose top level are the function names
#     (inferred from the function objects themselves)
#     If dict is passed, the key is column to aggregate and value
#     is function or list of functions.

# fill_value : scalar, default None

#     Value to replace missing values with (in the resulting pivot table,
#     after aggregation).

# margins : bool, default False

#     Add all row / columns (e.g. for subtotal / grand totals).

# dropna : bool, default True

#     Do not include columns whose entries are all NaN.

# margins_name : str, default 'All'

#     Name of the row / column that will contain the totals
#     when margins is True.

# observed : bool, default False

#     This only applies if any of the groupers are Categoricals.
#     If True: only show observed values for categorical groupers.
#     If False: show all values for categorical groupers.

#     .. versionchanged:: 0.25.0

# sort : bool, default True

#     Specifies if the result should be sorted.

In [18]:
# Same sex-by-class layout, but summing 'survived' in each cell instead of
# averaging it.
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    aggfunc='sum',
)

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,91,70,72
male,45,17,47


#### Многоуровневые сводные таблицы
    возраст в качестве третьего измерения

In [21]:
# Bin ages into two right-closed intervals, (0, 18] and (18, 80];
# rows with a missing age stay NaN.
age = pd.cut(titanic['age'], bins=[0, 18, 80])
age

0      (18.0, 80.0]
1      (18.0, 80.0]
2      (18.0, 80.0]
3      (18.0, 80.0]
4      (18.0, 80.0]
           ...     
886    (18.0, 80.0]
887    (18.0, 80.0]
888             NaN
889    (18.0, 80.0]
890    (18.0, 80.0]
Name: age, Length: 891, dtype: category
Categories (2, interval[int64, right]): [(0, 18] < (18, 80]]

In [22]:
# Use the binned `age` Series as a second row-index level next to 'sex'.
titanic.pivot_table(values='survived', index=['sex', age], columns='class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


In [23]:
# Uncomment to show the pd.cut docstring (IPython `?` help syntax):
# pd.cut?

In [24]:
# Here 'age' is the column name, not the binned Series, so the table gets
# one row per (sex, distinct age value) pair.
titanic.pivot_table(values='survived', index=['sex', 'age'], columns='class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.75,,,1.00
female,1.00,,,1.00
female,2.00,0.0,1.0,0.25
female,3.00,,1.0,0.00
female,4.00,,1.0,1.00
...,...,...,...,...
male,70.00,0.0,0.0,
male,70.50,,,0.00
male,71.00,0.0,,
male,74.00,,,0.00


In [25]:
# qcut computes the bin edges from quantiles automatically; q=2 yields two bins.
fare = pd.qcut(titanic['fare'], q=2)
fare

0       (-0.001, 14.454]
1      (14.454, 512.329]
2       (-0.001, 14.454]
3      (14.454, 512.329]
4       (-0.001, 14.454]
             ...        
886     (-0.001, 14.454]
887    (14.454, 512.329]
888    (14.454, 512.329]
889    (14.454, 512.329]
890     (-0.001, 14.454]
Name: fare, Length: 891, dtype: category
Categories (2, interval[float64, right]): [(-0.001, 14.454] < (14.454, 512.329]]

In [26]:
# Four-way breakdown: (sex, age bin) on the rows, (fare bin, class) on the columns.
titanic.pivot_table(
    values='survived',
    index=['sex', age],
    columns=[fare, 'class'],
)

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",,1.0,0.714286,0.909091,1.0,0.318182
female,"(18, 80]",,0.88,0.444444,0.972973,0.914286,0.391304
male,"(0, 18]",,0.0,0.26087,0.8,0.818182,0.178571
male,"(18, 80]",0.0,0.098039,0.125,0.391304,0.030303,0.192308


#### Дополнительные параметры сводных таблиц

In [27]:
# `values` is omitted: with a dict aggfunc, the dict keys choose the columns
# to aggregate.
# The string 'sum' is used instead of the builtin `sum`: pandas 2.1+ emits a
# FutureWarning for builtin callables and will stop mapping them to its own
# sum implementation.
titanic.pivot_table(
    index='sex',
    columns='class',
    aggfunc={'survived': 'sum', 'fare': 'mean'},
)

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


In [28]:
# margins=True appends an 'All' row and an 'All' column that hold the
# aggregate (here the default mean) over each whole column / row.
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    margins=True,
)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


In [29]:
# Sum of 'survived' per cell, with 'All' totals from margins=True.
# The string 'sum' is used instead of the builtin `sum`: pandas 2.1+ emits a
# FutureWarning for builtin callables and will stop mapping them to its own
# sum implementation.
titanic.pivot_table(
    'survived',
    index='sex',
    columns='class',
    margins=True,
    aggfunc='sum',
)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,91,70,72,233
male,45,17,47,109
All,136,87,119,342
