In [1]:
import numpy as np
import pandas as pd

Сегодня мы углубимся в объект Index библиотеки Pandas и посмотрим на несколько ситуаций, когда могут возникнуть сложности. А также узнаем, что такое мультииндекс, и как это поможет написать тебе НИР

# 1. Данные с разными индексами

In [137]:
cock = pd.Series({'Max': 7, 
                  'George': 10,
                  'Ann': 4}, 
                     name='cocktails')
time = pd.Series({'Ann': 1, 
                  'Polina': 3,
                  'Ujin': 2,
                  'Max': 2.5},
                     name='hours')

In [138]:
cock / time

Ann       4.0
George    NaN
Max       2.8
Polina    NaN
Ujin      NaN
dtype: float64

In [139]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [140]:
# нам пригодился add вместо +
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [141]:
A = pd.DataFrame(np.random.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A

Unnamed: 0,A,B
0,19,6
1,2,10


In [142]:
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)),
                 columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,4,4,1
1,1,7,0
2,0,7,3


In [143]:
A + B

Unnamed: 0,A,B,C
0,23.0,10.0,
1,9.0,11.0,
2,,,


| Python Operator | Pandas Method(s)                      |
|-----------------|---------------------------------------|
| ``+``           | ``add()``                             |
| ``-``           | ``sub()``, ``subtract()``             |
| ``*``           | ``mul()``, ``multiply()``             |
| ``/``           | ``truediv()``, ``div()``, ``divide()``|
| ``//``          | ``floordiv()``                        |
| ``%``           | ``mod()``                             |
| ``**``          | ``pow()``                             |

Мы рассмотрим, что такое `NaN` и как с ним бороться в следующей тетрадке, а пока что продолжим говорить об индексах

# 2. Мультииндексы (обрабатываем панельные данные)

In [144]:
index = [('Max', 2017), ('Max', 2019),
         ('George', 2017), ('George', 2019),
         ('Ann', 2017), ('Ann', 2019)]
cocktails = [12,7,
             8,10,
             3,4]
cock = pd.Series(cocktails, index=index)
cock

(Max, 2017)       12
(Max, 2019)        7
(George, 2017)     8
(George, 2019)    10
(Ann, 2017)        3
(Ann, 2019)        4
dtype: int64

In [145]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['Ann', 'George', 'Max'], [2017, 2019]],
           labels=[[2, 2, 1, 1, 0, 0], [0, 1, 0, 1, 0, 1]])

In [146]:
cock = cock.reindex(index)
cock

Max     2017    12
        2019     7
George  2017     8
        2019    10
Ann     2017     3
        2019     4
dtype: int64

In [147]:
cock[:, 2017]

Max       12
George     8
Ann        3
dtype: int64

## 2.1 Открываем еще одно измерение

In [148]:
cock_df = cock.unstack(level=0)
cock_df

Unnamed: 0,Ann,George,Max
2017,3,8,12
2019,4,10,7


In [149]:
cock_df = cock.unstack(level=1)
cock_df

Unnamed: 0,2017,2019
Ann,3,4
George,8,10
Max,12,7


In [150]:
cock_df.stack()

Ann     2017     3
        2019     4
George  2017     8
        2019    10
Max     2017    12
        2019     7
dtype: int64

In [151]:
cock_ch = cock.reset_index()
cock_ch

Unnamed: 0,level_0,level_1,0
0,Max,2017,12
1,Max,2019,7
2,George,2017,8
3,George,2019,10
4,Ann,2017,3
5,Ann,2019,4


In [152]:
cock_ch.set_index(['level_0', 'level_1'])

Unnamed: 0_level_0,Unnamed: 1_level_0,0
level_0,level_1,Unnamed: 2_level_1
Max,2017,12
Max,2019,7
George,2017,8
George,2019,10
Ann,2017,3
Ann,2019,4


## 2.2 Создаем мультииндексы

In [153]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.943083,0.469156
a,2,0.456168,0.602809
b,1,0.517886,0.552703
b,2,0.186224,0.485996


#### Из массива

In [154]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

#### Из кортежей

In [155]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

#### Из декартового произведения

In [156]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

#### Подбирая вручную метки

In [157]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

## 2.3 Задаем названия

In [158]:
cock.index.names = ['name', 'year']
cock

name    year
Max     2017    12
        2019     7
George  2017     8
        2019    10
Ann     2017     3
        2019     4
dtype: int64

## 2.4 Мультииндексы для столбцов

In [159]:
index = pd.MultiIndex.from_product([[2017,2019], [1, 2]],
                                   names=['year', 'day'])
columns = pd.MultiIndex.from_product([['Max','George','Polina'], ['Strong', 'Light']],
                                     names=['name', 'cocktail'])

data = abs(np.round(np.random.normal(0,5,(4,6))))

huge_data = pd.DataFrame(data, index=index, columns=columns)
huge_data

Unnamed: 0_level_0,name,Max,Max,George,George,Polina,Polina
Unnamed: 0_level_1,cocktail,Strong,Light,Strong,Light,Strong,Light
year,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2017,1,2.0,5.0,1.0,5.0,3.0,4.0
2017,2,2.0,1.0,1.0,2.0,2.0,7.0
2019,1,0.0,0.0,3.0,4.0,2.0,2.0
2019,2,2.0,1.0,3.0,4.0,1.0,8.0


In [160]:
huge_data['Polina']

Unnamed: 0_level_0,cocktail,Strong,Light
year,day,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,1,3.0,4.0
2017,2,2.0,7.0
2019,1,2.0,2.0
2019,2,1.0,8.0


## 2.5 Индексирование с мультииндексом

In [161]:
cock

name    year
Max     2017    12
        2019     7
George  2017     8
        2019    10
Ann     2017     3
        2019     4
dtype: int64

In [162]:
cock['Ann', 2019]

4

In [163]:
cock['Max']

year
2017    12
2019     7
dtype: int64

In [164]:
cock[:, 2017]

name
Max       12
George     8
Ann        3
dtype: int64

In [165]:
cock[cock > 5]

name    year
Max     2017    12
        2019     7
George  2017     8
        2019    10
dtype: int64

In [166]:
cock[['George', 'Ann']]

name    year
George  2017     8
        2019    10
Ann     2017     3
        2019     4
dtype: int64

In [167]:
huge_data

Unnamed: 0_level_0,name,Max,Max,George,George,Polina,Polina
Unnamed: 0_level_1,cocktail,Strong,Light,Strong,Light,Strong,Light
year,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2017,1,2.0,5.0,1.0,5.0,3.0,4.0
2017,2,2.0,1.0,1.0,2.0,2.0,7.0
2019,1,0.0,0.0,3.0,4.0,2.0,2.0
2019,2,2.0,1.0,3.0,4.0,1.0,8.0


In [170]:
huge_data['Polina', 'Strong']

year  day
2017  1      3.0
      2      2.0
2019  1      2.0
      2      1.0
Name: (Polina, Strong), dtype: float64

In [171]:
huge_data.iloc[:4, :2]

Unnamed: 0_level_0,name,Max,Max
Unnamed: 0_level_1,cocktail,Strong,Light
year,day,Unnamed: 2_level_2,Unnamed: 3_level_2
2017,1,2.0,5.0
2017,2,2.0,1.0
2019,1,0.0,0.0
2019,2,2.0,1.0


In [172]:
huge_data.loc[:, (('Max','Polina'), 'Strong')]

Unnamed: 0_level_0,name,Max,Polina
Unnamed: 0_level_1,cocktail,Strong,Strong
year,day,Unnamed: 2_level_2,Unnamed: 3_level_2
2017,1,2.0,3.0
2017,2,2.0,2.0
2019,1,0.0,2.0
2019,2,2.0,1.0


In [173]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'Light']]

Unnamed: 0_level_0,name,Max,George,Polina
Unnamed: 0_level_1,cocktail,Light,Light,Light
year,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2017,1,1.0,8.0,2.0
2019,1,11.0,4.0,1.0


## 2.6 Агрегирующие функции для мультииндексов

In [174]:
huge_data

Unnamed: 0_level_0,name,Max,Max,George,George,Polina,Polina
Unnamed: 0_level_1,cocktail,Strong,Light,Strong,Light,Strong,Light
year,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2017,1,2.0,5.0,1.0,5.0,3.0,4.0
2017,2,2.0,1.0,1.0,2.0,2.0,7.0
2019,1,0.0,0.0,3.0,4.0,2.0,2.0
2019,2,2.0,1.0,3.0,4.0,1.0,8.0


In [175]:
# вычисляем среднее число коктейлей для каждого года в независимости от дня в году 
data_mean = huge_data.mean(level='year')
data_mean

name,Max,Max,George,George,Polina,Polina
cocktail,Strong,Light,Strong,Light,Strong,Light
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2017,2.0,3.0,1.0,3.5,2.5,5.5
2019,1.0,0.5,3.0,4.0,1.5,5.0


In [179]:
# среднее по столбцам
data_mean.mean(axis=1, level='name')

name,Max,George,Polina
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,2.5,2.25,4.0
2019,0.75,3.5,3.25
