In [2]:
import numpy as np
import pandas as pd

## Индексы Pandas

Если размерность данных > 2, то используется иерахическая индексация (мультииндекс). В один индекс включается несколько уровней

In [3]:
# город и год
index = [
    ('city_1', 2010),
    ('city_1', 2020),
    ('city_2', 2010),
    ('city_2', 2020),
    ('city_3', 2010),
    ('city_3', 2020)
]

population = [
    101,
    201,
    102,
    202,
    103,
    203,
]

pop = pd.Series(population, index=index)

print(pop[[i for i in pop.index if i[1] == 2020]]) # Кустарный способ

(city_1, 2020)    201
(city_2, 2020)    202
(city_3, 2020)    203
dtype: int64


Необходимо использовать готовое решение - мультииндекс

In [4]:
index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)
print(pop) # В выводе видна иерархия !!!

print(pop[:, 2020])

#Конвертация мультииндексного Series в DataFrame
pop_df = pop.unstack()
print(pop_df)

#Обратная конвертация
print(pop_df.stack())

city_1  2010    101
        2020    201
city_2  2010    102
        2020    202
city_3  2010    103
        2020    203
dtype: int64
city_1    201
city_2    202
city_3    203
dtype: int64
        2010  2020
city_1   101   201
city_2   102   202
city_3   103   203
city_1  2010    101
        2020    201
city_2  2010    102
        2020    202
city_3  2010    103
        2020    203
dtype: int64


In [5]:
index = [
    ('city_1', 2010, 1),
    ('city_1', 2010, 2),
    ('city_1', 2020, 1),
    ('city_1', 2020, 2),
    ('city_2', 2010, 1),
    ('city_2', 2010, 2),
    ('city_2', 2020, 1),
    ('city_2', 2020, 2),
    ('city_3', 2010, 1),
    ('city_3', 2010, 2),
    ('city_3', 2020, 1),
    ('city_3', 2020, 2)
]
population = [
    101,
    1010,
    201,
    2010,
    102,
    1020,
    202,
    2020,
    103,
    1030,
    203,
    2030
]

pop = pd.Series(population, index=index)
print(pop)

index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)
print(pop)

print(pop[:, 2010])
print(pop[:, :, 2])

print(pop.unstack()) # Преобразование в DataFrame по последнему индексу

(city_1, 2010, 1)     101
(city_1, 2010, 2)    1010
(city_1, 2020, 1)     201
(city_1, 2020, 2)    2010
(city_2, 2010, 1)     102
(city_2, 2010, 2)    1020
(city_2, 2020, 1)     202
(city_2, 2020, 2)    2020
(city_3, 2010, 1)     103
(city_3, 2010, 2)    1030
(city_3, 2020, 1)     203
(city_3, 2020, 2)    2030
dtype: int64
city_1  2010  1     101
              2    1010
        2020  1     201
              2    2010
city_2  2010  1     102
              2    1020
        2020  1     202
              2    2020
city_3  2010  1     103
              2    1030
        2020  1     203
              2    2030
dtype: int64
city_1  1     101
        2    1010
city_2  1     102
        2    1020
city_3  1     103
        2    1030
dtype: int64
city_1  2010    1010
        2020    2010
city_2  2010    1020
        2020    2020
city_3  2010    1030
        2020    2030
dtype: int64
               1     2
city_1 2010  101  1010
       2020  201  2010
city_2 2010  102  1020
       2020  202  2020

Детализация

In [8]:
# Можно ввести детализацию
index = [
    ('city_1', 2010, 1),
    ('city_1', 2010, 2),
    ('city_1', 2020, 1),
    ('city_1', 2020, 2),
    ('city_2', 2010, 1),
    ('city_2', 2010, 2),
    ('city_2', 2020, 1),
    ('city_2', 2020, 2),
    ('city_3', 2010, 1),
    ('city_3', 2010, 2),
    ('city_3', 2020, 1),
    ('city_3', 2020, 2)
]
population = [
    101,
    1010,
    201,
    2010,
    102,
    1020,
    202,
    2020,
    103,
    1030,
    203,
    2030
]
pop = pd.Series(population, index=index)
pop_df = pd.DataFrame(
    {
        'total': pop,
        'something':[
            10,
            11,
            12,
            13,
            14,
            15,
            16,
            17,
            18,
            19,
            20,
            21
        ]
    }
)
print(pop_df)
print(pop_df['something'])

                   total  something
(city_1, 2010, 1)    101         10
(city_1, 2010, 2)   1010         11
(city_1, 2020, 1)    201         12
(city_1, 2020, 2)   2010         13
(city_2, 2010, 1)    102         14
(city_2, 2010, 2)   1020         15
(city_2, 2020, 1)    202         16
(city_2, 2020, 2)   2020         17
(city_3, 2010, 1)    103         18
(city_3, 2010, 2)   1030         19
(city_3, 2020, 1)    203         20
(city_3, 2020, 2)   2030         21
(city_1, 2010, 1)    10
(city_1, 2010, 2)    11
(city_1, 2020, 1)    12
(city_1, 2020, 2)    13
(city_2, 2010, 1)    14
(city_2, 2010, 2)    15
(city_2, 2020, 1)    16
(city_2, 2020, 2)    17
(city_3, 2010, 1)    18
(city_3, 2010, 2)    19
(city_3, 2020, 1)    20
(city_3, 2020, 2)    21
Name: something, dtype: int64


## Cпособы создания мультииндексных ключей
- список массивов задающих значение на каждом уровне
- Из кортежей (см выше)
- Декартовое произведение обычных индексов
- Описание внутренного представления levels, codes

In [26]:
i1 = pd.MultiIndex.from_arrays([
    ['a', 'a', 'b', 'b'],
    [1, 2, 1, 2]
])
print(i1)

i3 = pd.MultiIndex.from_product([
    ['a', 'b'], [1, 2]
])
print(i3)

i4 = pd.MultiIndex(
    levels = [['a', 'b'], [1,2]],
    codes = [
        [0, 0, 1, 1], # a a b b
        [0, 1, 0, 1], # 1 2 1 2
    ]
)
print(i4)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


In [27]:
# Уровням можно задать названия
data = {
    ('city_1', 2010) : 100,
    ('city_1', 2020) : 101,
    ('city_2', 2010) : 200,
    ('city_2', 2020) : 201,
}

s = pd.Series(data)
print(s)
s.index.names = ['city', 'year']
print(s)

city_1  2010    100
        2020    101
city_2  2010    200
        2020    201
dtype: int64
city    year
city_1  2010    100
        2020    101
city_2  2010    200
        2020    201
dtype: int64


In [32]:
#Мультииндексы для столбцов
index = pd.MultiIndex.from_product(
    [
        ['city_1', 'city_2'],
        [2010, 2020]
    ],
    names = ['city', 'year']
)

columns = pd.MultiIndex.from_product(
    [
        ['person_1', 'person_2', 'person_3'],
        ['job_1', 'job_2']
    ],
    names=['worker', 'job']
)

rng = np.random.default_rng(1)

data = rng.random((4, 6))

data_df = pd.DataFrame(data, index=index, columns=columns)
print(data_df)

worker       person_1            person_2            person_3          
job             job_1     job_2     job_1     job_2     job_1     job_2
city   year                                                            
city_1 2010  0.511822  0.950464  0.144160  0.948649  0.311831  0.423326
       2020  0.827703  0.409199  0.549594  0.027559  0.753513  0.538143
city_2 2010  0.329732  0.788429  0.303195  0.453498  0.134042  0.403113
       2020  0.203455  0.262313  0.750365  0.280409  0.485191  0.980737


In [34]:
# Срезы Series
data = {
    ('city_1', 2010) : 100,
    ('city_1', 2020) : 101,
    ('city_2', 2010) : 200,
    ('city_2', 2020) : 201,
    ('city_3', 2010) : 300,
    ('city_3', 2020) : 301,
}

s = pd.Series(data)
print(s)
s.index.names = ['city', 'year']
print(s['city_1', 2010])
print(s['city_1'])
print(s.loc[:, 2010])
print(s.loc['city_1':'city_2'])
print(s[s > 200])

city_1  2010    100
        2020    101
city_2  2010    200
        2020    201
city_3  2010    300
        2020    301
dtype: int64
100
year
2010    100
2020    101
dtype: int64
city
city_1    100
city_2    200
city_3    300
dtype: int64
city    year
city_1  2010    100
        2020    101
city_2  2010    200
        2020    201
dtype: int64
city    year
city_2  2020    201
city_3  2010    300
        2020    301
dtype: int64


In [38]:
# Перегруппировка мультииндексов

index = pd.MultiIndex.from_product([
    ['a', 'c', 'd'],
    [1, 2]
])
print(i3)
data = pd.Series(rng.random(6), index=index)
print(data)

# print(data['a':'b']) - не работает
data = data.sort_index()
print(data)
print(data['a':'b'])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )
a  1    0.592941
   2    0.260097
c  1    0.839882
   2    0.509496
d  1    0.510889
   2    0.753030
dtype: float64
a  1    0.592941
   2    0.260097
c  1    0.839882
   2    0.509496
d  1    0.510889
   2    0.753030
dtype: float64
a  1    0.592941
   2    0.260097
dtype: float64


In [40]:
index = [
    ('city_1', 2010, 1),
    ('city_1', 2010, 2),
    ('city_1', 2020, 1),
    ('city_1', 2020, 2),
    ('city_2', 2010, 1),
    ('city_2', 2010, 2),
    ('city_2', 2020, 1),
    ('city_2', 2020, 2),
    ('city_3', 2010, 1),
    ('city_3', 2010, 2),
    ('city_3', 2020, 1),
    ('city_3', 2020, 2)
]
population = [
    101,
    1010,
    201,
    2010,
    102,
    1020,
    202,
    2020,
    103,
    1030,
    203,
    2030
]

pop = pd.Series(population, index=index)

i = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(i)
print(pop.unstack(), '\n')
print(pop.unstack(level=0), '\n')
print(pop.unstack(level=1), '\n')
print(pop.unstack(level=2), '\n')

               1     2
city_1 2010  101  1010
       2020  201  2010
city_2 2010  102  1020
       2020  202  2020
city_3 2010  103  1030
       2020  203  2030 

        city_1  city_2  city_3
2010 1     101     102     103
     2    1010    1020    1030
2020 1     201     202     203
     2    2010    2020    2030 

          2010  2020
city_1 1   101   201
       2  1010  2010
city_2 1   102   202
       2  1020  2020
city_3 1   103   203
       2  1030  2030 

               1     2
city_1 2010  101  1010
       2020  201  2010
city_2 2010  102  1020
       2020  202  2020
city_3 2010  103  1030
       2020  203  2030 



In [45]:
# Конкатенация в Pandas - concat

serl1 = pd.Series(['a', 'b', 'c'], index=[1, 2, 3])
serl2 = pd.Series(['d', 'e', 'f'], index=[4, 5, 6])

print(pd.concat([serl1, serl2]))
serl2 = pd.Series(['d', 'e', 'f'], index=[1, 2, 6])
print(pd.concat([serl1, serl2]))
print(pd.concat([serl1, serl2], verify_integrity=False))
print(pd.concat([serl1, serl2], ignore_index=True))
print(pd.concat([serl1, serl2], keys=['x', 'y']))

1    a
2    b
3    c
4    d
5    e
6    f
dtype: object
1    a
2    b
3    c
1    d
2    e
6    f
dtype: object
1    a
2    b
3    c
1    d
2    e
6    f
dtype: object
0    a
1    b
2    c
3    d
4    e
5    f
dtype: object
x  1    a
   2    b
   3    c
y  1    d
   2    e
   6    f
dtype: object
