# Pandas索引

In [1]:
import pandas as pd

In [6]:
from io import StringIO

In [103]:
pd.Index?

pd.Index(

    data=None,
    dtype=None,
    copy=False,
    name=None,
    tupleize_cols=True,
    **kwargs,
) -> 'Index'

In [104]:
pd.Index([*'abc'])

Index(['a', 'b', 'c'], dtype='object')

In [55]:
# Parse an inline three-row CSV; no index_col is given, so rows get the default integer index.
df = pd.read_csv(
    StringIO(
        'a,b,c\n'
        'aa,1,2\n'
        'bb,3,4\n'
        'cc,5,6'
    )
)

In [56]:
df

Unnamed: 0,a,b,c
0,aa,1,2
1,bb,3,4
2,cc,5,6


In [58]:
pd.read_csv(StringIO('a,b,c\naa,1,2\nbb,3,4\ncc,5,6'), index_col=['a'])

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,1,2
bb,3,4
cc,5,6


In [59]:
pd.read_csv(StringIO('a,b,c\naa,1,2\nbb,3,4\ncc,5,6'), index_col='a')

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,1,2
bb,3,4
cc,5,6


In [60]:
pd.read_csv(StringIO('a,b,c\naa,1,2\nbb,3,4\ncc,5,6'), index_col=0)

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,1,2
bb,3,4
cc,5,6


In [79]:
# Build a two-level MultiIndex from parallel arrays: one inner list per level.
arrays = [
    [1, 1, 2, 2],
    ['red', 'blue', 'red', 'blue'],
]
pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))

MultiIndex([(1,  'red'),
            (1, 'blue'),
            (2,  'red'),
            (2, 'blue')],
           names=['number', 'color'])

In [46]:
# Two-level row index taken from CSV columns 'a' and 'b'.
# The pair ('aa', 'a') appears twice and rows stay in file order (the index is not sorted).
dfm = pd.read_csv(
    StringIO(
        'a,b,c,d\n'
        'aa,a,2,1\n'
        'bb,b,3,4\n'
        'cc,a,5,6\n'
        'aa,a,5,6'
    ),
    index_col=[0, 1],
)

In [51]:
# Same layout as a two-level indexed frame, but the outer label 'aa' spans
# two different inner labels ('a' and 'b'), and ('aa', 'a') is duplicated.
dfa = pd.read_csv(
    StringIO(
        'a,b,c,d\n'
        'aa,a,2,1\n'
        'aa,b,3,4\n'
        'cc,a,5,6\n'
        'aa,a,5,6'
    ),
    index_col=[0, 1],
)

In [52]:
dfa

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
aa,a,2,1
aa,b,3,4
cc,a,5,6
aa,a,5,6


In [53]:
dfa.index

MultiIndex([('aa', 'a'),
            ('aa', 'b'),
            ('cc', 'a'),
            ('aa', 'a')],
           names=['a', 'b'])

In [47]:
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
aa,a,2,1
bb,b,3,4
cc,a,5,6
aa,a,5,6


In [40]:
dfm.index

MultiIndex([('aa', 'a'),
            ('bb', 'b'),
            ('cc', 'a'),
            ('aa', 'a')],
           names=['a', 'b'])

In [43]:
dfm.shape

(4, 2)

In [65]:
# Single-level index taken from the first CSV column; all labels are unique.
dfs = pd.read_csv(
    StringIO(
        'a,b,c\n'
        'aa,1,2\n'
        'bb,3,4\n'
        'cc,5,6'
    ),
    index_col=0,
)

In [66]:
dfs

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,1,2
bb,3,4
cc,5,6


In [67]:
dfs['aa': 'cc']

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,1,2
bb,3,4
cc,5,6


In [70]:
# Single-level index from the first CSV column; the label 'aa' appears twice (non-unique index).
dfa = pd.read_csv(
    StringIO(
        'a,b,c\n'
        'aa,1,2\n'
        'bb,3,4\n'
        'cc,5,6\n'
        'aa,7,8'
    ),
    index_col=0,
)

In [71]:
dfa

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,1,2
bb,3,4
cc,5,6
aa,7,8


In [72]:
# The index contains a duplicated label; slice by label with a bound on that
# duplicated label ('aa'). This raises KeyError because the slice bound is non-unique.
dfa['aa': 'cc']

KeyError: "Cannot get left slice bound for non-unique label: 'aa'"

In [62]:
dfm[0: 2]

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
aa,a,2,1
bb,b,3,4


In [64]:
# Accessing a multi-level index: select column 'c' for the outer-level label 'aa'
dfm.loc['aa', 'c']

PerformanceWarning: indexing past lexsort depth may impact performance.
  dfm.loc['aa', 'c']


b
a    2
a    5
Name: c, dtype: int64

In [74]:
# Try to access by a second-level index label directly: 'a' is not a label of the
# first level, so this raises KeyError
dfm.loc['a', 'c']

PerformanceWarning: indexing past lexsort depth may impact performance.
  dfm.loc['a', 'c']


KeyError: 'a'

In [75]:
dfm.loc[('aa', 'a'), 'c']

PerformanceWarning: indexing past lexsort depth may impact performance.
  dfm.loc[('aa', 'a'), 'c']


a   b
aa  a    2
    a    5
Name: c, dtype: int64

In [76]:
# After sorting the index, the related warning no longer appears
tmp = dfm.sort_index()

In [77]:
tmp.index

MultiIndex([('aa', 'a'),
            ('aa', 'a'),
            ('bb', 'b'),
            ('cc', 'a')],
           names=['a', 'b'])

In [78]:
tmp.loc[('aa', 'a'), 'c']

a   b
aa  a    2
    a    5
Name: c, dtype: int64

In [80]:
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
aa,a,2,1
aa,a,5,6
bb,b,3,4
cc,a,5,6


## 修改index

In [98]:
df.index

Index(['aa', 'bb', 'cc'], dtype='object', name='a')

In [99]:
# Replace the row labels with 'a', 'b', 'c'; assigning a plain list discards the old index name.
df.index = list('abc')

In [100]:
df.index

Index(['a', 'b', 'c'], dtype='object')

df.reset_index(

    level: 'Hashable | Sequence[Hashable] | None' = None,
    drop: 'bool' = False,
    inplace: 'bool' = False,
    col_level: 'Hashable' = 0,
    col_fill: 'Hashable' = '',
) -> 'DataFrame | None'

In [101]:
df.reset_index()

Unnamed: 0,index,b,c
0,a,1,2
1,b,3,4
2,c,5,6


In [106]:
df.sort_values(by='b',ascending=False)

Unnamed: 0,b,c
c,5,6
b,3,4
a,1,2


In [107]:
df.sort_values(by='b',ascending=False).reset_index()

Unnamed: 0,index,b,c
0,c,5,6
1,b,3,4
2,a,1,2


In [108]:
# Four rows of [label, int, int]; columns are not named here, so they default to 0, 1, 2.
dfb = pd.DataFrame(
    data=[
        ['a', 1, 2],
        ['b', 2, 3],
        ['a', 1, 9],
        ['b', 3, 4],
    ]
)

In [110]:
dfb.columns = ['aa', 'bb', 'cc']

In [111]:
dfb

Unnamed: 0,aa,bb,cc
0,a,1,2
1,b,2,3
2,a,1,9
3,b,3,4


dfb.groupby(

    by=None,
    axis: 'Axis' = 0,
    level: 'Level | None' = None,
    as_index: 'bool' = True,
    sort: 'bool' = True,
    group_keys: 'bool' = True,
    squeeze: 'bool | lib.NoDefault' = <no_default>,
    observed: 'bool' = False,
    dropna: 'bool' = True,
) -> 'DataFrameGroupBy'

In [117]:
dfb.groupby(by=['aa']).mean()

Unnamed: 0_level_0,bb,cc
aa,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.0,5.5
b,2.5,3.5


In [116]:
dfb.groupby(by=['aa'], as_index=False).mean()

Unnamed: 0,aa,bb,cc
0,a,1.0,5.5
1,b,2.5,3.5


## 日期索引

- DatetimeIndex

- PeriodIndex

In [81]:
pd.DatetimeIndex?

pd.DatetimeIndex(

    data=None,
    freq=<no_default>,
    tz=None,
    normalize: 'bool' = False,
    closed=None,
    ambiguous='raise',
    dayfirst: 'bool' = False,
    yearfirst: 'bool' = False,
    dtype: 'Dtype | None' = None,
    copy: 'bool' = False,
    name: 'Hashable' = None,
) -> 'DatetimeIndex'

In [83]:
pd.date_range?

pd.date_range(

    start=None,
    end=None,
    periods=None,
    freq=None,
    tz=None,
    normalize: 'bool' = False,
    name: 'Hashable' = None,
    closed: 'str | None | lib.NoDefault' = <no_default>,
    inclusive: 'str | None' = None,
    **kwargs,
) -> 'DatetimeIndex'

In [93]:
# Values 0..9 indexed by ten consecutive daily timestamps starting at 2001-10-01.
dft = pd.DataFrame(
    data=range(10),
    index=pd.date_range(start='2001-10-1', periods=10),
)

In [94]:
dft

Unnamed: 0,0
2001-10-01,0
2001-10-02,1
2001-10-03,2
2001-10-04,3
2001-10-05,4
2001-10-06,5
2001-10-07,6
2001-10-08,7
2001-10-09,8
2001-10-10,9


In [95]:
dft.index

DatetimeIndex(['2001-10-01', '2001-10-02', '2001-10-03', '2001-10-04',
               '2001-10-05', '2001-10-06', '2001-10-07', '2001-10-08',
               '2001-10-09', '2001-10-10'],
              dtype='datetime64[ns]', freq='D')

In [118]:
dft.loc[dft.index.day > 3]

Unnamed: 0,0
2001-10-04,3
2001-10-05,4
2001-10-06,5
2001-10-07,6
2001-10-08,7
2001-10-09,8
2001-10-10,9


In [119]:
dft.loc[dft.index.weekday == 3]

Unnamed: 0,0
2001-10-04,3


In [120]:
dft.index.weekday

Int64Index([0, 1, 2, 3, 4, 5, 6, 0, 1, 2], dtype='int64')

利用 pandas `DatetimeIndex` 的这些属性 (如 `day`、`weekday`), 按日期条件筛选和处理数据会变得非常简单

例如查看商品在星期天的销量: 只需用 `index.weekday == 6` 作为筛选条件 (`weekday` 中 0 表示星期一, 6 表示星期天)