Series和Datafram索引的原理一样，我们以Dataframe的索引为主来学习
* 列索引：df['列名']  （Series不存在列索引）
* 行索引：df.loc[]、df.iloc[]

选择列 / 选择行 / 切片 / 布尔判断

In [1]:
import numpy as np
import pandas as pd  
# 导入numpy、pandas模块

In [2]:
# 选择行与列

df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
                   index = ['one','two','three'],
                   columns = ['a','b','c','d'])
print(df)

data1 = df['a']
data2 = df[['a','c']]
print(data1,type(data1))
print(data2,type(data2))
print('-----')
# 按照列名选择列，只选择一列输出Series，选择多列输出Dataframe

data3 = df.loc['one']
data4 = df.loc[['one','two']]
print(data2,type(data3))
print(data3,type(data4))
# 按照index选择行，只选择一行输出Series，选择多行输出Dataframe

               a          b          c          d
one     5.191896  33.756807  55.531059  48.271119
two    73.611065  25.943409  63.896590  10.736052
three  82.450101  45.914238  37.840761  64.896341
one       5.191896
two      73.611065
three    82.450101
Name: a, dtype: float64 <class 'pandas.core.series.Series'>
               a          c
one     5.191896  55.531059
two    73.611065  63.896590
three  82.450101  37.840761 <class 'pandas.core.frame.DataFrame'>
-----
               a          c
one     5.191896  55.531059
two    73.611065  63.896590
three  82.450101  37.840761 <class 'pandas.core.series.Series'>
a     5.191896
b    33.756807
c    55.531059
d    48.271119
Name: one, dtype: float64 <class 'pandas.core.frame.DataFrame'>


In [3]:
# df[] - 选择列
# 一般用于选择列，也可以选择行，但不推荐，行索引用.loc与.iloc

df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
                   index = ['one','two','three'],
                   columns = ['a','b','c','d'])
print(df)
print('-----')

data1 = df['a']
data2 = df[['b','c']]  # 尝试输入 data2 = df[['b','c','e']]
print(data1)
print(data2)
# df[]默认选择列，[]中写列名（所以一般数据colunms都会单独制定，不会用默认数字列名，以免和index冲突）
# 单选列为Series，print结果为Series格式
# 多选列为Dataframe，print结果为Dataframe格式

# 核心笔记：df[col]一般用于选择列，[]中写列名

               a          b          c          d
one    32.302368  89.444542  70.904647   3.899547
two    71.309217  63.006986  73.751675  34.063717
three  13.534943  84.102451  48.329891  33.537992
-----
one      32.302368
two      71.309217
three    13.534943
Name: a, dtype: float64
               b          c
one    89.444542  70.904647
two    63.006986  73.751675
three  84.102451  48.329891


In [4]:
# df.loc[] - 按index选择行

df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df1)
print(df2)
print('-----')

data1 = df1.loc['one']
data2 = df2.loc[1]
print(data1)
print(data2)
print('单标签索引\n-----')
# 单个标签索引，返回Series

data3 = df1.loc[['two','three','five']]
data4 = df2.loc[[3,2,1]]
print(data3)
print(data4)
print('多标签索引\n-----')
# 多个标签索引，如果标签不存在，则返回NaN
# 顺序可变
# 这里‘five’标签不存在，所以有警告

data5 = df1.loc['one':'three']
data6 = df2.loc[1:3]
print(data5)
print(data6)
print('切片索引')
# 可以做切片对象
# 末端包含

# 核心笔记：df.loc[label]主要针对index选择行，同时支持指定index，及默认数字index

               a          b          c          d
one    41.473536  36.036192  61.836041  13.373447
two    83.709165  96.248540  31.266231  84.736594
three  48.617461  82.627569  68.185809  71.803329
four   38.772901  89.275885  84.279757  78.687116
           a          b          c          d
0   1.387796  39.795388  12.439624  20.428982
1  88.289011  47.849035  50.188306  77.745736
2  20.914579  13.127105  28.333499  73.411151
3  27.545903  89.901712  14.438023  81.676334
-----
a    41.473536
b    36.036192
c    61.836041
d    13.373447
Name: one, dtype: float64
a    88.289011
b    47.849035
c    50.188306
d    77.745736
Name: 1, dtype: float64
单标签索引
-----
               a          b          c          d
two    83.709165  96.248540  31.266231  84.736594
three  48.617461  82.627569  68.185809  71.803329
five         NaN        NaN        NaN        NaN
           a          b          c          d
3  27.545903  89.901712  14.438023  81.676334
2  20.914579  13.127105  28.333499  73.4

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [5]:
# df.iloc[] - 按照整数位置（从轴的0到length-1）选择行
# 类似list的索引，其顺序就是dataframe的整数位置，从0开始计

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
print(df)
print('------')

print(df.iloc[0])
print(df.iloc[-1])
#print(df.iloc[4])
print('单位置索引\n-----')
# 单位置索引
# 和loc索引不同，不能索引超出数据行数的整数位置

print(df.iloc[[0,2]])
print(df.iloc[[3,2,1]])
print('多位置索引\n-----')
# 多位置索引
# 顺序可变

print(df.iloc[1:3])
print(df.iloc[::2])
print('切片索引')
# 切片索引
# 末端不包含

               a          b          c          d
one    54.801913  89.255351  98.658589  94.711159
two    83.046566  60.027028  35.391679  64.098591
three  20.234443  35.438036  60.514223  78.569194
four   64.699287  41.698551  88.225082  43.686279
------
a    54.801913
b    89.255351
c    98.658589
d    94.711159
Name: one, dtype: float64
a    64.699287
b    41.698551
c    88.225082
d    43.686279
Name: four, dtype: float64
单位置索引
-----
               a          b          c          d
one    54.801913  89.255351  98.658589  94.711159
three  20.234443  35.438036  60.514223  78.569194
               a          b          c          d
four   64.699287  41.698551  88.225082  43.686279
three  20.234443  35.438036  60.514223  78.569194
two    83.046566  60.027028  35.391679  64.098591
多位置索引
-----
               a          b          c          d
two    83.046566  60.027028  35.391679  64.098591
three  20.234443  35.438036  60.514223  78.569194
               a          b          c        

In [6]:
# 布尔型索引
# 多用于索引行

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
print(df)
print('------')

b1 = df < 20
print(b1,type(b1))
print(df[b1])  # 也可以书写为 df[df < 20]
print('------')
# 不做索引则会对数据每个值进行判断
# 索引结果保留 所有数据：True返回原数据，False返回值为NaN

b2 = df['a'] > 50
print(b2,type(b2))
print(df[b2])  # 也可以书写为 df[df['a'] > 50]
print('------')
# 单列做判断
# 索引结果保留 单列判断为True的行数据，包括其他列

b3 = df[['a','b']] > 50
print(b3,type(b3))
print(df[b3])  # 也可以书写为 df[df[['a','b']] > 50]
print('------')
# 多列做判断
# 索引结果保留 所有数据：True返回原数据，False返回值为NaN
# 注意这里报错的话，更新一下pandas → conda update pandas

b4 = df.loc[['one','three']] < 50
print(b4,type(b4))
print(df[b4])  # 也可以书写为 df[df.loc[['one','three']] < 50]
print('------')
# 多行做判断
# 索引结果保留 所有数据：True返回原数据，False返回值为NaN

               a          b          c          d
one     9.970915  10.403036  61.548723  33.807531
two    64.147530  20.837293  47.027831  24.937798
three  79.262523   6.433300   6.151607  96.142251
four   97.723110   8.283644  70.768830   6.523142
------
           a      b      c      d
one     True   True  False  False
two    False  False  False  False
three  False   True   True  False
four   False   True  False   True <class 'pandas.core.frame.DataFrame'>
              a          b         c         d
one    9.970915  10.403036       NaN       NaN
two         NaN        NaN       NaN       NaN
three       NaN   6.433300  6.151607       NaN
four        NaN   8.283644       NaN  6.523142
------
one      False
two       True
three     True
four      True
Name: a, dtype: bool <class 'pandas.core.series.Series'>
               a          b          c          d
two    64.147530  20.837293  47.027831  24.937798
three  79.262523   6.433300   6.151607  96.142251
four   97.723110   8.28364

In [7]:
# 多重索引：比如同时索引行和列
# 先选择列再选择行 —— 相当于对于一个数据，先筛选字段，再选择数据量

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
print(df)
print('------')

print(df['a'].loc[['one','three']])   # 选择a列的one，three行
print(df[['b','c','d']].iloc[::2])   # 选择b，c，d列的one，three行
print(df[df['a'] < 50].iloc[:2])   # 选择满足判断索引的前两行数据

               a          b          c          d
one    98.661560  29.514835  54.308770  85.895547
two    27.937505   7.272639  38.820131  93.830862
three  75.479305  80.195558  16.024623  63.068741
four   48.927145  38.935594  18.076788  48.773935
------
one      98.661560
three    75.479305
Name: a, dtype: float64
               b          c          d
one    29.514835  54.308770  85.895547
three  80.195558  16.024623  63.068741
              a          b          c          d
two   27.937505   7.272639  38.820131  93.830862
four  48.927145  38.935594  18.076788  48.773935
