# 索引

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Column spec mixing scalar values ('A', 'B', 'F') with length-4 sequences ('C', 'D', 'E').
dict_data = dict(
    A=1,
    B=pd.Timestamp('20190926'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([1, 2, 3, 4], dtype='int32'),
    E=["Python", "Java", "C++", "C"],
    F='wangdao',
)
df_obj2 = pd.DataFrame(dict_data)
# Show the row index of the resulting frame.
print(df_obj2.index)

Index([0, 1, 2, 3], dtype='int64')


# 常见的Index种类
- Index：通用索引，元素可以是各种类型
- Int64Index：整数索引（pandas 2.0 起已移除，整数索引统一表示为 dtype 为 int64 的 Index，如上方输出所示）
- MultiIndex：层级索引，难度较大
- DatetimeIndex：时间戳类型的索引

In [4]:
# A Series of 0..4 whose index is the labels 'a'..'e' instead of the default integers.
ser_obj = pd.Series(data=range(5), index=[*"abcde"])
print(ser_obj)
# Kept as the last expression so the notebook displays the index object itself.
ser_obj.index

a    0
b    1
c    2
d    3
e    4
dtype: int64


Index(['a', 'b', 'c', 'd', 'e'], dtype='str')

In [5]:
# Scalar lookups, one per output line:
#   .loc['b']  -> element with index label 'b'
#   .iloc[2]   -> element at integer position 2
print(ser_obj.loc['b'], ser_obj.iloc[2], sep='\n')

1
2


In [6]:
# Slicing a contiguous range.
#   .iloc[1:3]     -> by position; the stop position is excluded (half-open)
#   .loc['b':'d']  -> by label; the stop label is included, because a label has to be
#                     reached before pandas knows where the slice ends
for window in (ser_obj.iloc[1:3], ser_obj.loc['b':'d']):
    print(window)

b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64


In [7]:
# Non-contiguous selection: hand .iloc a list of positions, or .loc a list of labels.
print(ser_obj.iloc[[0, 2, 4]], ser_obj.loc[['a', 'e']], sep='\n')

a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64


In [8]:
# Boolean indexing, step 1: build an element-wise mask (True where the value exceeds 2).
ser_bool = ser_obj.gt(2)
# Print the data first, then the mask, so the two can be compared row by row.
print(ser_obj, ser_bool, sep='\n')

a    0
b    1
c    2
d    3
e    4
dtype: int64
a    False
b    False
c    False
d     True
e     True
dtype: bool


In [9]:
print('-' * 50)
# Boolean indexing, step 2: a stored mask and an inline comparison select the same
# elements (those greater than 2).
for mask in (ser_bool, ser_obj > 2):
    print(ser_obj[mask])

--------------------------------------------------
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


## DataFrame索引

In [10]:
# 5x4 frame of random draws with named columns and the default integer row index.
# numpy is already imported in the notebook's first cell, so it is not re-imported here.
# NOTE(review): no random seed is set, so the values differ on every run.
df_obj = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0 -2.937265  1.248545  1.700859  0.153383
1 -0.291012  0.461235 -1.764043 -0.843810
2 -0.996522  0.864565  0.564925 -0.724074
3 -1.151219  1.294648  1.200267 -0.076491
4  0.478739  1.879695 -1.447132  0.762688


In [11]:
# Column selection with [].
column_as_series = df_obj['a']     # a single label yields a Series
column_as_frame = df_obj[['a']]    # a list of labels (extra brackets) yields a DataFrame
# Print the Series, the DataFrame, and the DataFrame's type, separated by divider lines.
print(column_as_series, column_as_frame, type(column_as_frame), sep='\n' + '-' * 50 + '\n')

0   -2.937265
1   -0.291012
2   -0.996522
3   -1.151219
4    0.478739
Name: a, dtype: float64
--------------------------------------------------
          a
0 -2.937265
1 -0.291012
2 -0.996522
3 -1.151219
4  0.478739
--------------------------------------------------
<class 'pandas.DataFrame'>


## loc 标签索引（通过索引标签值获取数据）

In [12]:
# Label-based selection: .loc is the recommended, explicit accessor.
# NOTE(review): the original note also called .loc "more efficient"; nothing here measures that.
# Series
print(ser_obj.loc['b':'d']) # label slice: both endpoints ('b' and 'd') are included

a    0
b    1
c    2
d    3
e    4
dtype: int64
b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
--------------------------------------------------


In [13]:
# DataFrame: rebind df_obj to a frame whose row labels ('a'..'e') overlap its
# column labels ('a'..'d'), so row access and column access are easy to confuse.
df_obj = pd.DataFrame(
    np.random.randn(5, 4),
    index=list('abcde'),
    columns=list('abcd'),
)
# In order: the whole frame, df_obj['a'] (the COLUMN labelled 'a' — ambiguous spelling,
# not recommended), and df_obj.loc['a'] (the ROW labelled 'a'). A divider follows each.
for view in (df_obj, df_obj['a'], df_obj.loc['a']):
    print(view)
    print('-' * 50)


          a         b         c         d
a -0.724929  0.677077 -0.791651 -0.316745
b -0.001068 -0.127204 -0.681314  3.231601
c -0.333347 -1.496209 -0.374502  1.307881
d -0.481801  0.137053  0.979537 -1.947072
e -0.936224  0.529582 -0.015198  0.056315
--------------------------------------------------
a   -0.724929
b   -0.001068
c   -0.333347
d   -0.481801
e   -0.936224
Name: a, dtype: float64
--------------------------------------------------
a   -0.724929
b    0.677077
c   -0.791651
d   -0.316745
Name: a, dtype: float64
--------------------------------------------------


In [14]:
# .loc[rows, columns]: the first selector picks rows, the second picks columns,
# both by label; label slices include both endpoints.
# NOTE(review): the original note also claimed .loc/.iloc are faster than plain [] — unverified.
label_selections = (
    df_obj.loc['a':'c', 'b':'d'],         # contiguous label ranges
    df_obj.loc[['a', 'c'], ['b', 'd']],   # explicit, non-contiguous labels
    df_obj.loc[['c'], ['b']],             # list selectors: a 1x1 DataFrame
    df_obj.loc['c', 'b'],                 # bare labels: the scalar value itself
)
for selection in label_selections:
    print(selection)

          b         c         d
a  0.677077 -0.791651 -0.316745
b -0.127204 -0.681314  3.231601
c -1.496209 -0.374502  1.307881
          b         d
a  0.677077 -0.316745
c -1.496209  1.307881
          b
c -1.496209
-1.4962092051184015


## iloc 位置索引

In [15]:
# Series: an integer slice through plain [] next to the explicit .iloc spelling.
# Both select positions 1 and 2 here (the stop position 3 is excluded); .iloc is the
# spelling that states "by position" outright.
print(ser_obj[1:3], ser_obj.iloc[1:3], sep='\n' + '-' * 50 + '\n')


--------------------------------------------------
b    1
c    2
dtype: int64
--------------------------------------------------
b    1
c    2
dtype: int64


In [16]:
# Bare expression as the cell's last line: the notebook displays df_obj itself.
df_obj

Unnamed: 0,a,b,c,d
a,-0.724929,0.677077,-0.791651,-0.316745
b,-0.001068,-0.127204,-0.681314,3.231601
c,-0.333347,-1.496209,-0.374502,1.307881
d,-0.481801,0.137053,0.979537,-1.947072
e,-0.936224,0.529582,-0.015198,0.056315


In [17]:

# DataFrame: .iloc[rows, columns] selects by integer position; slice stops are excluded.
positional_views = (
    df_obj,                        # the full frame, for reference
    df_obj.iloc[0:2, 0:2],         # first two rows, first two columns
    df_obj.iloc[[0, 2], [0, 2]],   # non-contiguous positions
    df_obj.iloc[0, 0],             # a single scalar value
)
# One divider line between consecutive results.
print(*positional_views, sep='\n' + '-' * 50 + '\n')

          a         b         c         d
a -0.724929  0.677077 -0.791651 -0.316745
b -0.001068 -0.127204 -0.681314  3.231601
c -0.333347 -1.496209 -0.374502  1.307881
d -0.481801  0.137053  0.979537 -1.947072
e -0.936224  0.529582 -0.015198  0.056315
--------------------------------------------------
          a         b
a -0.724929  0.677077
b -0.001068 -0.127204
--------------------------------------------------
          a         c
a -0.724929 -0.791651
c -0.333347 -0.374502
--------------------------------------------------
-0.7249290800404431


In [18]:
# With no explicit row/column labels the default integer labels coincide with positions,
# which makes the .iloc / .loc difference easy to see.
df_obj2 = pd.DataFrame(np.random.randn(5, 4))
default_label_views = (
    df_obj2,            # the full frame
    df_obj2.iloc[0:2],  # positional slice, stop excluded: 2 rows
    df_obj2.loc[0:2],   # label slice, stop included: 3 rows
)
print(*default_label_views, sep='\n' + '-' * 50 + '\n')

          0         1         2         3
0  0.135293 -0.463403 -1.008775 -0.641482
1  1.120191  0.659343 -0.378615 -0.237859
2 -1.170630  0.297060 -0.924234  1.359205
3 -0.476766 -0.583614 -0.296421 -0.822441
4 -0.773998  1.901052 -2.116670  0.929598
--------------------------------------------------
          0         1         2         3
0  0.135293 -0.463403 -1.008775 -0.641482
1  1.120191  0.659343 -0.378615 -0.237859
--------------------------------------------------
          0         1         2         3
0  0.135293 -0.463403 -1.008775 -0.641482
1  1.120191  0.659343 -0.378615 -0.237859
2 -1.170630  0.297060 -0.924234  1.359205


## 对齐运算

In [19]:
# Series alignment: arithmetic pairs elements by index label, not by position.
# pandas is already imported in the notebook's first cell, so it is not re-imported here.
s1 = pd.Series(range(10, 20), index=range(10))
s2 = pd.Series(range(20, 25), index=range(5))
# Labels 5..9 exist only in s1, so the sum is NaN there and the result is float64.
print('s1+s2: ')
s3 = s1 + s2
print(s3)

s1+s2: 
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [22]:
# Label 6 exists only in s1, so the aligned sum s3 holds NaN there.
print(np.isnan(s3.loc[6]))
print('-' * 50)
# With fill_value=0, whichever operand lacks a label contributes 0 for it before the
# arithmetic runs. First the sum, then the difference.
for aligned_op in (s2.add, s2.sub):
    print(aligned_op(s1, fill_value=0))

True
--------------------------------------------------
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    15.0
6    16.0
7    17.0
8    18.0
9    19.0
dtype: float64
0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5   -15.0
6   -16.0
7   -17.0
8   -18.0
9   -19.0
dtype: float64


In [23]:
# DataFrame alignment: arithmetic matches on both row labels and column labels.
# numpy is already imported in the notebook's first cell, so it is not re-imported here.
df1 = pd.DataFrame(np.ones((2, 2)), columns=['a', 'b'])
df2 = pd.DataFrame(np.ones((3, 3)), columns=['a', 'b', 'c'])
print(df1)
print(df2)
print('-' * 50)
print(df2.dtypes)
# Cells that exist in only one frame (row 2, column 'c') come out as NaN.
print(df1 - df2)
# With fill_value=2, the frame missing a cell contributes 2 there: 1 - 2 = -1.
print(df2.sub(df1, fill_value=2))

     a    b
0  1.0  1.0
1  1.0  1.0
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0
--------------------------------------------------
a    float64
b    float64
c    float64
dtype: object
     a    b   c
0  0.0  0.0 NaN
1  0.0  0.0 NaN
2  NaN  NaN NaN
     a    b    c
0  0.0  0.0 -1.0
1  0.0  0.0 -1.0
2 -1.0 -1.0 -1.0


总结：对齐运算按索引标签匹配，未对齐的位置默认结果为 NaN；使用 add、sub 等方法时，可以通过 fill_value 参数为缺失的一方指定填充值。