In [3]:
import numpy as np
import pandas as pd


# 1. Series 数据选择方法

### 1. 将 Series 看做字典


In [4]:
# Build a float Series whose explicit index is the labels 'a' through 'd'.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)
data


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [5]:
# Dictionary-style lookup: fetch the value stored under index label 'a'.
data['a']


0.25

In [6]:
# Membership test: `in` checks whether 'a' is one of the index labels.
'a' in data


True

In [7]:
# Retrieve the index labels (the "keys" of the Series).
data.keys()


Index(['a', 'b', 'c', 'd'], dtype='object')

In [9]:
# Materialize the (index label, value) pairs as a list of tuples.
list(data.items())


[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [10]:
# Assigning to a new label appends a row, just like adding a key to a dict.
data['e'] = 1.25
data


a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

### 2. 将 Series 看做一维数组


In [18]:
# Slice by explicit labels: the end label 'c' is included in the result.
print(data['a':'c'])

# Slice by implicit integer position: the stop position 2 is excluded.
print(data[:2])

# Boolean mask: keep the entries strictly between 0.3 and 0.8.
in_range = (data > 0.3) & (data < 0.8)
print(data[in_range])

# Fancy indexing: pick several labels at once with a list.
wanted = ['a', 'e']
print(data[wanted])


a    0.25
b    0.50
c    0.75
dtype: float64
a    0.25
b    0.50
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.25
e    1.25
dtype: float64


### 3. 使用索引器
loc、iloc、ix（注意：`ix` 自 pandas 0.20 起已弃用，并在 pandas 1.0 中被移除，请使用 `loc` 和 `iloc`）

为了避免取值时显式索引和隐式索引之间的混淆，我们使用索引器进行约束。

In [21]:
# loc always addresses the explicit index (labels), for scalars and slices alike.
# A label slice through loc includes its end label 'c'.
data.loc['a'], data.loc['a':'c']

(0.25,
 a    0.25
 b    0.50
 c    0.75
 dtype: float64)

In [23]:
# iloc always addresses the implicit index (integer positions).
# The positional slice 1:3 excludes its stop position.
data.iloc[1], data.iloc[1:3]

(0.5,
 b    0.50
 c    0.75
 dtype: float64)

# 2. DataFrame 数据选择方法
### 1. 将DataFrame看做字典

In [26]:
# Source data: area and population per state, keyed by state name.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
area = pd.Series(area_dict)
population = pd.Series(population_dict)

# Combine the two Series into one DataFrame; each Series becomes a column.
states = pd.DataFrame(dict(area=area, population=population))
states

Unnamed: 0,area,population
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [27]:
# Select a column by name: dictionary-style key access and attribute access
# return the same column.
states['area'], states.area

(California    423967
 Texas         695662
 New York      141297
 Florida       170312
 Illinois      149995
 Name: area, dtype: int64,
 California    423967
 Texas         695662
 New York      141297
 Florida       170312
 Illinois      149995
 Name: area, dtype: int64)

In [29]:
# Add a derived column with dictionary-style assignment:
# density is population divided element-wise by area.
states['density'] = states['population'].div(states['area'])
states

Unnamed: 0,area,population,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


### 2. 看作二维数组

In [31]:
# View the underlying data as a two-dimensional NumPy array.
states.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [32]:
# Transpose the DataFrame, swapping rows and columns.
states.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
population,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [36]:
# Index the underlying array by (row, column): row 0, column 1.
states.values[0, 1]

38332521.0

### 3. 使用索引器取值

In [39]:
# iloc slices by implicit position (stop excluded); loc slices by explicit
# label (end label included).
states.iloc[:3,:2], states.loc[:'Illinois',:'population']

(              area  population
 California  423967    38332521
 Texas       695662    26448193
 New York    141297    19651127,
               area  population
 California  423967    38332521
 Texas       695662    26448193
 New York    141297    19651127
 Florida     170312    19552860
 Illinois    149995    12882135)