### 创建DataFrame

In [None]:
# 既有行索引，又有列索引，可以视为由Series组成的字典
# 以二维结构保存数据

#### 通过由等长列表或数组组成的字典创建

In [1]:
import pandas as pd

# With no explicit index or columns argument, the column labels come from the
# dict keys and the rows get a default integer index.
data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data=data)

In [2]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [3]:
# 指定列索引(可用于排序)来创建frame
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [5]:
# A requested column that is not a key of `data` ('debt' here) is still
# created and shows up filled with NaN.
frame2 = pd.DataFrame(
    data,
    index='one two three four five six'.split(),
    columns='year state pop debt'.split(),
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


#### 通过嵌套字典创建

In [None]:
# Nested dict: the outer keys become the columns and the inner keys become
# the row index.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)
frame3 = pd.DataFrame(data=pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [25]:
# With an explicit index the rows follow the requested labels rather than the
# order of the inner-dict keys; a label with no data (2003) becomes a row of
# NaN.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop, index=list(range(2001, 2004)))
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


#### 由Series组成的字典

In [27]:
# Dict of Series: each Series becomes one column. Both slices are positional
# (all but the last row, and the first two rows).
pdata = dict(
    Ohio=frame3['Ohio'].iloc[:-1],
    Nevada=frame3['Nevada'].iloc[:2],
)
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


### 属性

#### columns, index

In [7]:
frame2.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

In [8]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

#### name

In [11]:
frame2['year'].name # 列的名字已经自动设置好了

'year'

In [None]:
# Give the axes themselves a name: `index.name` labels the row index and
# `columns.name` labels the column index.
# NOTE(review): 'year' and 'state' are also column labels of frame2, so the
# rendered header shows those words twice — consider distinct axis names.
frame2.index.name = 'year'
frame2.columns.name = 'state'

In [None]:
frame2

state,year,state,pop
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


#### values

In [32]:
# 返回ndarray
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, nan]])

### 创建列

In [2]:
# NOTE(review): the recorded output of this cell is a NameError for frame2,
# presumably because it was run in a fresh kernel before frame2 was created —
# re-run the notebook top to bottom to refresh it.
import numpy as np
frame2['debt'] = 16.5 # assigning to a column label adds/overwrites that column; a scalar fills every row
print(frame2)
# An array works as well; this one has 6 values, one per row label of frame2
frame2['debt'] = np.arange(6.)
print(frame2)

NameError: name 'frame2' is not defined

In [18]:
# Assigning a Series matches it against frame2's row labels: rows whose label
# appears in the Series take its value, every other row is left as NaN.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


### 删除列

In [19]:
# 通过del删除列
del frame2['debt']
frame2

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


### 操作

#### 转置

In [23]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


#### 重新索引

In [40]:
# Series whose labels are deliberately out of alphabetical order; the original
# index object is kept so it can be compared after reindexing.
obj = pd.Series({'d': 4.5, 'b': 7.2, 'a': -5.3, 'c': 3.6})
old_index = obj.index
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [45]:
# reindex arranges the data by the requested labels; a label that did not
# exist before ('e') is introduced with a missing value.
obj2 = obj.reindex(index=list('abcde'))
print(obj2.index is old_index)
obj2

False


a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [47]:
# The `method` option fills values while reindexing; 'ffill' carries the last
# existing value forward onto the newly introduced labels.
obj3 = pd.Series({0: 'blue', 2: 'purple', 4: 'yellow'})
print(obj3)
obj3.reindex(index=range(6), method='ffill')

0      blue
2    purple
4    yellow
dtype: object


0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [48]:
# Reindexing a DataFrame: start from a 3x3 frame with row labels a, c, d.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [49]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2 

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [50]:
# The columns can be reindexed the same way, through the `columns` keyword.
states = 'Texas Utah California'.split()
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


#### 丢弃轴上的项

In [53]:
# drop() takes the labels to remove from an axis; the result is bound to a new
# name and `obj` is printed first for comparison.
obj = pd.Series(np.arange(5, dtype=float), index=list('abcde'))
print(obj)
new_obj = obj.drop(labels=['d', 'c', 'a'])
print(new_obj)

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
b    1.0
e    4.0
dtype: float64


In [54]:
# 4x4 frame of 0..15; dropping without an axis argument removes rows by label.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data.drop(index=['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [55]:
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [57]:
# Functions that change an object's size or shape often support in-place
# modification: with inplace=True, `data` itself loses the columns.
# NOTE(review): the recorded KeyError looks like the result of running this
# cell a second time, after the columns were already gone (the next cell shows
# only column 'one' left) — confirm and re-run to refresh the output.
data.drop(['two', 'four', 'three'], axis=1, inplace=True)

KeyError: "['two', 'four', 'three'] not found in axis"

In [58]:
data

Unnamed: 0,one
Ohio,0
Colorado,4
Utah,8
New York,12


#### 索引

##### Series索引

In [65]:
# Series indexing works much like NumPy indexing, with labels as an extra
# option. Positional access goes through .iloc: on a label-indexed Series,
# integer keys inside plain [] are only treated as positions by a deprecated
# fallback (FutureWarning since pandas 2.1).
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj['c'])              # single label
print(obj.iloc[2])           # single position
print(obj.iloc[2:4])         # positional slice, end position excluded
print(obj[['b', 'a', 'd']])  # list of labels
print(obj.iloc[[1, 3]])      # list of positions
print(obj[obj < 2])          # boolean mask
print(obj['b':'c'])          # label slice: the end label IS included (labels are kept in order)

2.0
2.0
c    2.0
d    3.0
dtype: float64
b    1.0
a    0.0
d    3.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
dtype: float64
b    1.0
c    2.0
dtype: float64


##### 访问行或列

In [None]:
# Accessing a column.
# Original note: as with NumPy, what indexing returns is a view
# (NOTE(review): not demonstrated in this file — confirm for the pandas
# version in use).
# NOTE(review): the reindex cell earlier in this file rebinds frame2 to a
# frame with columns Ohio/Texas/California, so this cell relies on the earlier
# frame2 that has a 'year' column — run order matters here.
frame2.year # attribute-style access can only be written when the column name exists
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [67]:
frame2[['Ohio', 'Texas']]

Unnamed: 0,Ohio,Texas
a,0.0,1.0
b,,
c,3.0,4.0
d,6.0,7.0


In [None]:
frame2.loc['three'] # 通过名称获取行

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

##### 选取数据

In [71]:
# Slicing a DataFrame with [] selects whole rows (records), not columns.
# NOTE(review): the recorded output shows three rows starting at Colorado,
# which does not match a [:2] slice — it looks stale.
data = pd.DataFrame(
    np.arange(16).reshape(4, -1),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)
data[:2]  # rows are what gets selected

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
data[data['three'] > 5] # 按列的条件选取行

In [78]:
# Indexing with a boolean DataFrame selects individual cells; in the recorded
# output the cells where the condition is False appear as NaN.
print(data[data < 10])
# The same kind of selection can be assigned to; this modifies `data` in place
data[data < 7] = 0
print(data)

          one  two  three  four
Ohio      0.0  0.0    0.0   0.0
Colorado  0.0  5.0    6.0   7.0
Utah      8.0  9.0    NaN   NaN
New York  NaN  NaN    NaN   NaN
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    0      0     7
Utah        8    9     10    11
New York   12   13     14    15


In [None]:
# Selecting with loc
# loc indexes by the axis labels held in the Index
# NOTE(review): the recorded output already contains the 2334 values that are
# only assigned in the iloc cell further down, so the cells appear to have
# been run out of order.
data.loc['Colorado', ['two', 'three']] = 233
print(data)
print(data.loc[:'Utah', 'two']) # label slices are supported

           one   two  three  four
Ohio         0     0      0     0
Colorado     0   233    233     7
Utah      2334  2334     10  2334
New York    12    13     14    15
Ohio           0
Colorado     233
Utah        2334
Name: two, dtype: int64


In [None]:
# Indexing with iloc
# iloc selects by integer position
# Row position 2 and column positions 3, 0, 1 — in the recorded output these
# are Utah's 'four', 'one' and 'two' cells.
data.iloc[2, [3, 0, 1]] = 2334
print(data)
print(data.iloc[[2, 3], [1, 2, 3]])
print(data.iloc[:, :3][data.three > 5]) # slices are supported

           one   two  three  four
Ohio         0     0      0     0
Colorado     0   233    233     7
Utah      2334  2334     10  2334
New York    12    13     14    15
           two  three  four
Utah      2334     10  2334
New York    13     14    15
           one   two  three
Colorado     0   233    233
Utah      2334  2334     10
New York    12    13     14


In [None]:
# 根据行标签和列标签选取单元格(标量)
data.at['Utah', 'one']

2334

In [None]:
# 根据行整数和列整数选取单元格标量
data.iat[3, 3]

15

##### 算术运算规则

In [25]:
# Adding frames whose labels differ produces the union of the row labels and
# of the column labels; the recorded result is NaN wherever a cell exists in
# only one of the two operands.
df1 = pd.DataFrame(
    np.arange(9, dtype=float).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [26]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [27]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [29]:
df1.add(df2, fill_value=0) # 将df2广播后的对应缺失值处填充为0

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


#### Series和DataFrame计算

In [40]:
# DataFrame/Series arithmetic follows NumPy-style broadcasting.
# By default the Series index is matched against the DataFrame's columns and
# the operation is broadcast down the rows.
# A label found in only one of the two objects causes both to be reindexed to
# the union of the labels.
# To match on the rows and broadcast across the columns instead, use one of
# the arithmetic methods with an axis argument.
frame = pd.DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
series3 = frame['d']

In [41]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [42]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [45]:
frame.sub(series3, axis='index') # 等价于axis=0

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


#### 函数应用

##### NumPy ufuncs

In [None]:
# 1. NumPy的ufuncs（元素级数组方法）可用于操作pandas对象

##### apply方法

In [47]:
# 2. apply() runs a function on the one-dimensional array formed by each
# column or each row.
# Many of the most common array statistics already exist as DataFrame
# methods; apply can reproduce them, but calling the method directly is
# simpler.
def f(x):
    """Return the spread of x: its maximum minus its minimum."""
    return x.max() - x.min()


frame = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)

b    1.079484
d    0.306982
e    1.229303
dtype: float64

In [48]:
frame

Unnamed: 0,b,d,e
Utah,-1.992809,-0.714365,-0.029195
Ohio,-2.203671,-0.407383,0.903982
Texas,-1.576672,-0.496367,0.550121
Oregon,-1.124187,-0.708804,1.200107


In [50]:
# By default the function is applied down the rows, i.e. once per column
# NOTE(review): the recorded output below is a full DataFrame, which is not
# what the max-minus-min `f` defined in this file would give per column — it
# looks stale; re-run to confirm.
frame.apply(f)

Unnamed: 0,b,d,e
Utah,-0.992809,0.285635,0.970805
Ohio,-1.203671,0.592617,1.903982
Texas,-0.576672,0.503633,1.550121
Oregon,-0.124187,0.291196,2.200107


In [51]:
# 每行执行函数
frame.apply(f, axis='columns')

Utah      1.963613
Ohio      3.107653
Texas     2.126793
Oregon    2.324295
dtype: float64

##### applymap方法

In [54]:
frame

Unnamed: 0,b,d,e
Utah,-1.992809,-0.714365,-0.029195
Ohio,-2.203671,-0.407383,0.903982
Texas,-1.576672,-0.496367,0.550121
Oregon,-1.124187,-0.708804,1.200107


In [53]:
# applymap is pandas' element-wise mapping function: the callable is applied
# to every cell, which allows more customised per-cell transformations.
# NOTE: pandas 2.1 deprecates DataFrame.applymap in favour of DataFrame.map;
# applymap is kept because the recorded outputs in this file (e.g. Int64Index)
# point to an older pandas.
# The helper is deliberately not called `format`, which would shadow the
# builtin for every later cell.
# Alternative that yields strings instead of floats: '%.2f' % x
def round_2(x):
    """Round a single cell value to two decimal places."""
    return round(x, 2)


frame.applymap(round_2)

Unnamed: 0,b,d,e
Utah,-1.99,-0.71,-0.03
Ohio,-2.2,-0.41,0.9
Texas,-1.58,-0.5,0.55
Oregon,-1.12,-0.71,1.2


#### 约简方法/描述性统计

##### 汇总统计函数

In [23]:
# Reductions on a DataFrame skip NaN by default; pass skipna=False (as in the
# sum() cell that follows) to let NaN propagate into the result.
# Older pandas releases also accepted a `level` argument on these reductions
# for a hierarchical index; pandas 2.0 removed it in favour of
# groupby(level=...).
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [25]:
df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

##### 间接统计

In [26]:
# 比如返回的是目标值的索引
df.idxmax()

one    b
two    d
dtype: object

##### 累计型

In [27]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


##### 一次性产生多个汇总数据

In [28]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [29]:
# For non-numeric data describe() reports a different set of summary
# statistics (count, unique, top, freq in the recorded output).
obj = pd.Series(list('aabc') * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

##### 需要参数对的统计函数

In [30]:
df.corr()

Unnamed: 0,one,two
one,1.0,-1.0
two,-1.0,1.0


In [32]:
# NOTE(review): the recorded output is indexed by the row labels a-d and is
# all NaN, which looks like the result of an axis=1 run rather than this call
# — re-run to confirm.
df.corrwith(df['one']) # correlation of the columns (or rows) with another Series or DataFrame

a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64

#### 排序

##### 索引排序

In [55]:
# sort_index() sorts the row or column labels lexicographically and returns a
# new object.
# `axis` chooses whether the row labels or the column labels are sorted.
# `ascending` chooses the sort direction.
obj = pd.Series(range(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [56]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [57]:
# 2x4 frame whose row labels and column labels are both out of order.
frame = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    columns=list('dabc'),
    index=['three', 'one'],
)
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [62]:
# Sort the column labels.
# NOTE(review): the recorded output lists the columns as d, c, b, a, i.e. in
# descending order, which suggests it was produced with ascending=False —
# re-run to refresh.
frame.sort_index(axis=1)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [61]:
# Sort the row labels.
# NOTE(review): the recorded output still lists 'three' before 'one', so it
# does not look like the result of this call — re-run to refresh.
frame.sort_index(axis=0)

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


##### 值排序

In [64]:
# Missing values are placed at the end by default (they are last in the
# recorded output even though the sort is descending).
obj = pd.Series([4, float('nan'), 7, float('nan'), -3, 2])
obj.sort_values(ascending=False)

2    7.0
0    4.0
5    2.0
4   -3.0
1    NaN
3    NaN
dtype: float64

In [3]:
# A DataFrame is sorted by the values of one or more columns (or rows).
# The `by` argument names the column(s) or row(s) the sort depends on.
# With several keys, a later key only shows an effect where the earlier keys
# have equal values.
frame = pd.DataFrame(dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1], c=[3, 2, 1, 8]))
frame

Unnamed: 0,b,a,c
0,4,0,3
1,7,1,2
2,-3,0,1
3,2,1,8


In [77]:
frame.sort_values(by=['b', 'a'])

Unnamed: 0,b,a,c
2,-3,0,1
3,2,1,8
0,4,0,3
1,7,1,2


In [76]:
# 对一行数据进行排序
frame.sort_values(by=1, axis=1)

Unnamed: 0,a,c,b
0,0,3,4
1,1,2,7
2,0,1,-3
3,1,8,2


##### 排名

In [5]:
frame

Unnamed: 0,b,a,c
0,4,0,3
1,7,1,2
2,-3,0,1
3,2,1,8


In [4]:
# 默认采用平均排名(average)
frame.rank()

Unnamed: 0,b,a,c
0,3.0,1.5,3.0
1,4.0,3.5,2.0
2,1.0,1.5,1.0
3,2.0,3.5,4.0


In [6]:
# 根据原始数据中的出现顺序进行排名
frame.rank(method='first')

Unnamed: 0,b,a,c
0,3.0,1.0,3.0
1,4.0,3.0,2.0
2,1.0,2.0,1.0
3,2.0,4.0,4.0


In [14]:
# 按照降序进行排名，使用整个分组中的最大排名
frame.rank(ascending=False, method='max')

Unnamed: 0,b,a,c
0,2.0,4.0,2.0
1,1.0,2.0,3.0
2,4.0,4.0,4.0
3,3.0,2.0,1.0


In [13]:
# 使用整个分组中的最小排名
frame.rank(method='min')

Unnamed: 0,b,a,c
0,3.0,1.0,3.0
1,4.0,3.0,2.0
2,1.0,1.0,1.0
3,2.0,3.0,4.0


In [15]:
# 排名总是在组间增加1
frame.rank(method='dense')

Unnamed: 0,b,a,c
0,3.0,1.0,3.0
1,4.0,2.0,2.0
2,1.0,1.0,1.0
3,2.0,2.0,4.0


### 索引对象

In [33]:
# Index对象负责存储轴标签和其他元数据
# 不可变，所以可以共享
# 类似于一个固定大小的集合，但是可以包含重复标签
frame3.index

Int64Index([2001, 2002, 2003], dtype='int64')

In [34]:
# Build an Index explicitly and hand it to a Series as its labels.
labels = pd.Index(data=np.arange(3))
obj2 = pd.Series(data=[1.5, -2.5, 0], index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [35]:
obj2.index is labels

True

In [36]:
'Ohio' in frame3.index

False

In [37]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

#### 重新索引

In [39]:
df1.reindex(columns=['b', 'd', 'e', 'f'], fill_value=0)

Unnamed: 0,b,d,e,f
Ohio,0.0,2.0,0,0
Texas,3.0,5.0,0,0
Colorado,6.0,8.0,0,0


#### 带有重复标签

In [20]:
# A Series and a DataFrame whose row labels contain duplicates.
obj = pd.Series(range(5), index=list('aabbc'))
df = pd.DataFrame(np.random.randn(4, 3), index=list('aabb'))
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [18]:
# 查看是否唯一
obj.index.is_unique

False

In [22]:
# What a label lookup returns depends on whether the label is duplicated:
# in the recorded output 'a' occurs twice, so obj['a'] prints a two-element
# Series and df.loc['a'] prints a two-row DataFrame.
print(obj['a'])
print(df.loc['a'])

a    0
a    1
dtype: int64
          0         1         2
a -0.043021  1.598169  0.982465
a  0.300834 -0.629436  0.442839
