来源于官方资料：[十分钟入门pandas](https://www.pypandas.cn/docs/getting_started/10min.html#%E7%94%9F%E6%88%90%E5%AF%B9%E8%B1%A1)

## 1. 创建数据集 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 用值列表生成Series，默认自动生成整数索引
s = pd.Series([1, 2, 3, np.nan, 4])
print(s)

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
dtype: float64


In [4]:
dates = pd.date_range('20210220', periods=6)
print(dates)

DatetimeIndex(['2021-02-20', '2021-02-21', '2021-02-22', '2021-02-23',
               '2021-02-24', '2021-02-25'],
              dtype='datetime64[ns]', freq='D')


In [6]:
# Build a 6x4 DataFrame of random values drawn with np.random.randn,
# using the `dates` DatetimeIndex as the row index and A-D as column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
# Print the frame so this cell actually produces the output shown for it.
print(df)

                   A         B         C         D
2021-02-20 -0.575321  0.641131  0.277875  0.167075
2021-02-21 -1.188967  0.516794  0.284324 -0.308114
2021-02-22 -0.245231 -0.242355 -2.834809 -0.026509
2021-02-23 -1.421828 -0.474671  0.953416 -0.907602
2021-02-24 -0.640503 -0.712421 -1.224029  1.860798
2021-02-25  0.815809  0.702502 -1.608049  1.129848


In [7]:
# Build a DataFrame from a dict whose values mix scalars and array-likes.
# The array-like columns (C, D, E) each have 4 entries; the scalar values
# (A, B, F) are repeated on every row, as the printed frame shows.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df2)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


In [9]:
# 查看不同属性列的类型
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## 2. 查看对象

In [10]:
df.head()

                   A         B         C         D
2021-02-20 -0.575321  0.641131  0.277875  0.167075
2021-02-21 -1.188967  0.516794  0.284324 -0.308114
2021-02-22 -0.245231 -0.242355 -2.834809 -0.026509
2021-02-23 -1.421828 -0.474671  0.953416 -0.907602
2021-02-24 -0.640503 -0.712421 -1.224029  1.860798


In [12]:
df.tail()

                   A         B         C         D
2021-02-21 -1.188967  0.516794  0.284324 -0.308114
2021-02-22 -0.245231 -0.242355 -2.834809 -0.026509
2021-02-23 -1.421828 -0.474671  0.953416 -0.907602
2021-02-24 -0.640503 -0.712421 -1.224029  1.860798
2021-02-25  0.815809  0.702502 -1.608049  1.129848


In [13]:
df.index

DatetimeIndex(['2021-02-20', '2021-02-21', '2021-02-22', '2021-02-23',
               '2021-02-24', '2021-02-25'],
              dtype='datetime64[ns]', freq='D')

In [14]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [16]:
# 将DataFrame转换为NumPy数组（各列类型不同时，结果数组的dtype为object）
df.to_numpy()
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [17]:
df.describe()

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.542674  0.071830 -0.691878  0.319249
std    0.791857  0.621654  1.436321  1.007462
min   -1.421828 -0.712421 -2.834809 -0.907602
25%   -1.051851 -0.416592 -1.512044 -0.237713
50%   -0.607912  0.137219 -0.473077  0.070283
75%   -0.327754  0.610046  0.282712  0.889155
max    0.815809  0.702502  0.953416  1.860798


In [21]:
# 按轴排序：axis=1表示按列标签排序，ascending=False表示降序
df.sort_index(axis=1,ascending=False)

                   D         C         B         A
2021-02-20  0.167075  0.277875  0.641131 -0.575321
2021-02-21 -0.308114  0.284324  0.516794 -1.188967
2021-02-22 -0.026509 -2.834809 -0.242355 -0.245231
2021-02-23 -0.907602  0.953416 -0.474671 -1.421828
2021-02-24  1.860798 -1.224029 -0.712421 -0.640503
2021-02-25  1.129848 -1.608049  0.702502  0.815809


In [22]:
# 按值排序：按B列的值降序排列
df.sort_values(by='B', ascending=False)

                   A         B         C         D
2021-02-25  0.815809  0.702502 -1.608049  1.129848
2021-02-20 -0.575321  0.641131  0.277875  0.167075
2021-02-21 -1.188967  0.516794  0.284324 -0.308114
2021-02-22 -0.245231 -0.242355 -2.834809 -0.026509
2021-02-23 -1.421828 -0.474671  0.953416 -0.907602
2021-02-24 -0.640503 -0.712421 -1.224029  1.860798


## 3. 选择

### 3.1 获取数据

In [23]:
# 选择单列数据
df['A']

2021-02-20   -0.575321
2021-02-21   -1.188967
2021-02-22   -0.245231
2021-02-23   -1.421828
2021-02-24   -0.640503
2021-02-25    0.815809
Freq: D, Name: A, dtype: float64

In [25]:
# 使用[]进行切片
df[0:3]
df['20210220':'20210223']

                   A         B         C         D
2021-02-20 -0.575321  0.641131  0.277875  0.167075
2021-02-21 -1.188967  0.516794  0.284324 -0.308114
2021-02-22 -0.245231 -0.242355 -2.834809 -0.026509
2021-02-23 -1.421828 -0.474671  0.953416 -0.907602


### 3.2 按标签访问

In [26]:
# 使用标签返回一行数据
df.loc[dates[0]]

A   -0.575321
B    0.641131
C    0.277875
D    0.167075
Name: 2021-02-20 00:00:00, dtype: float64

In [28]:
# 用标签选择多列多行数据
df.loc['20210220':'20210223', ['A','B']]

                   A         B
2021-02-20 -0.575321  0.641131
2021-02-21 -1.188967  0.516794
2021-02-22 -0.245231 -0.242355
2021-02-23 -1.421828 -0.474671


In [30]:
# 提取标量值
df.loc[dates[0], 'A']

-0.5753212092336925

### 3.3 按照位置选择


In [31]:
# 类似于numpy切片
df.iloc[3:5, 1:3]

                   B         C
2021-02-23 -0.474671  0.953416
2021-02-24 -0.712421 -1.224029


In [32]:
# 访问标量
df.iloc[1,1]

0.5167938669172373

### 3.4 布尔索引

In [33]:
df[df.B > 0]

                   A         B         C         D
2021-02-20 -0.575321  0.641131  0.277875  0.167075
2021-02-21 -1.188967  0.516794  0.284324 -0.308114
2021-02-25  0.815809  0.702502 -1.608049  1.129848


In [34]:
# 选择整个数据集中满足条件的数据
df[df > 0]

                   A         B         C         D
2021-02-20       NaN  0.641131  0.277875  0.167075
2021-02-21       NaN  0.516794  0.284324       NaN
2021-02-22       NaN       NaN       NaN       NaN
2021-02-23       NaN       NaN  0.953416       NaN
2021-02-24       NaN       NaN       NaN  1.860798
2021-02-25  0.815809  0.702502       NaN  1.129848


In [38]:
# 使用isin进行筛选
df3 = df.copy()
df3['E'] = ['one', 'two', 'three', 'four', 'five', 'six']
df3[df3['E'].isin(['two', 'three'])]

                   A         B         C         D      E
2021-02-21 -1.188967  0.516794  0.284324 -0.308114    two
2021-02-22 -0.245231 -0.242355 -2.834809 -0.026509  three


### 3.5 赋值

In [47]:
# 用Series新增一列时，数据会按索引自动对齐
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20210220', periods=6))
df['F'] = s1
df

                   A         B         C         D  F
2021-02-20 -0.575321  0.641131  0.277875  0.167075  1
2021-02-21 -1.188967  0.516794  0.284324 -0.308114  2
2021-02-22 -0.245231 -0.242355 -2.834809 -0.026509  3
2021-02-23 -1.421828 -0.474671  0.953416 -0.907602  4
2021-02-24 -0.640503 -0.712421 -1.224029  1.860798  5
2021-02-25  0.815809  0.702502 -1.608049  1.129848  6


In [49]:
df.loc[:, 'D'] = np.array([5]*len(df))
df

                   A         B         C  D  F
2021-02-20 -0.575321  0.641131  0.277875  5  1
2021-02-21 -1.188967  0.516794  0.284324  5  2
2021-02-22 -0.245231 -0.242355 -2.834809  5  3
2021-02-23 -1.421828 -0.474671  0.953416  5  4
2021-02-24 -0.640503 -0.712421 -1.224029  5  5
2021-02-25  0.815809  0.702502 -1.608049  5  6


## 4. 缺失值

In [56]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1],'E'] = 1
df1

                   A         B         C  D  F    E
2021-02-20 -0.575321  0.641131  0.277875  5  1  1.0
2021-02-21 -1.188967  0.516794  0.284324  5  2  1.0
2021-02-22 -0.245231 -0.242355 -2.834809  5  3  NaN
2021-02-23 -1.421828 -0.474671  0.953416  5  4  NaN


In [57]:
# 删除缺失数据行
df1.dropna(how='any')

                   A         B         C  D  F    E
2021-02-20 -0.575321  0.641131  0.277875  5  1  1.0
2021-02-21 -1.188967  0.516794  0.284324  5  2  1.0


In [58]:
# 填充
df1.fillna(value=5)

                   A         B         C  D  F    E
2021-02-20 -0.575321  0.641131  0.277875  5  1  1.0
2021-02-21 -1.188967  0.516794  0.284324  5  2  1.0
2021-02-22 -0.245231 -0.242355 -2.834809  5  3  5.0
2021-02-23 -1.421828 -0.474671  0.953416  5  4  5.0


## 5. 运算

### 5.1 统计

In [60]:
df['B'].mean()

0.07182990458251581

In [61]:
# 在行上求平均值
df.mean(1)

2021-02-20    1.268737
2021-02-21    1.322430
2021-02-22    0.935521
2021-02-23    1.611383
2021-02-24    1.484609
2021-02-25    2.182052
Freq: D, dtype: float64