In [18]:
import numpy as np
import pandas as pd

In [19]:
# Load the daily sales sheet (date / sales-volume columns) and preview the first rows.
df = pd.read_excel(
    './data/demo_04.xlsx',
    sheet_name='Sheet1',
)
df.head()

Unnamed: 0,日期,销量
0,2021-01-01,6961
1,2021-01-02,2047
2,2021-01-03,4205
3,2021-01-05,8988
4,2021-01-06,6658


# 数据偏移

## shift函数
- 销量增长率 = (本期销量 - 上期销量) / 上期销量

In [20]:
# Index by date so the shift below can operate on calendar days.
tb = df.set_index('日期')
# freq='D' moves the index labels forward by one day instead of moving values down
# one row; the assignment then aligns on date, so a date whose previous calendar day
# has no record receives NaN.
tb['上期销量'] = tb['销量'].shift(1, freq='D')
# Growth rate = (current sales - previous sales) / previous sales.
tb['销量增长率'] = (tb['销量'] - tb['上期销量']) / tb['上期销量']

In [21]:
# Render the growth rate as a percentage string with two decimals (e.g. '-70.59%'),
# leaving missing values as NaN.
# The ratio is recomputed from the numeric sales columns instead of being read back
# from '销量增长率': that column holds strings after the first run, and formatting a
# string with '{:.2%}' raises ValueError, so the original cell could not be re-run.
tb['销量增长率'] = ((tb['销量'] - tb['上期销量']) / tb['上期销量']).map(
    lambda ratio: np.nan if pd.isnull(ratio) else '{:.2%}'.format(ratio))
tb

Unnamed: 0_level_0,销量,上期销量,销量增长率
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-01,6961,,
2021-01-02,2047,6961.0,-70.59%
2021-01-03,4205,2047.0,105.42%
2021-01-05,8988,,
2021-01-06,6658,8988.0,-25.92%
...,...,...,...
2021-06-26,5822,6863.0,-15.17%
2021-06-27,4317,5822.0,-25.85%
2021-06-28,4971,4317.0,15.15%
2021-06-29,3742,4971.0,-24.72%


## diff函数
- 按行偏移后计算差值：本行销量 − 上一行销量（不考虑日期是否连续）

In [24]:
# Row-over-row difference: each row's sales minus the row directly above it
# (diff's `periods` defaults to 1). The first row has no predecessor and yields NaN.
tb['diff'] = tb['销量'].diff()
tb

Unnamed: 0_level_0,销量,上期销量,销量增长率,diff
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,6961,,,
2021-01-02,2047,6961.0,-70.59%,-4914.0
2021-01-03,4205,2047.0,105.42%,2158.0
2021-01-05,8988,,,4783.0
2021-01-06,6658,8988.0,-25.92%,-2330.0
...,...,...,...,...
2021-06-26,5822,6863.0,-15.17%,-1041.0
2021-06-27,4317,5822.0,-25.85%,-1505.0
2021-06-28,4971,4317.0,15.15%,654.0
2021-06-29,3742,4971.0,-24.72%,-1229.0


## pct_change函数
- 按日期偏移（`freq='D'`）后计算变化百分比：(当日销量 − 前一日销量) / 前一日销量；前一日无记录时结果为 NaN

In [25]:
# Percentage change against the value one calendar day earlier (freq='D'),
# expressed as a fraction rather than a formatted percentage.
tb['pct_change'] = tb['销量'].pct_change(
    freq='D',
    periods=1,
)
tb

Unnamed: 0_level_0,销量,上期销量,销量增长率,diff,pct_change
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-01,6961,,,,
2021-01-02,2047,6961.0,-70.59%,-4914.0,-0.705933
2021-01-03,4205,2047.0,105.42%,2158.0,1.054226
2021-01-05,8988,,,4783.0,
2021-01-06,6658,8988.0,-25.92%,-2330.0,-0.259235
...,...,...,...,...,...
2021-06-26,5822,6863.0,-15.17%,-1041.0,-0.151683
2021-06-27,4317,5822.0,-25.85%,-1505.0,-0.258502
2021-06-28,4971,4317.0,15.15%,654.0,0.151494
2021-06-29,3742,4971.0,-24.72%,-1229.0,-0.247234


## rolling函数
- 滚动计算：计算最近3条记录（含当前行）的平均销量；窗口按行数而非自然日计算，不足3条时按已有记录求平均

In [28]:
# Rolling mean over the current row and the two rows before it.
# Note: an integer window counts rows, not calendar days, so gaps in the dates are
# not taken into account. min_periods=1 lets the first rows average fewer values.
tb['近3天平均销量'] = (
    tb['销量']
    .rolling(3, min_periods=1)
    .mean()
)
tb

Unnamed: 0_level_0,销量,上期销量,销量增长率,diff,pct_change,近3天平均销量
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-01,6961,,,,,6961.000000
2021-01-02,2047,6961.0,-70.59%,-4914.0,-0.705933,4504.000000
2021-01-03,4205,2047.0,105.42%,2158.0,1.054226,4404.333333
2021-01-05,8988,,,4783.0,,5080.000000
2021-01-06,6658,8988.0,-25.92%,-2330.0,-0.259235,6617.000000
...,...,...,...,...,...,...
2021-06-26,5822,6863.0,-15.17%,-1041.0,-0.151683,4787.333333
2021-06-27,4317,5822.0,-25.85%,-1505.0,-0.258502,5667.333333
2021-06-28,4971,4317.0,15.15%,654.0,0.151494,5036.666667
2021-06-29,3742,4971.0,-24.72%,-1229.0,-0.247234,4343.333333


# 数据切分

In [30]:
# Load the score sheet (student ID / score columns) and preview the first rows.
# This rebinds `tb`, which previously held the sales table.
tb = pd.read_excel(
    './data/demo_04.xlsx',
    sheet_name='Sheet2',
)
tb.head()

Unnamed: 0,学号,成绩
0,A001,93
1,A002,35
2,A003,54
3,A004,76
4,A005,53


## cut函数

### 指定箱数，等距分箱
- 10组

In [33]:
# pd.cut parameters:
#   x      - the values to bin
#   bins   - an integer requests that many equal-width bins
#   right  - False: left-closed, right-open intervals; True: left-open, right-closed
#   labels - optional names for the bins (omitted here, so intervals are shown)
tb['等距分箱'] = pd.cut(tb['成绩'], bins=10, right=False)
tb['等距分箱'].value_counts()

[36.4, 48.2)        17
[107.2, 119.118)    15
[1.0, 12.8)         11
[24.6, 36.4)        10
[71.8, 83.6)        10
[60.0, 71.8)         9
[95.4, 107.2)        9
[48.2, 60.0)         7
[12.8, 24.6)         6
[83.6, 95.4)         6
Name: 等距分箱, dtype: int64

### 指定区间分箱
- [0, 60, 80, 100, 120]

In [37]:
# Explicit left-closed bins [0, 60), [60, 80), [80, 100), [100, 120), labelled in
# that order. With right=False the upper edge 120 itself is not included in any bin.
tb['指定区间分箱'] = pd.cut(
    tb['成绩'],
    [0, 60, 80, 100, 120],
    labels=['不及格', '一般', '良好', '优秀'],
    right=False,
)
tb['指定区间分箱'].value_counts()

不及格    51
优秀     22
一般     17
良好     10
Name: 指定区间分箱, dtype: int64

## qcut函数

### 指定分位数，等频分箱
- 4组

In [39]:
# q=4 cuts at the quartiles so each bin holds about the same number of rows.
# The resulting intervals are left-open, right-closed.
tb['等频分箱'] = pd.qcut(tb['成绩'], 4)
tb['等频分箱'].value_counts()
# To check a bin edge by hand, compare with tb['成绩'].quantile(0.25).

(0.999, 34.75]    25
(34.75, 55.5]     25
(55.5, 94.25]     25
(94.25, 119.0]    25
Name: 等频分箱, dtype: int64

### 指定分位数区间分箱
- [0, 0.3, 0.5, 0.75, 1]

In [46]:
# Bin edges are placed at the 0%, 30%, 50%, 75% and 100% quantiles of the scores,
# so the bins deliberately hold different shares of the rows.
tb['指定分位数区间分箱'] = pd.qcut(tb['成绩'], [0, 0.3, 0.5, 0.75, 1])
tb['指定分位数区间分箱'].value_counts()
# To check a bin edge by hand, compare with tb['成绩'].quantile(0.75).

(0.999, 38.7]     30
(55.5, 94.25]     25
(94.25, 119.0]    25
(38.7, 55.5]      20
Name: 指定分位数区间分箱, dtype: int64