In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 高级处理-数据离散化

## 读取股票的数据

In [2]:
p_change = pd.read_csv("../data/stock_day.csv", usecols=["p_change"])
p_change = p_change["p_change"]
p_change.head()

2018-02-27    2.68
2018-02-26    3.02
2018-02-23    2.42
2018-02-22    1.64
2018-02-14    2.05
Name: p_change, dtype: float64

## 将股票涨跌幅数据进行分组 qcut cut

### qcut 自行分组

In [3]:
# 自行分组
qcut = pd.qcut(p_change, 10)
qcut

2018-02-27    (1.738, 2.938]
2018-02-26     (2.938, 5.27]
2018-02-23    (1.738, 2.938]
2018-02-22     (0.94, 1.738]
2018-02-14    (1.738, 2.938]
                   ...      
2015-03-06     (5.27, 10.03]
2015-03-05    (1.738, 2.938]
2015-03-04     (0.94, 1.738]
2015-03-03     (0.94, 1.738]
2015-03-02    (1.738, 2.938]
Name: p_change, Length: 643, dtype: category
Categories (10, interval[float64, right]): [(-10.030999999999999, -4.836] < (-4.836, -2.444] < (-2.444, -1.352] < (-1.352, -0.462] ... (0.94, 1.738] < (1.738, 2.938] < (2.938, 5.27] < (5.27, 10.03]]

In [4]:
# 计算分到每个组数据个数
qcut.value_counts()

(-10.030999999999999, -4.836]    65
(-0.462, 0.26]                   65
(0.26, 0.94]                     65
(5.27, 10.03]                    65
(-4.836, -2.444]                 64
(-2.444, -1.352]                 64
(-1.352, -0.462]                 64
(1.738, 2.938]                   64
(2.938, 5.27]                    64
(0.94, 1.738]                    63
Name: p_change, dtype: int64

### cut 自定义区间

In [5]:
# 自己指定分组区间
bins = [-100, -7, -5, -3, 0, 3, 5, 7, 100]
cut = pd.cut(p_change, bins)
cut

2018-02-27      (0, 3]
2018-02-26      (3, 5]
2018-02-23      (0, 3]
2018-02-22      (0, 3]
2018-02-14      (0, 3]
                ...   
2015-03-06    (7, 100]
2015-03-05      (0, 3]
2015-03-04      (0, 3]
2015-03-03      (0, 3]
2015-03-02      (0, 3]
Name: p_change, Length: 643, dtype: category
Categories (8, interval[int64, right]): [(-100, -7] < (-7, -5] < (-5, -3] < (-3, 0] < (0, 3] < (3, 5] < (5, 7] < (7, 100]]

In [6]:
cut.value_counts()

(0, 3]        215
(-3, 0]       188
(3, 5]         57
(-5, -3]       51
(5, 7]         35
(7, 100]       35
(-100, -7]     34
(-7, -5]       28
Name: p_change, dtype: int64

# get_dummies 转换为one-hot编码

In [8]:
# 数据 前缀
pd.get_dummies(cut, prefix="rise")

Unnamed: 0,"rise_(-100, -7]","rise_(-7, -5]","rise_(-5, -3]","rise_(-3, 0]","rise_(0, 3]","rise_(3, 5]","rise_(5, 7]","rise_(7, 100]"
2018-02-27,0,0,0,0,1,0,0,0
2018-02-26,0,0,0,0,0,1,0,0
2018-02-23,0,0,0,0,1,0,0,0
2018-02-22,0,0,0,0,1,0,0,0
2018-02-14,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
2015-03-06,0,0,0,0,0,0,0,1
2015-03-05,0,0,0,0,1,0,0,0
2015-03-04,0,0,0,0,1,0,0,0
2015-03-03,0,0,0,0,1,0,0,0


In [19]:
data = np.array([[1., 2., 1.], ["OK", np.NaN, np.NaN]])

In [20]:
pd1 = pd.DataFrame(data.T, columns=["Room", "hasPeople"])
pd1

Unnamed: 0,Room,hasPeople
0,1.0,OK
1,2.0,
2,1.0,


In [21]:
pd.get_dummies(pd1, dummy_na=True)

Unnamed: 0,Room_1.0,Room_2.0,Room_nan,hasPeople_OK,hasPeople_nan,hasPeople_nan.1
0,1,0,0,1,0,0
1,0,1,0,0,1,0
2,1,0,0,0,1,0
