In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

###  也可以通过cut方法
pandas.cut使用总结

用途

pandas.cut用来把一组数据分割成离散的区间。比如有一组年龄数据，可以使用pandas.cut将年龄数据分割成不同的年龄段并打上标签。

`pd.cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = 'raise',
)`

参数含义

- x：被切分的类数组（array-like）数据，必须是1维的（不能用DataFrame）；
- bins：bins是被切割后的区间（或者叫“桶”、“箱”、“面元”），有3种形式：一个int型的标量、标量序列（数组）或者pandas.IntervalIndex 。

    一个int型的标量
    当bins为一个int型的标量时，代表将x平分成bins份。x的范围在每侧扩展0.1%，以包括x的最大值和最小值。
    标量序列
    标量序列定义了被分割后每一个bin的区间边缘，此时x没有扩展。
    pandas.IntervalIndex
    定义要使用的精确区间。

- right：bool型参数，默认为True，表示区间是否包含右端点。比如如果bins=[1,2,3]，right=True，则区间为(1,2]，(2,3]；right=False，则区间为[1,2)，[2,3)。
- labels：给分割后的bins打标签，比如把年龄x分割成年龄段bins后，可以给年龄段打上诸如青年、中年的标签。labels的长度必须和划分后的区间长度相等，比如bins=[1,2,3]，划分后有2个区间(1,2]，(2,3]，则labels的长度必须为2。如果指定labels=False，则返回x中的数据在第几个bin中（从0开始）。
- retbins：bool型的参数，表示是否将分割后的bins返回，当bins为一个int型的标量时比较有用，这样可以得到划分后的区间，默认为False。
- precision：保留区间小数点的位数，默认为3.
- include_lowest：bool型的参数，表示第一个区间的左端点是否闭合（即是否包含最小的边界值），默认为False，也就是不包含第一个区间的左端点（开）。
- duplicates：当bins的边界值不唯一（有重复）时的处理方式。有两种选择：raise：抛出ValueError（默认）；drop：丢弃重复的边界值后继续分割。

返回值

- out：一个pandas.Categorical, Series或者ndarray类型的值，代表分区后x中的每个值在哪个bin（区间）中，如果指定了labels，则返回对应的label。
- bins：分隔后的区间，当指定retbins为True时返回。

In [3]:
# Sample ages reused by the pd.cut examples.
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])
# An integer `bins` splits the value range into that many equal-width intervals.
pd.cut(ages, bins=5)

[(0.901, 20.8], (0.901, 20.8], (0.901, 20.8], (20.8, 40.6], (20.8, 40.6], ..., (0.901, 20.8], (0.901, 20.8], (20.8, 40.6], (20.8, 40.6], (20.8, 40.6]]
Length: 16
Categories (5, interval[float64]): [(0.901, 20.8] < (20.8, 40.6] < (40.6, 60.4] < (60.4, 80.2] < (80.2, 100.0]]

可以看到ages被平分成5个区间；由于区间默认左开右闭，最左侧区间的左端点被向外扩展了一点（0.901 < 1），以保证最小值1也落在区间内，而最大值100本身就包含在最后一个区间的右端点上。

#### 将ages平分成5个区间并指定labels

In [4]:
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])  # age data
# One label per equal-width bin, ordered from the lowest bin to the highest.
stage_labels = ["婴儿", "青年", "中年", "壮年", "老年"]
pd.cut(ages, bins=5, labels=stage_labels)

[婴儿, 婴儿, 婴儿, 青年, 青年, ..., 婴儿, 婴儿, 青年, 青年, 青年]
Length: 16
Categories (5, object): [婴儿 < 青年 < 中年 < 壮年 < 老年]

#### 给ages指定区间进行分割

In [5]:
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])  # age data
# A sequence of edges defines the bins explicitly: (0, 5], (5, 20], (20, 30], ...
bin_edges = [0, 5, 20, 30, 50, 100]
pd.cut(ages, bins=bin_edges)

[(0, 5], (0, 5], (5, 20], (30, 50], (30, 50], ..., (5, 20], (5, 20], (20, 30], (20, 30], (30, 50]]
Length: 16
Categories (5, interval[int64]): [(0, 5] < (5, 20] < (20, 30] < (30, 50] < (50, 100]]

In [6]:
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])  # age data
# Six edges give five bins, so exactly five labels are supplied.
bin_edges = [0, 5, 20, 30, 50, 100]
stage_labels = ["婴儿", "青年", "中年", "壮年", "老年"]
pd.cut(ages, bins=bin_edges, labels=stage_labels)

[婴儿, 婴儿, 青年, 壮年, 壮年, ..., 青年, 青年, 中年, 中年, 壮年]
Length: 16
Categories (5, object): [婴儿 < 青年 < 中年 < 壮年 < 老年]

这里不再平分ages，而是将ages分为了5个区间(0, 5],(5, 20],(20, 30],(30,50],(50,100].

#### 返回分割后的bins
- 令retbins=True即可

In [7]:
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])  # age data
bin_edges = [0, 5, 20, 30, 50, 100]
stage_labels = ["婴儿", "青年", "中年", "壮年", "老年"]
# With retbins=True, pd.cut returns a tuple: (binned values, bin edges).
s = pd.cut(ages, bins=bin_edges, labels=stage_labels, retbins=True)
print(s)
print(type(s))

([婴儿, 婴儿, 青年, 壮年, 壮年, ..., 青年, 青年, 中年, 中年, 壮年]
Length: 16
Categories (5, object): [婴儿 < 青年 < 中年 < 壮年 < 老年], array([  0,   5,  20,  30,  50, 100]))
<class 'tuple'>


In [8]:
# s[0] is the first element of the tuple returned by pd.cut(..., retbins=True).
age_groups = s[0]
print(age_groups.value_counts())
print(type(age_groups))

婴儿    2
青年    4
中年    2
壮年    3
老年    5
dtype: int64
<class 'pandas.core.arrays.categorical.Categorical'>


In [9]:
# s[1] is the second element of the tuple returned by pd.cut(..., retbins=True): the bin edges.
returned_edges = s[1]
print(returned_edges)

[  0   5  20  30  50 100]


#### 只返回x中的数据在哪个bin
令labels=False即可

In [10]:
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])  # age data
# labels=False returns the 0-based index of the bin each value falls into.
bin_edges = [0, 5, 20, 30, 50, 100]
pd.cut(ages, bins=bin_edges, labels=False)

array([0, 0, 1, 3, 3, 1, 4, 4, 4, 4, 4, 1, 1, 2, 2, 3], dtype=int64)

## 表格视觉样式：Dataframe.style

### 1.表格样式创建

表格视觉样式：Dataframe.style → 返回pandas.Styler对象的属性，具有格式化和显示Dataframe的有用方法

样式创建：

    ① Styler.applymap：→ 按元素方式处理Dataframe
    ② Styler.apply：column- / row- / table-wise → 按行/列处理Dataframe

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.DataFrame(np.random.randn(10, 4), columns=['a','b','c','d'])
sty = df.style
print(sty, type(sty)) # 查看样式类型

<pandas.io.formats.style.Styler object at 0x00000209AFCA3CC8> <class 'pandas.io.formats.style.Styler'>


#### 按元素处理样式 df.style.applymap( 函数 )

In [3]:
def color_neg_red(val):
    """Return a CSS `color` declaration for one cell value.

    Values below zero map to red; everything else maps to black.
    """
    return 'color:%s' % ('red' if val < 0 else 'black')
# Style the frame so that negative numbers are drawn in red.
# Styler.applymap() calls the given function for each cell.
styler = df.style
styler.applymap(color_neg_red)

Unnamed: 0,a,b,c,d
0,0.447798,-0.534326,-0.614525,-0.538871
1,-0.612528,-1.982758,-1.255578,0.151178
2,0.386986,-1.344207,-0.659319,0.578583
3,0.867958,-0.7152,0.639464,-1.346157
4,-2.108436,0.528856,1.194276,-1.169152
5,-0.552108,0.117717,-1.118881,-2.651756
6,-0.395649,-0.367974,-0.981922,0.582689
7,0.14431,1.064642,-0.384505,-0.632541
8,-1.566459,-0.322027,0.972508,-0.080237
9,-1.680165,-0.278028,1.195218,-0.813649


#### 按行/列处理样式 df.style.apply( 函数, axis=0按列, subset=['b','c']处理b、c列 )

In [5]:
# Row-/column-wise styling with Styler.apply()
def highlight_max(s):
    """Return one CSS string per element of `s`, highlighting the maximum.

    Parameters
    ----------
    s : pandas.Series
        One column (or row) handed over by ``Styler.apply``.

    Returns
    -------
    list of str
        ``'background-color: yellow'`` where the element equals ``s.max()``
        (ties are all highlighted) and ``''`` (no styling) elsewhere.
    """
    is_max = s == s.max()
    # Kept on purpose: the printed mask shows that the function is invoked
    # once per selected column/row.
    print(is_max)
    # The previous style string was HTML-scrape residue ending in '">', which
    # is not valid CSS; use the intended yellow fill.
    return ['background-color: yellow' if flag else '' for flag in is_max]
# Highlight the maximum of each selected column.
# axis: 0 applies the function per column, 1 per row (default 0).
# subset: restricts styling to the listed columns.
df.style.apply(highlight_max, subset=['b', 'c'], axis=0)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8    False
9    False
Name: b, dtype: bool
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9     True
Name: c, dtype: bool


Unnamed: 0,a,b,c,d
0,0.447798,-0.534326,-0.614525,-0.538871
1,-0.612528,-1.982758,-1.255578,0.151178
2,0.386986,-1.344207,-0.659319,0.578583
3,0.867958,-0.7152,0.639464,-1.346157
4,-2.108436,0.528856,1.194276,-1.169152
5,-0.552108,0.117717,-1.118881,-2.651756
6,-0.395649,-0.367974,-0.981922,0.582689
7,0.14431,1.064642,-0.384505,-0.632541
8,-1.566459,-0.322027,0.972508,-0.080237
9,-1.680165,-0.278028,1.195218,-0.813649


### 2.表格显示控制

In [8]:
# Render the first five rows as percentages with two decimal places.
first_rows = df.head()
first_rows.style.format("{:.2%}")

Unnamed: 0,a,b,c,d
0,44.78%,-53.43%,-61.45%,-53.89%
1,-61.25%,-198.28%,-125.56%,15.12%
2,38.70%,-134.42%,-65.93%,57.86%
3,86.80%,-71.52%,63.95%,-134.62%
4,-210.84%,52.89%,119.43%,-116.92%


按照百分数显示

In [9]:
df = pd.DataFrame(np.random.randn(10,4),columns=['a','b','c','d'])
print(df.head())
df.head().style.format("{:.2%}")

          a         b         c         d
0 -0.340871 -0.825292  0.907506  0.805764
1  1.435370 -0.254078 -0.174431  0.060038
2  0.799676  0.324985  0.463950  1.922623
3 -0.460490  0.461384  0.218530  0.846301
4  1.138265 -0.530664  1.507698 -0.215005


Unnamed: 0,a,b,c,d
0,-34.09%,-82.53%,90.75%,80.58%
1,143.54%,-25.41%,-17.44%,6.00%
2,79.97%,32.50%,46.39%,192.26%
3,-46.05%,46.14%,21.85%,84.63%
4,113.83%,-53.07%,150.77%,-21.50%


In [10]:
# Fixed-point display with four decimal places.
first_rows = df.head()
first_rows.style.format("{:.4f}")

Unnamed: 0,a,b,c,d
0,-0.3409,-0.8253,0.9075,0.8058
1,1.4354,-0.2541,-0.1744,0.06
2,0.7997,0.325,0.4639,1.9226
3,-0.4605,0.4614,0.2185,0.8463
4,1.1383,-0.5307,1.5077,-0.215


In [11]:
# Always show the sign (+ or -), with two decimal places.
first_rows = df.head()
first_rows.style.format("{:+.2f}")

Unnamed: 0,a,b,c,d
0,-0.34,-0.83,0.91,0.81
1,1.44,-0.25,-0.17,0.06
2,0.8,0.32,0.46,1.92
3,-0.46,0.46,0.22,0.85
4,1.14,-0.53,1.51,-0.22


In [12]:
# Per-column formatting: a dict maps column name -> format string;
# columns not listed keep the default rendering.
column_formats = {'b': "{:.2%}", 'c': "{:+.3f}", 'd': "{:.3f}"}
df.head().style.format(column_formats)

Unnamed: 0,a,b,c,d
0,-0.340871,-82.53%,0.908,0.806
1,1.43537,-25.41%,-0.174,0.06
2,0.799676,32.50%,0.464,1.923
3,-0.46049,46.14%,0.219,0.846
4,1.138265,-53.07%,1.508,-0.215


### 3.表格样式调用

Styler内置样式调用

In [15]:
# 定位空值
df = pd.DataFrame(np.random.rand(5, 4), columns=list('ABCD'))
df['A'][2] = np.nan

df.style.highlight_null(null_color='red')

Unnamed: 0,A,B,C,D
0,0.718534,0.897299,0.862468,0.148617
1,0.076916,0.731825,0.430534,0.745501
2,,0.106515,0.380902,0.724866
3,0.064327,0.345559,0.961766,0.080762
4,0.419344,0.875916,0.467781,0.027139


#### 色彩映射
`df.style.background_gradient(cmap='Greens',axis =1,low=0,high=1)` 色彩映射

In [16]:
df = pd.DataFrame(np.random.rand(10,4),columns = list('ABCD'))
df.style.background_gradient(cmap='Greens',axis =1,low=0,high=1) # cmap：颜色; # axis：映射参考，0为行，1以列

Unnamed: 0,A,B,C,D
0,0.454667,0.895418,0.522539,0.668088
1,0.915492,0.063815,0.129173,0.232624
2,0.31813,0.901456,0.529192,0.540301
3,0.520605,0.893578,0.873407,0.365717
4,0.985473,0.376337,0.555237,0.547585
5,0.138856,0.527383,0.863354,0.05481
6,0.878319,0.889584,0.069524,0.850255
7,0.368717,0.31761,0.180074,0.875139
8,0.304992,0.290734,0.614449,0.662185
9,0.17826,0.028863,0.307339,0.786306


#### 条形图

In [17]:
df = pd.DataFrame(np.random.rand(10,4),columns = list('ABCD'))
df.style.bar(subset=['A', 'B'], color='#d65f5f', width=100) # width：最长长度在格子的占比

Unnamed: 0,A,B,C,D
0,0.091162,0.659391,0.79389,0.453831
1,0.241611,0.774842,0.822867,0.250582
2,0.541411,0.551059,0.521213,0.529303
3,0.214128,0.274993,0.438851,0.158967
4,0.721968,0.987833,0.7708,0.077401
5,0.126808,0.599619,0.814085,0.454038
6,0.691255,0.626466,0.573361,0.629499
7,0.25534,0.376143,0.655323,0.574809
8,0.944275,0.564231,0.162931,0.559705
9,0.335346,0.219521,0.050439,0.731554


In [18]:
# 分段式构建样式
df = pd.DataFrame(np.random.rand(10,4),columns = list('ABCD'))
df['A'][[3,2]] = np.nan
df.style.bar(subset=['A', 'B'], color='#d65f5f', width=100).highlight_null(null_color='yellow')

Unnamed: 0,A,B,C,D
0,0.411905,0.965642,0.670273,0.085508
1,0.930053,0.538399,0.764947,0.219239
2,,0.805292,0.987287,0.285137
3,,0.473305,0.420893,0.677806
4,0.747552,0.854908,0.570274,0.730824
5,0.3811,0.405887,0.992994,0.771802
6,0.669553,0.403413,0.211431,0.096229
7,0.343851,0.890898,0.259756,0.585015
8,0.230246,0.903022,0.364885,0.290795
9,0.610535,0.700028,0.310637,0.062491
