In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

###  也可以通过cut方法
pandas.cut使用总结

用途

pandas.cut用来把一组数据分割成离散的区间。比如有一组年龄数据，可以使用pandas.cut将年龄数据分割成不同的年龄段并打上标签。

`pd.cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = 'raise',
)`

参数含义

- x：被切分的类数组（array-like）数据，必须是1维的（不能用DataFrame）；
- bins：bins是被切割后的区间（或者叫“桶”、“箱”、“面元”），有3种形式：一个int型的标量、标量序列（数组）或者pandas.IntervalIndex 。

    一个int型的标量
    当bins为一个int型的标量时，代表将x平分成bins份。x的范围在每侧扩展0.1%，以包括x的最大值和最小值。
    标量序列
    标量序列定义了被分割后每一个bin的区间边缘，此时x没有扩展。
    pandas.IntervalIndex
    定义要使用的精确区间。

- right：bool型参数，默认为True，表示是否包含区间右部。比如如果bins=[1,2,3]，right=True，则区间为(1,2]，(2,3]；right=False，则区间为[1,2)，[2,3)（此时变为左闭右开）。
- labels：给分割后的bins打标签，比如把年龄x分割成年龄段bins后，可以给年龄段打上诸如青年、中年的标签。labels的长度必须和划分后的区间长度相等，比如bins=[1,2,3]，划分后有2个区间(1,2]，(2,3]，则labels的长度必须为2。如果指定labels=False，则返回x中的数据在第几个bin中（从0开始）。
- retbins：bool型的参数，表示是否将分割后的bins返回，当bins为一个int型的标量时比较有用，这样可以得到划分后的区间，默认为False。
- precision：保留区间小数点的位数，默认为3.
- include_lowest：bool型的参数，表示第一个区间的左端点是否包含在内（即最左边是闭还是开），默认为False，也就是不包含第一个区间的左端点（开）。
- duplicates：当bins的边界值出现重复时的处理方式。有两种选择：raise：抛出异常（默认），drop：丢弃重复的边界值后继续切分

返回值

- out：一个pandas.Categorical, Series或者ndarray类型的值，代表分区后x中的每个值在哪个bin（区间）中，如果指定了labels，则返回对应的label。
- bins：分隔后的区间，当指定retbins为True时返回。

In [None]:
# Sample ages to be binned
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])
# An integer `bins` asks pd.cut for that many equal-width intervals
pd.cut(ages, bins=5)

可以看到ages被平分成5个区间，且区间两边都有扩展以包含最大值和最小值。

#### 将ages平分成5个区间并指定labels

In [None]:
# Age data
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])
# Five equal-width bins, one label per bin
pd.cut(
    ages,
    bins=5,
    labels=["婴儿", "青年", "中年", "壮年", "老年"],
)

#### 给ages指定区间进行分割

In [None]:
# Age data
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])
# A sequence of bin edges defines the intervals explicitly
pd.cut(ages, bins=[0, 5, 20, 30, 50, 100])

In [None]:
# Age data
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])
# Explicit bin edges, with one label per resulting interval
pd.cut(
    ages,
    bins=[0, 5, 20, 30, 50, 100],
    labels=["婴儿", "青年", "中年", "壮年", "老年"],
)

这里不再平分ages，而是将ages分为了5个区间(0, 5],(5, 20],(20, 30],(30,50],(50,100].

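#### 返回分割后的bins
- 令retbins=True即可，此时返回值是 (out, bins) 元组，例如 `out, bins = pd.cut(ages, [0,5,20,30,50,100], retbins=True)`
- 注意：下面的单元格没有传入retbins=True，它演示的是返回值out本身的内容及其类型（Categorical）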

In [58]:
# Age data
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])
# Bin with explicit edges and attach a label to every interval
s = pd.cut(
    ages,
    bins=[0, 5, 20, 30, 50, 100],
    labels=["婴儿", "青年", "中年", "壮年", "老年"],
)
# Show the binned values, then the type of the returned object
print(s)
print(type(s))

[婴儿, 婴儿, 青年, 壮年, 壮年, ..., 青年, 青年, 中年, 中年, 壮年]
Length: 16
Categories (5, object): [婴儿 < 青年 < 中年 < 壮年 < 老年]
<class 'pandas.core.arrays.categorical.Categorical'>


In [61]:
# Inspect the type of `s`, the object returned by pd.cut
type(s)

pandas.core.arrays.categorical.Categorical

AttributeError: 'Categorical' object has no attribute 'cat'

AttributeError: 'Categorical' object has no attribute 'cat'

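#### 只返回x中的数据在哪个bin
令labels=False即可，例如 `pd.cut(ages, [0,5,20,30,50,100], labels=False)` 会返回每个值所在bin的序号（从0开始）。

注意：下面的单元格使用的是retbins=True，取返回元组的第0项（out），再用value_counts()统计每个区间内的数据个数。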

In [26]:
# Age data
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])
# retbins=True makes pd.cut return a pair: (binned values, bin edges)
binned_ages, bin_edges = pd.cut(ages, bins=[0, 5, 20, 30, 50, 100], retbins=True)
# Count how many ages fall into each interval
binned_ages.value_counts()

(0, 5]       2
(5, 20]      4
(20, 30]     2
(30, 50]     3
(50, 100]    5
dtype: int64

## 表格视觉样式：Dataframe.style

### 1,表格样式创建

表格视觉样式：Dataframe.style → 返回pandas.Styler对象的属性，具有格式化和显示Dataframe的有用方法

样式创建：

    ① Styler.applymap：→ 按元素方式处理Dataframe
    ② Styler.apply：column- / row- / table-wise → 按行/列处理Dataframe

In [28]:
df = pd.DataFrame(np.random.randn(10, 4), columns=['a','b','c','d'])
sty = df.style
print(sty, type(sty)) # 查看样式类型

<pandas.io.formats.style.Styler object at 0x0000027A8E558A48> <class 'pandas.io.formats.style.Styler'>


#### 按元素处理样式 df.style.applymap( 函数 )

In [31]:
def color_neg_red(val):
    """Return the CSS declarations used to render a single cell value.

    Values below zero get red 14px text; every other value gets blue
    10px text.
    """
    color, size = ('red', "14px") if val < 0 else ('blue', "10px")
    return 'color:%s;font-size:%s' % (color, size)
# Example of an inline style attribute: style="color:red;font-size:12px;"
df.style.applymap(color_neg_red)
# Builds the style so that values below 0 are shown in red
# style.applymap() -> calls the passed-in function automatically

Unnamed: 0,a,b,c,d
0,-1.34451,0.118692,0.516458,-0.291674
1,-1.962929,-1.793657,0.649999,1.074508
2,-2.141306,-0.55063,-1.335991,-0.795388
3,-0.789197,-2.176502,0.096434,-1.14282
4,-0.964955,-0.044893,-0.401712,0.363618
5,-1.445998,0.659053,0.361913,1.187481
6,-1.15963,-0.313681,0.183981,1.017073
7,-0.569298,1.330518,0.220771,0.371379
8,1.448819,-0.240279,-0.758573,1.200765
9,0.179508,-0.779373,-0.521244,0.358309


#### 按行/列处理样式 df.style.apply( 函数, axis=0按列, subset=['b','c']处理b、c列 )

In [32]:
# Display the DataFrame itself before the next styling example
df

Unnamed: 0,a,b,c,d
0,-1.34451,0.118692,0.516458,-0.291674
1,-1.962929,-1.793657,0.649999,1.074508
2,-2.141306,-0.55063,-1.335991,-0.795388
3,-0.789197,-2.176502,0.096434,-1.14282
4,-0.964955,-0.044893,-0.401712,0.363618
5,-1.445998,0.659053,0.361913,1.187481
6,-1.15963,-0.313681,0.183981,1.017073
7,-0.569298,1.330518,0.220771,0.371379
8,1.448819,-0.240279,-0.758573,1.200765
9,0.179508,-0.779373,-0.521244,0.358309


In [40]:
# Row/column-wise styling: style.apply()
def highlight_max(s):
    """Return one CSS string per element of ``s``, marking the maximum.

    Parameters
    ----------
    s : pandas.Series
        The values to inspect (one column or row when used with
        ``Styler.apply``).

    Returns
    -------
    list of str
        ``'color: red; line-height: 1.5;'`` at every position whose value
        equals ``s.max()`` and ``''`` everywhere else.
    """
    # Boolean mask: True where the element equals the maximum of `s`
    is_max = s == s.max()
    # The original literal ended in a stray `">` (HTML residue), which made
    # the CSS declaration malformed; only valid CSS is emitted here.
    return ['color: red; line-height: 1.5;' if flag else '' for flag in is_max]
df.style.apply(highlight_max, axis=0, subset=['b', 'c']) # axis: 0 = column-wise, 1 = row-wise, defaults to 0;  # subset: which columns to style
# Builds the style that marks the maximum of each selected column using highlight_max

Unnamed: 0,a,b,c,d
0,-1.34451,0.118692,0.516458,-0.291674
1,-1.962929,-1.793657,0.649999,1.074508
2,-2.141306,-0.55063,-1.335991,-0.795388
3,-0.789197,-2.176502,0.096434,-1.14282
4,-0.964955,-0.044893,-0.401712,0.363618
5,-1.445998,0.659053,0.361913,1.187481
6,-1.15963,-0.313681,0.183981,1.017073
7,-0.569298,1.330518,0.220771,0.371379
8,1.448819,-0.240279,-0.758573,1.200765
9,0.179508,-0.779373,-0.521244,0.358309


### 2.表格显示控制

In [None]:
# Render the rows returned by head() with every value formatted as a percentage (2 decimals)
df.head().style.format("{:.2%}")

按照百分数显示

In [42]:
# New 10x4 frame of standard-normal samples for the formatting examples
df = pd.DataFrame(np.random.randn(10, 4), columns=list('abcd'))
df

Unnamed: 0,a,b,c,d
0,-0.182593,-0.296072,-0.392444,-1.239522
1,-1.264622,2.998861,0.441914,0.592867
2,-1.073607,-0.213873,1.027618,0.437034
3,-1.785293,1.102659,1.580576,0.821845
4,-0.772845,-0.191382,2.163628,-0.503238
5,0.091075,-0.550434,-0.683868,0.941302
6,0.530819,1.580821,-0.940211,2.07582
7,0.09764,-0.023569,-2.236411,-0.794992
8,1.202674,-1.211373,-0.844227,1.080723
9,-0.189341,-0.85219,2.630941,0.446207


In [51]:
# Render every value of the frame as a percentage with 2 decimals
df.style.format("{:.2%}")

Unnamed: 0,a,b,c,d
0,-18.26%,-29.61%,-39.24%,-123.95%
1,-126.46%,299.89%,44.19%,59.29%
2,-107.36%,-21.39%,102.76%,43.70%
3,-178.53%,110.27%,158.06%,82.18%
4,-77.28%,-19.14%,216.36%,-50.32%
5,9.11%,-55.04%,-68.39%,94.13%
6,53.08%,158.08%,-94.02%,207.58%
7,9.76%,-2.36%,-223.64%,-79.50%
8,120.27%,-121.14%,-84.42%,108.07%
9,-18.93%,-85.22%,263.09%,44.62%


In [49]:
# Plain str.format with the same spec: '%' multiplies by 100 and appends a percent sign
'{:.2%}'.format(0.23124)

'23.12%'

In [50]:
# Fixed-point formatting with 2 decimals, for comparison with the percent spec
'{:.2f}'.format(0.23124)

'0.23'

In [52]:
# Render the rows returned by head() with 4 decimal places
df.head().style.format("{:.4f}")

Unnamed: 0,a,b,c,d
0,-0.1826,-0.2961,-0.3924,-1.2395
1,-1.2646,2.9989,0.4419,0.5929
2,-1.0736,-0.2139,1.0276,0.437
3,-1.7853,1.1027,1.5806,0.8218
4,-0.7728,-0.1914,2.1636,-0.5032


In [53]:
# Show an explicit sign on both positive and negative numbers ('+' flag), 2 decimals
df.head().style.format("{:+.2f}")

Unnamed: 0,a,b,c,d
0,-0.18,-0.3,-0.39,-1.24
1,-1.26,3.0,0.44,0.59
2,-1.07,-0.21,1.03,0.44
3,-1.79,1.1,1.58,0.82
4,-0.77,-0.19,2.16,-0.5


In [56]:
# Per-column formatting: a dict maps column name -> format spec; column 'a' is not listed
df.head().style.format({'b':"{:.2%}", 'c':"{:+.3f}", 'd':"{:.3f}"})

Unnamed: 0,a,b,c,d
0,-0.182593,-29.61%,-0.392,-1.24
1,-1.264622,299.89%,0.442,0.593
2,-1.073607,-21.39%,1.028,0.437
3,-1.785293,110.27%,1.581,0.822
4,-0.772845,-19.14%,2.164,-0.503


### 3.表格样式调用

Styler内置样式调用

In [64]:
# Build a 5x4 frame of uniform samples that contains one missing value
df = pd.DataFrame(np.random.rand(5, 4), columns=list('ABCD'))
# Assign through .loc instead of chained indexing (df['A'][2] = ...):
# the chained form may write to a temporary copy and is a no-op under
# pandas Copy-on-Write.
df.loc[2, 'A'] = np.nan
df

Unnamed: 0,A,B,C,D
0,0.386606,0.261133,0.885586,0.219028
1,0.390901,0.725192,0.67687,0.992836
2,,0.970077,0.970746,0.691652
3,0.479818,0.674035,0.859419,0.042316
4,0.79733,0.385441,0.991158,0.575056


In [65]:
# Highlight missing values in red.
# The colour is passed positionally because the keyword changed between
# pandas versions (`null_color` in older releases, `color` in newer ones,
# where `null_color` was removed); the first positional slot is the colour
# in both signatures.
df.style.highlight_null('red')

Unnamed: 0,A,B,C,D
0,0.386606,0.261133,0.885586,0.219028
1,0.390901,0.725192,0.67687,0.992836
2,,0.970077,0.970746,0.691652
3,0.479818,0.674035,0.859419,0.042316
4,0.79733,0.385441,0.991158,0.575056


#####  色彩映射
`df.style.background_gradient(cmap='Greens',axis =1,low=0,high=1)` 色彩映射

In [71]:
# cmap: name of the colormap used for the gradient
# axis: direction over which the gradient is computed; axis=0 shades each column
#       on its own, axis=1 (used here) shades each row on its own
df = pd.DataFrame(np.random.rand(10,4),columns = list('ABCD'))
df.style.background_gradient(cmap='Blues',axis =1,low=0,high=1)

Unnamed: 0,A,B,C,D
0,0.006126,0.688378,0.367284,0.642712
1,0.886896,0.777959,0.531775,0.656683
2,0.465746,0.963511,0.555139,0.015873
3,0.182589,0.326768,0.930785,0.093136
4,0.693966,0.529737,0.923591,0.644901
5,0.492548,0.982571,0.436458,0.002073
6,0.023555,0.659608,0.720614,0.84994
7,0.27203,0.918795,0.75758,0.558475
8,0.76955,0.851667,0.87108,0.166367
9,0.235141,0.948054,0.462106,0.763764


#### 条形图

In [72]:
# 10x4 frame of uniform samples for the bar-chart example
df = pd.DataFrame(np.random.rand(10, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0.641352,0.952104,0.276578,0.003305
1,0.722114,0.085179,0.379404,0.809537
2,0.278028,0.0819,0.930282,0.307977
3,0.634725,0.391326,0.707263,0.890953
4,0.203488,0.380589,0.818588,0.838789
5,0.513325,0.812863,0.598572,0.717225
6,0.869808,0.117985,0.72491,0.469613
7,0.412615,0.575243,0.019652,0.744284
8,0.990913,0.593476,0.419247,0.355928
9,0.995951,0.973167,0.743884,0.763119


In [73]:
# Draw in-cell bars for columns A and B
df.style.bar(subset=['A', 'B'], color='blue', width=100) # width: share of the cell taken up by the longest bar

Unnamed: 0,A,B,C,D
0,0.641352,0.952104,0.276578,0.003305
1,0.722114,0.085179,0.379404,0.809537
2,0.278028,0.0819,0.930282,0.307977
3,0.634725,0.391326,0.707263,0.890953
4,0.203488,0.380589,0.818588,0.838789
5,0.513325,0.812863,0.598572,0.717225
6,0.869808,0.117985,0.72491,0.469613
7,0.412615,0.575243,0.019652,0.744284
8,0.990913,0.593476,0.419247,0.355928
9,0.995951,0.973167,0.743884,0.763119


In [None]:
# 分段式构建样式
df = pd.DataFrame(np.random.rand(10,4),columns = list('ABCD'))
df['A'][[3,2]] = np.nan
df.style.bar(subset=['A', 'B'], color='#d65f5f', width=100).highlight_null(null_color='yellow')