In [10]:
import pandas as pd
from IPython.display import display
import numpy as np

# pd.qcut()
```python
pd.qcut(
    x,
    q,
    labels=None,
    retbins=False,
    precision=3,
    duplicates='raise',
)
```

**Docstring**

基于分位数的离散函数，根据秩或样本分位数将变量离散到等长度的区间中，例如将 1000 个数值按 10 分位数离散，则对于每个数值将生成一个用于指明相应分位数的 Categorical 对象。该函数返回`out`和`bins`两个对象：
- `bins`只有在`retbins`为`True`时返回，当返回分类数据时，`bins`表示为类别，可能为区间或字符串或等等；
- 在`labels`为`False`时，`out`可能是`Categorical`、`Series`或整数数组，返回类型取决于输入，若输入是一个非`Categorical`的`Series`，则返回一个`category`类型的`Series`
在返回的Categorical中，那些超出边界的值默认为 NA

**Args**

- x: 1d ndarray or Series

- q: 整数或整数型类数组；分位数的个数，如 10 表示十分位数等；或为分位数数值组成的数组，如 [0, .25, .5, .75, 1.]

- labels: array或为`False`，作为生成的`bins`的标签；若为数组，则必须与生成的区间个数相等；若为假，则只返回`bins`的整数指示符；为真则会抛出异常

- retbins: bool, 是否返回`bins`和`labels`，常在`bins`被指明为标量时使用

- precision: int, 存储和显示`bins`和`labels`的精度

- duplicates: {'raise', 'drop'}, 当`bins`边界不唯一时，抛出 ValueError 或 drop non-uniques.



**Type**

function

### Examples

In [95]:
x = pd.DataFrame(np.random.randn(15, 6), columns=list("ABCDEF"))
x

Unnamed: 0,A,B,C,D,E,F
0,2.584777,-1.290384,-0.248667,0.836272,1.501872,0.2674
1,-1.167323,1.034637,-0.760066,0.938743,-0.622502,-0.624562
2,-0.715947,-0.61764,-0.520593,-0.551496,1.068541,0.265345
3,-0.404403,0.131049,-0.303876,0.392217,-1.226467,1.925887
4,-0.161279,-0.658885,0.102714,0.618697,0.302568,-0.0793
5,1.221462,-0.553506,0.808316,0.264278,-1.076214,0.864148
6,0.4312,-1.21507,1.109152,0.127649,1.251036,-0.828486
7,-0.133745,-0.461564,-0.76191,0.432395,-0.350802,-1.679404
8,1.088238,0.196809,-1.034181,1.732666,0.827203,-1.533395
9,0.142282,-0.142118,-0.676858,-0.156403,1.681078,-0.809715


In [None]:
y = pd.qcut(x=x["E"], q=10)
for j, yj in enumerate(y):
    print("the {}-th value in x, \nwhich is {}, \nis in the interval: {}\n".format(j, x[j], yj))
print()
print(y, end="\n\n")
y = pd.qcut(x=x["E"], q=4, labels=False)
print("labels=False时，只返回bins的整数指示符：", end=" ")
print(y, end="\n\n")

labels不同传参的效果

In [None]:
y = pd.qcut(x=x["E"], q=[0., 0.2, 0.5, 0.9, 1])
print(y, end="\n\n")
y = pd.qcut(x=x["E"], q=[0., 0.2, 0.5, 0.9, 1], labels=[4, 3, 2, 1])
print(y, end="\n\n")
y = pd.qcut(x=x["E"], q=[0., 0.2, 0.5, 0.9, 1], labels=False)
print(y, end="\n\n")

label仅仅是label，其排序按传参的列表排序设定，意义是人为赋予的

In [None]:
y = pd.qcut(x["E"], 4, labels=list("ydzk"))
print(y)

#  

#  

# pd.merge()
```python
pd.merge(
    left,
    right,
    how='inner',
    on=None,
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=False,
    suffixes=('_x', '_y'),
    copy=True,
    indicator=False,
    validate=None,
) -> 'DataFrame'
```
**Docstring**

使用数据库样式的连接合并 DataFrame 或被命名的 Series 对象


The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes will be ignored. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on.

**Args**

- left: DataFrame

- right: DataFrame or named Series

- how: 可以是`'left'`、`'right'`、`'outer'`、`'inner'`，合并的方式
    * left: 只使用`left`的键值，类似于 SQL 的 left outer join；保留键的顺序
    * right: 只使用`right`的键值，类似于 SQL 的 right outer join；保留键的顺序
    * outer: 使用`left`和`right`键的并集，类似于 SQL 的 full outer join；键以字典形式排序
    * inner: 使用`left`和`right`键的交集，类似于 SQL 的 inner join；保留`left`键的顺序

- on: label or list，要连接的 index level 或 columns 的名称，该名称必须在两个 DataFrame 里都能找到，默认为连接两个对象中所有列名相同的列；两对象中没有列名相同的列时，left_on 和 right_on 均为指明时会报错；，；如果`on`为`None`且不是在 index 上合并，则将默认返回两个 DataFrame 的列的交集

- left_on: 可以是label, list, array-like, 要连接到`left`的index或columns的名称

- right_on: 可以是label, list, array-like, 要连接到`right`的index或columns的名称

left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
- sort: 是否在返回的 DataFrame 中按字典顺序对键排序，若为False，连接键的顺序取决于连接类型，即由`how`决定

suffixes : tuple of (str, str), default ('_x', '_y')
    Suffix to apply to overlapping column names in the left and right
    side, respectively. To raise an exception on overlapping columns use
    (False, False).
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to output DataFrame called "_merge" with
    information on the source of each row.
    If string, column with information on source of each row will be added to
    output DataFrame, and column will be named value of string.
    Information column is Categorical-type and takes on a value of "left_only"
    for observations whose merge key only appears in 'left' DataFrame,
    "right_only" for observations whose merge key only appears in 'right'
    DataFrame, and "both" if the observation's merge key is found in both.

validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

    .. versionadded:: 0.21.0



**See Also**

- merge_ordered : Merge with optional filling/interpolation.

- merge_asof : Merge on nearest keys.

- DataFrame.join : Similar method using indices.


**Type**

function

### Examples

"key"列相同时

In [None]:
x = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                  'l_value': [1, 2, 3, 5],
                  'A': ['A0', 'A1', 'A2', 'A3']})
y = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                  'r_value': [5, 6, 7, 8], 
                  'C': ['C0', 'C1', 'C2', 'C3']})
z = pd.merge(left=x, right=y, how="left")
print("how=left:", z, sep="\n", end="\n\n")
z = pd.merge(left=x, right=y, how="right")
print("how=right:", z, sep="\n", end="\n\n")
z = pd.merge(left=x, right=y, how="outer")
print("how=outer:", z, sep="\n", end="\n\n")
z = pd.merge(left=x, right=y, how="inner")
print("how=inner:", z, sep="\n", end="\n\n")

"value"列相同时

In [None]:
x = pd.DataFrame({'l_key': ['foo', 'bar', 'baz', 'foo'],
                  'value': [1, 2, 3, 5],
                  'A': ['A0', 'A1', 'A2', 'A3']})
y = pd.DataFrame({'r_key': ['foo', 'bar', 'baz', 'foo'],
                  'value': [5, 6, 7, 8], 
                  'C': ['C0', 'C1', 'C2', 'C3']})
z = pd.merge(left=x, right=y, how="left")
print("how=left:", z, sep="\n", end="\n\n")
z = pd.merge(left=x, right=y, how="right")
print("how=right:", z, sep="\n", end="\n\n")
z = pd.merge(left=x, right=y, how="outer")
print("how=outer:", z, sep="\n", end="\n\n")
z = pd.merge(left=x, right=y, how="inner")
print("how=inner:", z, sep="\n", end="\n\n")

关于“on”

In [121]:
x = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                  'value': [1, 2, 3, 5],
                  'A': ['A0', 'A1', 'A2', 'A3']})
y = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                  'value': [5, 6, 7, 8], 
                  'C': ['C0', 'C1', 'C2', 'C3']})
z = pd.merge(left=x, right=y, how="left", on="key")
print(z, sep="\n", end="\n\n")
z = pd.merge(left=x, right=y, how="left", on="value")
print(z, sep="\n", end="\n\n")
z = pd.merge(left=x, right=y, how="left", on=["key", "value"])  # default
print(z, sep="\n", end="\n\n")

   key  value_x   A  value_y   C
0  foo        1  A0        5  C0
1  foo        1  A0        8  C3
2  bar        2  A1        6  C1
3  baz        3  A2        7  C2
4  foo        5  A3        5  C0
5  foo        5  A3        8  C3

  key_x  value   A key_y    C
0   foo      1  A0   NaN  NaN
1   bar      2  A1   NaN  NaN
2   baz      3  A2   NaN  NaN
3   foo      5  A3   foo   C0

   key  value   A    C
0  foo      1  A0  NaN
1  bar      2  A1  NaN
2  baz      3  A2  NaN
3  foo      5  A3   C0



left_on 和 right_on

In [120]:
x = pd.DataFrame({'l_key': ['foo', 'bar', 'baz', 'foo'],
                  'l_value': [1, 2, 3, 5],
                  'A': ['A0', 'A1', 'A2', 'A3']})
y = pd.DataFrame({'r_key': ['foo', 'bar', 'baz', 'foo'],
                  'r_value': [5, 6, 7, 8], 
                  'C': ['C0', 'C1', 'C2', 'C3']})
z = pd.merge(left=x, right=y, left_on="l_key", right_on="r_key")
print(z)

  l_key  l_value   A r_key  r_value   C
0   foo        1  A0   foo        5  C0
1   foo        1  A0   foo        8  C3
2   foo        5  A3   foo        5  C0
3   foo        5  A3   foo        8  C3
4   bar        2  A1   bar        6  C1
5   baz        3  A2   baz        7  C2


In [None]:
# Merge DataFrames df1 and df2 with specified left and right suffixes appended to any overlapping columns.
x.merge(y, left_on='lkey', right_on='rkey', suffixes=('_left', '_right'))

# Merge DataFrames df1 and df2, but raise an exception if the DataFrames have any overlapping columns.
x.merge(y, left_on='lkey', right_on='rkey', suffixes=(False, False))