# daskのquery関数わかりにくすぎワロタ

以下の3つのケースについて、pandasとdaskの`query`関数に値や変数を渡したときの挙動を比較する。

* カラムが数値型の場合
* カラムが文字列型で、値が文字列("ABC"など)の場合
* カラムが文字列型で、値が数字だけの場合("123"と"012"とで失敗の仕方が異なる)

In [1]:
import pandas as pd
import dask.dataframe as dd
import dask

# Render DataFrames as plain text instead of HTML tables in the notebook output.
# Purely cosmetic: the rest of the notebook runs the same without this line.
pd.set_option("display.notebook_repr_html", False)

In [2]:
# Record the library versions this notebook was executed with (pandas first, then dask).
for library in (pd, dask):
    print(library.__version__)

1.1.2
2023.1.0


In [3]:
# Taken from https://linus-mk.hatenablog.com/entry/pandas-unique-integer-id
# (presumably the origin of the sample DataFrame built in the next cell -- confirm).

In [4]:
# Sample data, written one row per record.
# `number` is a true integer column, while `id_code` holds digit-only *strings*
# (including the leading-zero value '012').
df = pd.DataFrame(
    [
        ('Alice', 'aaa', 3, '012'),
        ('Bob', 'bbb', 2, '123'),
        ('Charlie', 'ccc', 4, '234'),
        ('Charlie', 'ddd', 3, '123'),
        ('Alice', 'eee', 2, '012'),
        ('Bob', 'fff', 1, '345'),
    ],
    columns=['name', 'item', 'number', 'id_code'],
)

In [5]:
# Display the whole sample frame (only 6 rows).
df

      name item  number id_code
0    Alice  aaa       3     012
1      Bob  bbb       2     123
2  Charlie  ccc       4     234
3  Charlie  ddd       3     123
4    Alice  eee       2     012
5      Bob  fff       1     345

In [6]:
# Check the column dtypes: `number` is an integer column, while `id_code`
# is an object (string) column even though its values consist only of digits.
df.dtypes

name       object
item       object
number      int64
id_code    object
dtype: object

In [7]:
# Intentionally failing example: in this dask version, from_pandas called with
# neither npartitions nor chunksize raises the ValueError shown in the output.
ddf = dd.from_pandas(df)

ValueError: Exactly one of npartitions and chunksize must be specified.

In [8]:
# Specifying npartitions=1 satisfies from_pandas; the data is kept in one partition.
ddf = dd.from_pandas(df, npartitions=1)
# Printing a dask DataFrame shows only its structure (columns and dtypes), not the rows.
print(ddf)

Dask DataFrame Structure:
                 name    item number id_code
npartitions=1                               
0              object  object  int64  object
5                 ...     ...    ...     ...
Dask Name: from_pandas, 1 graph layer


In [9]:
# compute() evaluates the dask DataFrame and returns the actual rows.
ddf.compute()

      name item  number id_code
0    Alice  aaa       3     012
1      Bob  bbb       2     123
2  Charlie  ccc       4     234
3  Charlie  ddd       3     123
4    Alice  eee       2     012
5      Bob  fff       1     345

## 数値型のカラムの場合

In [10]:
# pandas: write the numeric value directly in the query expression.
df.query("number==2")

    name item  number id_code
1    Bob  bbb       2     123
4  Alice  eee       2     012

In [11]:
# dask: write the numeric value directly in the query expression.
ddf.query("number==2").compute()

    name item  number id_code
1    Bob  bbb       2     123
4  Alice  eee       2     012

In [12]:
# pandas: reference a Python variable with the `@` prefix.
# `query` resolves `@num` itself, so the expression is a plain string; no f-string
# interpolation is involved (the previous `f` prefix had no placeholders).
num = 2
df.query("number==@num")

    name item  number id_code
1    Bob  bbb       2     123
4  Alice  eee       2     012

In [13]:
# dask: use a variable through f-string interpolation -- works for a numeric value,
# because the expression becomes number==2.
num = 2
ddf.query(f"number=={num}").compute()

    name item  number id_code
1    Bob  bbb       2     123
4  Alice  eee       2     012

In [14]:
# pandas: use a variable -- the f-string approach also works, not only `@`.
num = 2
df.query(f"number=={num}")

    name item  number id_code
1    Bob  bbb       2     123
4  Alice  eee       2     012

## 文字列型のカラムの場合

In [15]:
# pandas: write the string value directly; it must be quoted inside the expression.
df.query("name=='Bob'")

  name item  number id_code
1  Bob  bbb       2     123
5  Bob  fff       1     345

In [16]:
# dask: write the string value directly; it must be quoted inside the expression.
ddf.query("name=='Bob'").compute()

  name item  number id_code
1  Bob  bbb       2     123
5  Bob  fff       1     345

In [17]:
# pandas: reference a Python variable with the `@` prefix.
# `query` resolves `@target` itself, so the expression is a plain string; no f-string
# interpolation is involved (the previous `f` prefix had no placeholders).
target = 'Bob'
df.query("name==@target")

  name item  number id_code
1  Bob  bbb       2     123
5  Bob  fff       1     345

In [18]:
# dask: use a variable through an f-string -- failing example.
# The expression becomes name==Bob (no quotes), so Bob is treated as a variable
# name and the query fails with "name 'Bob' is not defined".
target = 'Bob'
ddf.query(f"name=={target}").compute()

ValueError: Metadata inference failed in `query`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
UndefinedVariableError("name 'Bob' is not defined")

Traceback:
---------
  File "/usr/local/lib/python3.8/site-packages/dask/dataframe/utils.py", line 195, in raise_on_meta_error
    yield
  File "/usr/local/lib/python3.8/site-packages/dask/dataframe/core.py", line 6571, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/usr/local/lib/python3.8/site-packages/dask/utils.py", line 1103, in __call__
    return getattr(__obj, self.method)(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/frame.py", line 3340, in query
    res = self.eval(expr, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/frame.py", line 3470, in eval
    return _eval(expr, inplace=inplace, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/eval.py", line 341, in eval
    parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 787, in __init__
    self.terms = self.parse()
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 806, in parse
    return self._visitor.visit(self.expr)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 398, in visit
    return visitor(node, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 404, in visit_Module
    return self.visit(expr, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 398, in visit
    return visitor(node, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 407, in visit_Expr
    return self.visit(node.value, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 398, in visit
    return visitor(node, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 699, in visit_Compare
    return self.visit(binop)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 398, in visit
    return visitor(node, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 520, in visit_BinOp
    op, op_class, left, right = self._maybe_transform_eq_ne(node)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 441, in _maybe_transform_eq_ne
    right = self.visit(node.right, side="right")
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 398, in visit
    return visitor(node, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 533, in visit_Name
    return self.term_type(node.id, self.env, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/ops.py", line 84, in __init__
    self._value = self._resolve_name()
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/ops.py", line 101, in _resolve_name
    res = self.env.resolve(self.local_name, is_local=self.is_local)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/scope.py", line 204, in resolve
    raise UndefinedVariableError(key, is_local) from err


In [19]:
# pandas: use a variable through an f-string -- failing example.
# Same cause as with dask: the expression becomes name==Bob (no quotes), so Bob
# is treated as an undefined variable name.
target = 'Bob'
df.query(f"name=={target}")

UndefinedVariableError: name 'Bob' is not defined

In [20]:
# Print the generated expression on its own to see the cause: Bob has no quotes around it.
print(f"name=={target}")

name==Bob


In [21]:
# dask: use a variable through an f-string -- working example.
# `!r` inserts repr(target), i.e. the quoted literal 'Bob', so the expression becomes
# name=='Bob'. Unlike hand-written quotes ('{target}'), this stays a valid
# expression even when the value itself contains a quote character.
target = 'Bob'
ddf.query(f"name=={target!r}").compute()

  name item  number id_code
1  Bob  bbb       2     123
5  Bob  fff       1     345

In [22]:
# pandas: use a variable -- the f-string approach also works, not only `@`.
# `!r` inserts repr(target), i.e. the quoted literal 'Bob', so the expression becomes
# name=='Bob'. Unlike hand-written quotes ('{target}'), this stays a valid
# expression even when the value itself contains a quote character.
target = 'Bob'
df.query(f"name=={target!r}")

  name item  number id_code
1  Bob  bbb       2     123
5  Bob  fff       1     345

## 数字が入っている文字列型の場合

In [23]:
# pandas: write the value directly; id_code is a string column, so the digits are quoted.
df.query("id_code=='123'")

      name item  number id_code
1      Bob  bbb       2     123
3  Charlie  ddd       3     123

In [24]:
# dask: write the value directly; id_code is a string column, so the digits are quoted.
ddf.query("id_code=='123'").compute()

      name item  number id_code
1      Bob  bbb       2     123
3  Charlie  ddd       3     123

In [25]:
# pandas: reference a Python variable with the `@` prefix.
# `query` resolves `@code` itself, so the expression is a plain string; no f-string
# interpolation is involved (the previous `f` prefix had no placeholders).
code = '123'
df.query("id_code==@code")

      name item  number id_code
1      Bob  bbb       2     123
3  Charlie  ddd       3     123

In [26]:
# dask: use a variable through an f-string -- failing example.
# The expression becomes id_code==123 (no quotes): 123 is read as an integer
# literal rather than the string '123', so no error is raised but no row matches.
code = '123'
ddf.query(f"id_code=={code}").compute()

Empty DataFrame
Columns: [name, item, number, id_code]
Index: []

query関数の結果は空のDataFrameになる。  
f文字列の展開後は`id_code==123`となり、文字列型のカラムを整数の123と比較してしまうため、一致する行が1つもない。  
エラーが出るほうが、間違っている箇所がハッキリ分かる分だけまだ修正しやすいかもしれない……  
これも、query関数に渡している文字列を単独で表示させるとよく分かる。

In [27]:
# Print the generated expression on its own: 123 appears without quotes.
print(f"id_code=={code}")

id_code==123


In [28]:
# Python itself (not pandas) does not allow a decimal integer literal with a
# leading zero, so this assignment is a SyntaxError.
x = 012

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (<ipython-input-28-d581e4a9bb8c>, line 2)

In [29]:
# dask: use a variable through an f-string -- failing example #2.
# The expression becomes id_code==012 (no quotes); 012 is not a valid integer
# literal, so parsing the expression fails with a SyntaxError.
code = '012'
ddf.query(f"id_code=={code}").compute()

ValueError: Metadata inference failed in `query`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
SyntaxError('invalid syntax', ('<unknown>', 1, 13, 'id_code ==0 12 \n'))

Traceback:
---------
  File "/usr/local/lib/python3.8/site-packages/dask/dataframe/utils.py", line 195, in raise_on_meta_error
    yield
  File "/usr/local/lib/python3.8/site-packages/dask/dataframe/core.py", line 6571, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/usr/local/lib/python3.8/site-packages/dask/utils.py", line 1103, in __call__
    return getattr(__obj, self.method)(*args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/frame.py", line 3340, in query
    res = self.eval(expr, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/frame.py", line 3470, in eval
    return _eval(expr, inplace=inplace, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/eval.py", line 341, in eval
    parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 787, in __init__
    self.terms = self.parse()
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 806, in parse
    return self._visitor.visit(self.expr)
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 394, in visit
    raise e
  File "/usr/local/lib/python3.8/site-packages/pandas/core/computation/expr.py", line 390, in visit
    node = ast.fix_missing_locations(ast.parse(clean))
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ast.py", line 47, in parse
    return compile(source, filename, mode, flags,


In [30]:
# dask: use a variable through an f-string -- working example.
# `!r` inserts repr(code), i.e. the quoted literal '123', so the expression becomes
# id_code=='123'. Unlike hand-written quotes ('{code}'), this stays a valid
# expression even when the value itself contains a quote character.
code = '123'
ddf.query(f"id_code=={code!r}").compute()
# The '012' case works the same way, so it is omitted.

      name item  number id_code
1      Bob  bbb       2     123
3  Charlie  ddd       3     123