# Polarsテクニック＆tips

In [2]:
import polars as pl
import polars.selectors as cs # dtypeによってcolumnを選択したいときに使う

## - 正規表現による不要な列の選定
次のようなDataframeがあり、  
全行の値が一つ以上の"0"または"00:00:00"となっているカラムを不要なカラムとして考える。  
この不要なカラムを正規表現（下記）を用いて選定する。  
```
df[col].str.contains(regex_pattern).all():
```

In [27]:
df = pl.DataFrame({
    "col1": [1, 2, 3, 4, 5, 6],
    "col2": ["000000", "000000", "000000", "000000", "000000", "000000"],
    "col3": ["000001", "000001", "000001", "000001", "000001", "000001"],
    "col4": ["0000000000", "0000000000", "0000000000", "0000000000", "0000000000", "0000000000"],
    "col5": ["00:00:00", "00:00:00", "00:00:00", "00:00:00", "00:00:00", "00:00:00"],
    "col6": ["000001", "000002", "000003", "000004", "000005", "000006"],
    "col7": ["00", "00", "00", "00", "00", "00"],
})

df

col1,col2,col3,col4,col5,col6,col7
i64,str,str,str,str,str,str
1,"""000000""","""000001""","""0000000000""","""00:00:00""","""000001""","""00"""
2,"""000000""","""000001""","""0000000000""","""00:00:00""","""000002""","""00"""
3,"""000000""","""000001""","""0000000000""","""00:00:00""","""000003""","""00"""
4,"""000000""","""000001""","""0000000000""","""00:00:00""","""000004""","""00"""
5,"""000000""","""000001""","""0000000000""","""00:00:00""","""000005""","""00"""
6,"""000000""","""000001""","""0000000000""","""00:00:00""","""000006""","""00"""


In [18]:
# 正規表現を定義。0または:が一つ以上末尾まで続く文字列にマッチするようにする。n回以上なら{n,}
pattern = "[0:]+$"

In [19]:
useless_cols = []

# 不要なカラムを抽出
for col in df.columns:
    # .all()はカラムの全値が条件にマッチするか判定し、Booleanを返す
    if df[col].str.contains(pattern).all():
        useless_cols.append(col)


In [21]:
# 補足：リスト内包表記で書くとすっきりかける
useless_cols = [col for col in df.columns if df[col].str.contains(pattern).all()]

In [20]:
useless_cols

['col2', 'col4', 'col5', 'col7']

想定通りのカラムが選定できた。

## - dtypeによるカラムの選択

In [3]:
df = pl.DataFrame({
    "col1": [1, 2, 3, 4, 5, 6],
    "col2": ["000000", "000000", "000000", "000000", "000000", "000000"],
    "col3": ["00", "00", "00", "00", "00", "00"],
    "col4": [pl.Null, pl.Null, pl.Null, pl.Null, pl.Null, pl.Null],
    "col5": [10, 20, 30, 40, 50, 60]
})

df

col1,col2,col3,col4,col5
i64,str,str,object,i64
1,"""000000""","""00""",Null,10
2,"""000000""","""00""",Null,20
3,"""000000""","""00""",Null,30
4,"""000000""","""00""",Null,40
5,"""000000""","""00""",Null,50
6,"""000000""","""00""",Null,60


In [4]:
# string型のカラムのみ抽出
str_cols = df.select(cs.string())
str_cols

col2,col3
str,str
"""000000""","""00"""
"""000000""","""00"""
"""000000""","""00"""
"""000000""","""00"""
"""000000""","""00"""
"""000000""","""00"""


## - Nullが含まれていると、value_countsやuniqueが出来ない

In [5]:
df.select(pl.col("col4"))

col4
object
Null
Null
Null
Null
Null
Null


In [9]:
df.select(pl.col("col").value_counts())

ColumnNotFoundError: col

In [10]:
df.select(pl.col("col").unique)

TypeError: cannot create expression literal for value of type method: <bound method Expr.unique of <Expr ['col("col")'] at 0x7F56B8FB4D90>>

Hint: Pass `allow_object=True` to accept any value and create a literal of type Object.

## - 全行が単一の値となっているカラムを抽出する関数

In [12]:
# サンプルデータの作成
df = pl.DataFrame({
    "id": [1, 2, 3, 4, 5],
    "code": ["0000000", "0000000", "0000000", "0000000", "0000000"],
    "all_nulls": [None, None, None, None, None],
    "mixed": ["0000000", "0000000", None, "0000000", "0000000"]
})

df

id,code,all_nulls,mixed
i64,str,null,str
1,"""0000000""",,"""0000000"""
2,"""0000000""",,"""0000000"""
3,"""0000000""",,
4,"""0000000""",,"""0000000"""
5,"""0000000""",,"""0000000"""


In [14]:
# カラムが全行単一の値となっているか判定する関数
def is_single_value_column(column: pl.Series) -> bool:
    # すべての値がNullの場合はTrueを返す
    if column.null_count() == len(column):
        return True
    # ユニークな値の数を確認
    unique_values = column.drop_nulls().unique().len()
    return unique_values == 1


# 判定を行う
for col in df.columns:
    result = is_single_value_column(df[col])
    print(f"Column '{col}' has single value across all rows: {result}")



Column 'id' has single value across all rows: False
Column 'code' has single value across all rows: True
Column 'all_nulls' has single value across all rows: True
Column 'mixed' has single value across all rows: True
