# Rule-Based Classification of a QA Dataset
Load the libraries and the dataframes, then examine the proportion of rows where `answerable` is true.

In [None]:
import polars as pl
import re
from typing import Any
from data.const import ARB_CACHE, KOR_CACHE, TELU_CACHE

In [None]:
# Read each cached parquet file into its own frame (presumably one per
# language, judging by the constant names — confirm against data.const).
df_ar = pl.read_parquet(ARB_CACHE)
df_ko = pl.read_parquet(KOR_CACHE)
df_te = pl.read_parquet(TELU_CACHE)

# Stack the three frames into one combined frame.
df_arkote = pl.concat([df_ar, df_ko, df_te])

# Sanity check: the combined frame must hold exactly as many rows as its parts.
assert df_arkote.height == sum(frame.height for frame in (df_ar, df_ko, df_te))

# Share of rows whose `answerable` flag is set, over all rows of the combined frame.
print(f"Answerable proportion: {df_arkote['answerable'].sum() / df_arkote.height:.2f}")

### Expression-based approach to rule-based classification

In [None]:
def when_rule() -> pl.Expr:
    """Build the "when + digit" rule as a polars expression.

    Returns:
        An expression that is true for rows whose lower-cased `translation`
        column contains "when" and whose `context` column matches the
        pattern ``\\d`` (i.e. contains a digit).
    """
    # Case-insensitive check on the question text: lower-case first, then search.
    question_mentions_when = pl.col("translation").str.to_lowercase().str.contains("when")
    # The context must contain at least one digit for the rule to fire.
    context_has_digit = pl.col("context").str.contains(r"\d")
    return question_mentions_when & context_has_digit

def rule_based_classification() -> pl.Expr:
    """Return the combined rule expression used to predict answerability.

    Only `when_rule()` is wired in at the moment. Further rules are meant to
    be OR-ed onto the result, e.g.
    ``when_rule() | foo_rule() | bar_rule() | foo_bar_rule()``.
    """
    combined_rules = when_rule()
    return combined_rules

In [None]:
# Evaluate the rule-based classification expression over the combined frame
# and store the result in a new `answerable_pred` column.
df_arkote = df_arkote.with_columns(answerable_pred=rule_based_classification())
# Last expression of the cell: show the first rows, including the new column.
df_arkote.head()

### Old approach

In [None]:
def rule_based_classification(question: str, context: str, rules_list: list[Any]) -> bool:
    """Return True if at least one rule flags the (question, context) pair.

    NOTE: this shares its name with the zero-argument, expression-based
    `rule_based_classification` defined earlier in the notebook; running this
    cell rebinds the name to this row-wise version.

    Args:
        question: Question text, passed as the first argument to every rule.
        context: Context passage, passed as the second argument to every rule.
        rules_list: Callables invoked as ``rule(question, context)``; any
            truthy result counts as a match.

    Returns:
        True if any rule returns a truthy value, otherwise False (including
        when ``rules_list`` is empty).
    """
    # `any` over a generator tests truthiness directly and stops at the first
    # match. The previous `bool(sum([...]))` built a throwaway list, always
    # ran every rule, and could cancel out to 0 for signed numeric results.
    return any(rule(question, context) for rule in rules_list)

def when_digit_rule(question: str, context: str) -> bool:
    """Check whether a "when" question comes with a context containing a digit.

    Args:
        question: Question text; matched case-insensitively via ``lower()``.
        context: Context passage searched for a digit.

    Returns:
        True if "when" occurs in the lower-cased question and the regex
        ``\\d`` matches somewhere in the context, otherwise False.
    """
    # Guard clause: without "when" in the question the context is never inspected.
    if "when" not in question.lower():
        return False
    return re.search(r"\d", context) is not None

In [None]:
# Rules consumed by the row-wise (non-expression) classifier.
rules = [when_digit_rule]

# Bundle the two text columns into a struct so the Python callback can index
# both fields by name, then store its boolean verdict as `answerable_pred`.
df_ar = df_ar.with_columns(
    answerable_pred=pl.struct("translation", "context").map_elements(
        lambda row: rule_based_classification(row["translation"], row["context"], rules),
        return_dtype=pl.Boolean,
    )
)
# Last expression of the cell: show the first rows, including the new column.
df_ar.head()