# Rule-Based Classification of the QA Dataset
Load the libraries and dataframes, then examine the proportion of examples with answerable=true.

In [None]:
import polars as pl
from data.const import ARB_CACHE, KOR_CACHE, TELU_CACHE
from matplotlib import pyplot as plt

from bert_utils import (
    display_results
)

In [None]:
# Read the three per-language caches (Arabic, Korean, Telugu) in a fixed order.
df_ar, df_ko, df_te = (
    pl.read_parquet(cache_path)
    for cache_path in (ARB_CACHE, KOR_CACHE, TELU_CACHE)
)

# Stack the frames vertically into one combined frame.
df_arkote = pl.concat([df_ar, df_ko, df_te])

# Sanity check: concatenation must neither drop nor duplicate rows.
assert df_arkote.height == sum(frame.height for frame in (df_ar, df_ko, df_te))

# Share of rows flagged answerable, measured against the total row count.
answerable_share = df_arkote["answerable"].sum() / df_arkote.height
print(f"Answerable proportion: {answerable_share:.2f}")

### Expression-based approach to rule-based classification

In [None]:
def just_positive_rule() -> pl.Expr:
    """Return a constant-true expression.

    Serves as a baseline that predicts the positive class for every row.
    """
    always_positive = pl.lit(True)
    return always_positive

def when_rule() -> pl.Expr:
    """Return an expression that is true when both conditions hold.

    1. The lower-cased ``translation`` column matches the pattern ``when``.
    2. The ``context`` column contains at least one digit.
    """
    mentions_when = pl.col("translation").str.to_lowercase().str.contains("when")
    context_has_digit = pl.col("context").str.contains(r"\d")
    return mentions_when & context_has_digit

def although_rule() -> pl.Expr:
    """Return an expression that is true when ``context`` lacks "although".

    The match is performed on the lower-cased ``context`` column and the
    result is negated.
    """
    has_although = pl.col("context").str.to_lowercase().str.contains("although")
    return ~has_although

def does_rule() -> pl.Expr:
    """Return an expression that is true when ``translation`` lacks "does".

    The match is performed on the lower-cased ``translation`` column and the
    result is negated.
    """
    has_does = pl.col("translation").str.to_lowercase().str.contains("does")
    return ~has_does

def first_letter_rule() -> pl.Expr:
    """Return an expression testing the first character of ``translation``.

    True when the lower-cased first character is either "w" or "h".
    """
    leading_char = pl.col("translation").str.slice(0, 1).str.to_lowercase()
    return leading_char.is_in(["w", "h"])

def rule_based_classification() -> pl.Expr:
    """Combine the individual rules into a single prediction expression.

    The result is true when both negative-keyword rules pass (no "although"
    and no "does"), or when either the "when" rule or the first-letter rule
    fires.
    """
    no_blocking_keywords = although_rule() & does_rule()
    question_cue = when_rule() | first_letter_rule()
    return no_blocking_keywords | question_cue

### Performance of rule-based classification

In [None]:
# Evaluate every rule on the combined dataset, one at a time. For each rule:
# print a header, attach the rule's output as the "answerable_pred" column,
# then hand the predictions and the "answerable" column to display_results.

# Baseline: always predict positive.
print("JUST POSITIVE RULE PERFORMANCE")
df_just_positive_rule = df_arkote.with_columns(answerable_pred=just_positive_rule())
display_results(
    df_just_positive_rule.get_column("answerable_pred"),
    df_just_positive_rule.get_column("answerable"),
    ["N", "Y"],
    "Just Positive Rule Confusion Matrix",
)

# Context must not contain "although".
print("ALTHOUGH RULE PERFORMANCE")
df_although_rule = df_arkote.with_columns(answerable_pred=although_rule())
display_results(
    df_although_rule.get_column("answerable_pred"),
    df_although_rule.get_column("answerable"),
    ["N", "Y"],
    "Although Rule Confusion Matrix",
)

# Translation must not contain "does".
print("DOES RULE PERFORMANCE")
df_does_rule = df_arkote.with_columns(answerable_pred=does_rule())
display_results(
    df_does_rule.get_column("answerable_pred"),
    df_does_rule.get_column("answerable"),
    ["N", "Y"],
    "Does Rule Confusion Matrix",
)

# Translation starts with "w" or "h".
print("FIRST LETTER RULE PERFORMANCE")
df_first_letter_rule = df_arkote.with_columns(answerable_pred=first_letter_rule())
display_results(
    df_first_letter_rule.get_column("answerable_pred"),
    df_first_letter_rule.get_column("answerable"),
    ["N", "Y"],
    "First Letter Rule Confusion Matrix",
)

# Translation contains "when" and context contains a digit.
print("WHEN RULE PERFORMANCE")
df_when_rule = df_arkote.with_columns(answerable_pred=when_rule())
display_results(
    df_when_rule.get_column("answerable_pred"),
    df_when_rule.get_column("answerable"),
    ["N", "Y"],
    "When Rule Confusion Matrix",
)

# All rules combined via rule_based_classification().
print("COMBINED RULES PERFORMANCE")
df_combined_rules = df_arkote.with_columns(answerable_pred=rule_based_classification())
display_results(
    df_combined_rules.get_column("answerable_pred"),
    df_combined_rules.get_column("answerable"),
    ["N", "Y"],
    "Combined Rules Confusion Matrix",
)



In [None]:
# Apply the combined rule to the full dataset and then to each language
# subset, reporting a confusion matrix for each.

# NOTE: this applies the same expression to df_arkote as the
# "COMBINED RULES PERFORMANCE" evaluation; only the labels differ.
print("Combined dataset performance")
df_combined_dataset = df_arkote.with_columns(answerable_pred=rule_based_classification())
display_results(
    df_combined_dataset.get_column("answerable_pred"),
    df_combined_dataset.get_column("answerable"),
    ["N", "Y"],
    "Combined Dataset Confusion Matrix",
)

# Arabic subset.
print("Arabic dataset performance")
df_arabic = df_ar.with_columns(answerable_pred=rule_based_classification())
display_results(
    df_arabic.get_column("answerable_pred"),
    df_arabic.get_column("answerable"),
    ["N", "Y"],
    "Arabic Dataset Confusion Matrix",
)

# Korean subset.
print("Korean dataset performance")
df_korean = df_ko.with_columns(answerable_pred=rule_based_classification())
display_results(
    df_korean.get_column("answerable_pred"),
    df_korean.get_column("answerable"),
    ["N", "Y"],
    "Korean Dataset Confusion Matrix",
)

# Telugu subset.
print("Telugu dataset performance")
df_telugu = df_te.with_columns(answerable_pred=rule_based_classification())
display_results(
    df_telugu.get_column("answerable_pred"),
    df_telugu.get_column("answerable"),
    ["N", "Y"],
    "Telugu Dataset Confusion Matrix",
)