# Rule-Based Classification of the QA Dataset
Load the libraries and the dataframes, then examine the proportion of answerable (answerable=true) examples.

In [None]:
import polars as pl
import re
from typing import Any
from data.const import ARB_CACHE, KOR_CACHE, TELU_CACHE
from matplotlib import pyplot as plt
import numpy as np

In [None]:
# Read the three cached parquet files and stack them into one frame.
language_frames = [
    pl.read_parquet(cache_path)
    for cache_path in (ARB_CACHE, KOR_CACHE, TELU_CACHE)
]
df_ar, df_ko, df_te = language_frames
df_arkote = pl.concat(language_frames)

# Sanity check: concatenation must keep every row of every input frame.
assert sum(frame.height for frame in language_frames) == df_arkote.height

# Share of rows whose "answerable" flag is set in the stacked frame.
answerable_share = df_arkote["answerable"].sum() / df_arkote.height
print(f"Answerable proportion: {answerable_share:.2f}")

### Expression-based approach to rule-based classification

In [None]:
def just_positive_rule() -> pl.Expr:
    """Return a baseline rule that predicts "answerable" for every row."""
    always_answerable = pl.lit(True)
    return always_answerable


def when_rule() -> pl.Expr:
    """Return a rule that fires for "when" questions whose context has a digit.

    The expression is true when the lower-cased "translation" column contains
    "when" anywhere in the text and the "context" column contains at least one
    digit.
    """
    # NOTE(review): "translation" is presumably the English rendering of the
    # question - confirm against the dataset schema.
    question_mentions_when = (
        pl.col("translation").str.to_lowercase().str.contains("when")
    )
    context_has_digit = pl.col("context").str.contains(r"\d")
    return question_mentions_when & context_has_digit

def although_rule() -> pl.Expr:
    """Return a rule that is true when "context" does not mention "although".

    The match is case-insensitive because the context is lower-cased first.
    """
    context_mentions_although = (
        pl.col("context").str.to_lowercase().str.contains("although")
    )
    return ~context_mentions_although

def does_rule() -> pl.Expr:
    """Return a rule that is true when "translation" does not mention "does".

    The match is case-insensitive because the text is lower-cased first.
    """
    question_mentions_does = (
        pl.col("translation").str.to_lowercase().str.contains("does")
    )
    return ~question_mentions_does

def first_letter_rule() -> pl.Expr:
    """Return a rule that is true when "translation" starts with "w" or "h".

    Only the first character is inspected, and it is lower-cased before the
    comparison, so "W"/"H" match as well.
    """
    leading_char = pl.col("translation").str.slice(0, 1).str.to_lowercase()
    return leading_char.is_in(["w", "h"])

def rule_based_classification() -> pl.Expr:
    """Return the combined rule used as the final "answerable" prediction.

    A row is predicted answerable when both negative rules hold (no "although"
    in the context and no "does" in the translation), or when either the
    "when" rule or the first-letter rule fires.
    """
    passes_negative_rules = although_rule() & does_rule()
    has_question_cue = when_rule() | first_letter_rule()
    return passes_negative_rules | has_question_cue

In [None]:
from sklearn.metrics import confusion_matrix

def show_performance(df_: pl.DataFrame, title: str = "Normalized Confusion Matrix (Row-wise)") -> None:
    """Print classification metrics and plot a row-normalized confusion matrix.

    Args:
        df_: Frame holding the boolean columns "answerable" (ground truth) and
            "answerable_pred" (prediction).
        title: Title drawn above the confusion-matrix plot.

    Prints the true and predicted answerable proportions, the raw
    TP/FP/TN/FN counts, and accuracy, precision, recall and F1. Then shows a
    2x2 confusion matrix normalized over the true labels. An empty frame is
    reported and skipped instead of failing with a division by zero.
    """
    total_rows = df_.height
    if total_rows == 0:
        # Every proportion below divides by the row count.
        print("No rows to evaluate.")
        return

    # Proportion of answerable rows and of answerable predictions
    print(f"Answerable proportion: {df_['answerable'].sum() / total_rows:.2f}")
    print(f"Predicted answerable proportion: {df_['answerable_pred'].sum() / total_rows:.2f}")

    # Confusion matrix counts
    tp = df_.filter(pl.col("answerable") & pl.col("answerable_pred")).height
    fp = df_.filter(~pl.col("answerable") & pl.col("answerable_pred")).height
    tn = df_.filter(~pl.col("answerable") & ~pl.col("answerable_pred")).height
    fn = df_.filter(pl.col("answerable") & ~pl.col("answerable_pred")).height

    print(f"True Positives: {tp} , False Positives: {fp}")
    print(f"True Negatives: {tn} , False Negatives: {fn}")

    # Accuracy, Precision, Recall, F1 Score (0 when a denominator is empty)
    accuracy = (tp + tn) / total_rows
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1_score:.2f}")

    # Normalized confusion matrix (each row sums to 1 over the true label).
    # The labels are pinned so the matrix is always 2x2 in the order
    # [not answerable, answerable]; without them a frame in which only one
    # class occurs yields a 1x1 matrix that no longer matches the N/A ticks.
    y_true = df_["answerable"].to_list()
    y_pred = df_["answerable_pred"].to_list()
    cm = confusion_matrix(y_true, y_pred, labels=[False, True], normalize='true')

    plt.imshow(cm, cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['N', 'A'])
    plt.yticks(tick_marks, ['N', 'A'])
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    # Add text annotations for each cell in the confusion matrix
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, f"{cm[i, j]:.2f}", ha="center", va="center", color="black")
    plt.show()


### Performance of rule-based classification

In [None]:
# Evaluate each rule on its own against the stacked frame, then the
# combined classifier. Every entry is (printed banner, rule factory, title).
rule_reports = [
    ("JUST POSITIVE RULE PERFORMANCE", just_positive_rule, "Just Positive Rule"),
    ("ALTHOUGH RULE PERFORMANCE", although_rule, "Although Rule"),
    ("DOES RULE PERFORMANCE", does_rule, "Does Rule"),
    ("FIRST LETTER RULE PERFORMANCE", first_letter_rule, "First Letter Rule"),
    ("WHEN RULE PERFORMANCE", when_rule, "When Rule"),
    ("COMBINED RULES PERFORMANCE", rule_based_classification, "Combined Rules"),
]
for rule_banner, rule_factory, rule_title in rule_reports:
    print(rule_banner)
    scored_frame = df_arkote.with_columns(rule_factory().alias("answerable_pred"))
    show_performance(scored_frame, title=rule_title)

In [None]:
# Score the combined classifier on the stacked frame and on each language
# frame separately. Every entry is (printed heading, frame, plot title).
dataset_reports = [
    ("Combined dataset performance", df_arkote, "Combined Dataset"),
    ("Arabic dataset performance", df_ar, "Arabic Dataset"),
    ("Korean dataset performance", df_ko, "Korean Dataset"),
    ("Telugu dataset performance", df_te, "Telugu Dataset"),
]
for report_heading, dataset_frame, report_title in dataset_reports:
    print(report_heading)
    predicted_frame = dataset_frame.with_columns(
        rule_based_classification().alias("answerable_pred")
    )
    show_performance(predicted_frame, title=report_title)