# Rule-Based Classification of a QA Dataset
Load the libraries and the dataframes, then examine the proportion of examples with answerable=true.

In [None]:
import polars as pl
from data.const import ARB_CACHE, KOR_CACHE, TELU_CACHE
from matplotlib import pyplot as plt

In [None]:
# Read the cached Arabic, Korean and Telugu frames and stack them into one.
df_ar = pl.read_parquet(ARB_CACHE)
df_ko = pl.read_parquet(KOR_CACHE)
df_te = pl.read_parquet(TELU_CACHE)
language_frames = [df_ar, df_ko, df_te]
df_arkote = pl.concat(language_frames)

# Sanity check: concatenation must keep every row of every source frame.
expected_rows = sum(frame.height for frame in language_frames)
assert expected_rows == df_arkote.height

# Share of rows whose "answerable" flag is true in the combined frame.
answerable_share = df_arkote["answerable"].sum() / df_arkote.height
print(f"Answerable proportion: {answerable_share:.2f}")

### Expression-based approach to rule-based classification

In [None]:
def just_positive_rule() -> pl.Expr:
    """Baseline rule that predicts "answerable" for every row.

    Returns:
        A literal ``True`` expression.
    """
    always_answerable = pl.lit(True)
    return always_answerable


def when_rule() -> pl.Expr:
    """Rule that fires for "when" questions whose context contains a digit.

    Returns:
        An expression that is true where the lower-cased "translation"
        column contains "when" and the "context" column matches ``\\d``.
    """
    asks_when = pl.col("translation").str.to_lowercase().str.contains("when")
    context_has_digit = pl.col("context").str.contains(r"\d")
    return asks_when & context_has_digit

def although_rule() -> pl.Expr:
    """Rule that fires when the context does not mention "although".

    Returns:
        An expression that is true where the lower-cased "context" column
        does NOT contain "although".
    """
    lowered_context = pl.col("context").str.to_lowercase()
    mentions_although = lowered_context.str.contains("although")
    return ~mentions_although

def does_rule() -> pl.Expr:
    """Rule that fires when the question does not contain "does".

    Returns:
        An expression that is true where the lower-cased "translation"
        column does NOT contain "does".
    """
    lowered_question = pl.col("translation").str.to_lowercase()
    mentions_does = lowered_question.str.contains("does")
    return ~mentions_does

def first_letter_rule() -> pl.Expr:
    """Rule that fires when the question starts with "w" or "h".

    The comparison is case-insensitive: the first character of the
    "translation" column is lower-cased before the membership test.

    Returns:
        An expression that is true where that first character is "w" or "h".
    """
    leading_char = pl.col("translation").str.slice(0, 1).str.to_lowercase()
    return leading_char.is_in(["w", "h"])

def rule_based_classification() -> pl.Expr:
    """Combine the individual rules into the final "answerable" prediction.

    A row is predicted answerable when either
      * both the "although" and the "does" rule hold, or
      * the "when" rule or the first-letter rule holds.

    Returns:
        The combined boolean expression.
    """
    no_negative_cues = although_rule() & does_rule()
    question_cues = when_rule() | first_letter_rule()
    return no_negative_cues | question_cues

In [None]:
def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0


def show_performance(df_: pl.DataFrame) -> None:
    """Print classification metrics for ``df_`` and plot its confusion matrix.

    Args:
        df_: Frame holding the ground truth in an "answerable" column and the
            prediction in an "answerable_pred" column (both used as boolean
            masks below).

    Every ratio goes through ``_safe_ratio``, so an empty frame reports 0.00
    instead of raising ``ZeroDivisionError``; this matches the zero fallback
    that precision, recall and F1 use when their denominators vanish.
    """
    total = df_.height
    actual = pl.col("answerable")
    predicted = pl.col("answerable_pred")

    # Proportion of answerable rows, in the ground truth and in the predictions
    print(f"Answerable proportion: {_safe_ratio(df_['answerable'].sum(), total):.2f}")
    print(f"Predicted answerable proportion: {_safe_ratio(df_['answerable_pred'].sum(), total):.2f}")

    # Evaluate the rule-based classification
    true_positives = df_.filter(actual & predicted).height
    false_positives = df_.filter(~actual & predicted).height
    true_negatives = df_.filter(~actual & ~predicted).height
    false_negatives = df_.filter(actual & ~predicted).height
    print(f"True Positives: {true_positives} , False Positives: {false_positives}")
    print(f"True Negatives: {true_negatives} , False Negatives: {false_negatives}")

    # Accuracy, Precision, Recall, F1 Score
    accuracy = _safe_ratio(true_positives + true_negatives, total)
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    recall = _safe_ratio(true_positives, true_positives + false_negatives)
    f1_score = _safe_ratio(2 * (precision * recall), precision + recall)
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1_score:.2f}")

    # Plot confusion matrix: rows are the true label, columns the predicted
    # label, both ordered positive first.
    confusion_matrix = [[true_positives, false_negatives],
                        [false_positives, true_negatives]]
    plt.imshow(confusion_matrix)
    plt.colorbar()
    plt.xticks([0, 1], ['P', 'N'])
    plt.yticks([0, 1], ['P', 'N'])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()

### Performance of rule-based classification

In [None]:
# Performance of each individual rule, followed by the combined rule set.
# Each entry pairs the heading to print with the function building the rule.
rule_reports = [
    ("JUST POSITIVE RULE PERFORMANCE", just_positive_rule),
    ("ALTHOUGH RULE PERFORMANCE", although_rule),
    ("DOES RULE PERFORMANCE", does_rule),
    ("FIRST LETTER RULE PERFORMANCE", first_letter_rule),
    ("WHEN RULE PERFORMANCE", when_rule),
    ("COMBINED RULES PERFORMANCE", rule_based_classification),
]
for heading, build_rule in rule_reports:
    print(heading)
    predictions = df_arkote.with_columns(build_rule().alias("answerable_pred"))
    show_performance(predictions)

In [None]:
# Performance of the combined rule set on the full data and on each language.
dataset_reports = [
    ("Combined dataset performance", df_arkote),
    ("Arabic dataset performance", df_ar),
    ("Korean dataset performance", df_ko),
    ("Telugu dataset performance", df_te),
]
for heading, frame in dataset_reports:
    print(heading)
    predictions = frame.with_columns(rule_based_classification().alias("answerable_pred"))
    show_performance(predictions)