In [6]:
import pandas as pd
from sklearn.metrics import f1_score
from tqdm import tqdm
from nltk.metrics import segmentation


In [7]:
# Load the stored test-set predictions and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code; presumably this file
# is produced by this project's own pipeline - only load trusted pickles.
df = pd.read_pickle("../results/predictions_test.pkl")
df.head()

Unnamed: 0,book_path,chapter_idx,paragraph_idx,labels,logit_0,logit_1
0,riehl_ovidhofe.json,0,1,1,-1.731206,0.472115
1,riehl_ovidhofe.json,0,2,1,-2.199892,1.085217
2,riehl_ovidhofe.json,0,3,1,0.018988,-0.3609
3,riehl_ovidhofe.json,0,4,1,-1.766792,1.003996
4,riehl_ovidhofe.json,0,5,1,-1.097036,0.462459


In [8]:
def add_predictions(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a boolean ``prediction`` column.

    For every book (``book_path``) the number of reference chapter breaks
    (rows with ``labels == 0``) is counted, and exactly that many rows with
    the highest ``logit_0`` are predicted to be chapter breaks
    (``prediction = False``). All remaining rows get ``prediction = True``.

    Args:
        df: Frame with ``book_path``, ``labels`` and ``logit_0`` columns.
            The index may be arbitrary (non-default or non-unique).

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    df = df.copy()
    per_book = df.assign(_is_break=df["labels"].eq(0).astype(int)).groupby(
        "book_path", sort=False
    )
    # Number of reference chapter breaks in each row's book, broadcast per row.
    n_breaks = per_book["_is_break"].transform("sum")
    # Rank 1 = highest logit_0 within the book. Ties are broken by row order
    # and missing logits are ranked last so they are never preferred.
    rank = per_book["logit_0"].rank(
        method="first", ascending=False, na_option="bottom"
    )
    is_break = rank <= n_breaks
    # chapter break is class 0 -> prediction is False for the top-n rows.
    # Assign positionally so the result does not depend on index labels.
    df["prediction"] = (~is_break).to_numpy()
    return df

# Attach the per-book `prediction` column and preview the first rows.
pred_df = add_predictions(df)
pred_df.head()

100%|██████████| 1188/1188 [01:55<00:00, 10.31it/s]


Unnamed: 0,book_path,chapter_idx,paragraph_idx,labels,logit_0,logit_1,prediction
0,riehl_ovidhofe.json,0,1,1,-1.731206,0.472115,True
1,riehl_ovidhofe.json,0,2,1,-2.199892,1.085217,True
2,riehl_ovidhofe.json,0,3,1,0.018988,-0.3609,True
3,riehl_ovidhofe.json,0,4,1,-1.766792,1.003996,True
4,riehl_ovidhofe.json,0,5,1,-1.097036,0.462459,True


In [9]:
def calc_f1_score(df: pd.DataFrame) -> float:
    """Compute the F1 score of the chapter-break class for the rows in ``df``.

    Args:
        df: Frame with an integer ``labels`` column (0 = chapter break) and a
            boolean ``prediction`` column (False = predicted chapter break).

    Returns:
        The F1 score with "chapter break" as the positive class.
    """
    # class 0 (chapter break) is relevant for us, so it becomes the positive
    # class. Plain boolean expressions avoid the dtype-changing
    # ``Series.replace`` round-trip.
    is_break_true = df["labels"].eq(0)
    is_break_pred = ~df["prediction"].astype(bool)
    return f1_score(is_break_true, is_break_pred)


def calc_pk_wd(df: pd.DataFrame) -> tuple:
    """Compute the Pk and WindowDiff segmentation metrics for one book.

    Args:
        df: Rows of a single book, in reading order, with a ``labels`` column
            (0 = chapter break) and a boolean ``prediction`` column
            (False = predicted chapter break).

    Returns:
        A ``(pk, windowdiff)`` tuple.
    """
    # class 0 (no continuation) should denote chapter break, metrics assume 1
    labels = "".join("1" if label == 0 else "0" for label in df["labels"])
    predictions = "".join("0" if keep else "1" for keep in df["prediction"])
    # k should be half average reference segment length
    # average paragraph count per chapter is ~54
    # median is 32
    # Use the reference boundaries (not the predicted ones) to derive k, and
    # treat a book without any boundary as one segment so we never divide by 0.
    n_boundaries = max(labels.count("1"), 1)
    k = int(round(len(labels) / (n_boundaries * 2.0)))
    # round() can yield 0 (e.g. round(0.5)); a zero-width window compares
    # nothing, so keep the window at least one paragraph wide.
    k = max(k, 1)
    pk = segmentation.pk(labels, predictions, k=k)
    windowdiff = segmentation.windowdiff(labels, predictions, k=k)
    return pk, windowdiff

def calculate_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Compute F1, Pk and WindowDiff separately for every book.

    Args:
        df: Frame with a ``book_path`` column plus the ``labels`` and
            ``prediction`` columns consumed by ``calc_f1_score`` and
            ``calc_pk_wd``.

    Returns:
        A DataFrame with one row per book and the columns
        ``book``, ``f1``, ``pk`` and ``wd``.
    """
    results = []
    # groupby splits the frame once instead of re-filtering all rows for each
    # book; sort=False keeps the books in order of first appearance.
    for book, book_df in tqdm(df.groupby("book_path", sort=False)):
        f1 = calc_f1_score(book_df)
        pk, windowdiff = calc_pk_wd(book_df)
        results.append({"book": book, "f1": f1, "pk": pk, "wd": windowdiff})
    return pd.DataFrame(results)

# Compute the per-book metrics (one row per book) and preview the first rows.
results_df = calculate_metrics(pred_df)
results_df.head()

100%|██████████| 1188/1188 [01:54<00:00, 10.37it/s]


Unnamed: 0,book,f1,pk,wd
0,riehl_ovidhofe.json,0.6,0.306818,0.439394
1,dumasalt_margot2.json,0.529412,0.321541,0.38558
2,liebernt_knutarne.json,0.5,0.21735,0.276285
3,wulffen_argobast.json,0.703704,0.199316,0.272883
4,verne_ferien2.json,0.857143,0.066841,0.100262


In [10]:
# Each figure is the unweighted mean of the per-book scores, i.e. a macro
# average over books (every book counts equally, regardless of its length).
print("Macro-averaged F1: ", results_df["f1"].mean())
print("Macro-averaged PK: ", results_df["pk"].mean())
print("Macro-averaged WD: ", results_df["wd"].mean())

Micro-averaged F1:  0.48614517939809415
Micro-averaged PK:  0.2865683274295492
Micro-averaged WD:  0.34652628305454225
