#### Unlabeled span-f1

In [50]:
def readBIO(path):
    ents = []
    curEnts = []
    for line in open(path, encoding = "utf-8"):
        line = line.strip()
        if line == '':
            ents.append(curEnts)
            curEnts = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            curEnts.append(line.split('\t')[2][0])
    return ents

def toSpans(tags):
    spans = set()
    for beg in range(len(tags)):
        if tags[beg][0] == 'B':
            end = beg
            for end in range(beg+1, len(tags)):
                if tags[end][0] != 'I':
                    break
            spans.add(str(beg) + '-' + str(end))
    #print(spans)
    return spans

def getInstanceScores(predPath, goldPath):
    goldEnts = readBIO(goldPath)
    predEnts = readBIO(predPath)
    entScores = []
    tp = 0
    fp = 0
    fn = 0
    for goldEnt, predEnt in zip(goldEnts, predEnts):
        goldSpans = toSpans(goldEnt)
        #print("goldSpans", goldSpans)
        predSpans = toSpans(predEnt)
        #print("predSpans", predSpans)
        overlap = len(goldSpans.intersection(predSpans))
        #print("n_overlap", overlap)
        #print("overlap", goldSpans.intersection(predSpans))
        tp += overlap
        fp += len(predSpans) - overlap
        fn += len(goldSpans) - overlap
        
    prec = 0.0 if tp+fp == 0 else tp/(tp+fp)
    rec = 0.0 if tp+fn == 0 else tp/(tp+fn)
    f1 = 0.0 if prec+rec == 0.0 else 2 * (prec * rec) / (prec + rec)
    return f1

In [51]:
true = readBIO("data/en_ewt-ud-test.iob2")
preds = readBIO("data/english_on_en_predictions")
getInstanceScores( "data/english_on_en_predictions", "data/en_ewt-ud-test.iob2")

0.832468145351581

##### Testing all the unlabeled span f1 scores

In [55]:
model_language = "english"



golden_files = ["data/zh_pud-ud-test.iob2",
                   "data/da_ddt-ud-test.iob2",
                   "data/en_ewt-ud-test.iob2",
                   "data/de_pud-ud-test.iob2",
                   "data/pt_pud-ud-test.iob2",
                   "data/ru_pud-ud-test.iob2",
                   "data/sk_snk-ud-test.iob2",
                   "data/sv_pud-ud-test.iob2",
                   "data/sr_test.iob2",
                   "data/hr_test.iob2"

                ]

predictions = [f"data/{model_language}_on_zh_predictions",
               f"data/{model_language}_on_da_predictions",
               f"data/{model_language}_on_en_predictions",
               f"data/{model_language}_on_de_predictions",
               f"data/{model_language}_on_pt_predictions",
               f"data/{model_language}_on_ru_predictions",
               f"data/{model_language}_on_sk_predictions",
               f"data/{model_language}_on_sv_predictions",
               f"data/{model_language}_on_sr_predictions",
               f"data/{model_language}_on_hr_predictions"
               ]

span = "unlabel_span_f1.py"

In [57]:
for golden, preds in zip(golden_files, predictions):
    # Run the span script with the corresponding arguments
    f1 = getInstanceScores(preds, golden)
    
    # Print the result
    print(f"Result for {preds}:")
    print("Output:", f1, "\n")

Result for data/english_on_zh_predictions:
Output: 0.7299726455646737 

Result for data/english_on_da_predictions:
Output: 0.8313120176405734 

Result for data/english_on_en_predictions:
Output: 0.832468145351581 

Result for data/english_on_de_predictions:
Output: 0.8264308980921359 

Result for data/english_on_pt_predictions:
Output: 0.8531656937584193 

Result for data/english_on_ru_predictions:
Output: 0.8034894398530761 

Result for data/english_on_sk_predictions:
Output: 0.7027804410354745 

Result for data/english_on_sv_predictions:
Output: 0.8782567503552817 

Result for data/english_on_sr_predictions:
Output: 0.8187772925764193 

Result for data/english_on_hr_predictions:
Output: 0.8033909357678514 



#### Training size 

In [3]:
from datasets import Dataset
import numpy as np
import pandas as pd
def read_iob2_file(path):
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()

        if line:
            if line[0] == '#':
                continue
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    if current_tags != []:
        data.append((current_words, current_tags))

    df = pd.DataFrame(data, columns=['words', 'tags'])
    df['id'] = df.index
    df = df[['id', 'words', 'tags']]
    
    return df

In [5]:
train_data_eng = read_iob2_file("data\en_ewt-ud-train.iob2")
train_data_dan = read_iob2_file("data\da_ddt-ud-train.iob2")
train_data_cro = read_iob2_file("data\hr_train.iob2")
train_data_slo = read_iob2_file("data\sk_snk-ud-train.iob2")
train_data_ser = read_iob2_file("data\sr_train.iob2")
eng_size = train_data_eng.shape
dan_size = train_data_dan.shape
cro_size = train_data_cro.shape
slo_size = train_data_slo.shape
ser_size = train_data_ser.shape


(12543, 3)

In [None]:
print("size of english:", train_data_eng.shape) 

