In [23]:
def load_entities(file_path):
    """Read (entity, label) pairs from a tab-separated text file.

    Each line is stripped of surrounding whitespace; blank lines are
    skipped. A line with a single field is paired with the label 'O', a
    line with two tab-separated fields yields (field0, field1), and a line
    with more than two fields is reported on stdout and dropped.

    The file is opened in text mode with the platform default encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of (entity, label) tuples in file order.
    """
    pairs = []
    with open(file_path, 'r') as handle:
        for raw in handle:
            record = raw.strip()
            if not record:
                continue
            fields = record.split('\t')
            if len(fields) > 2:
                print(f"Ignoring invalid line: {record}")
                continue
            # A lone field carries no label, so fall back to 'O'.
            tag = fields[1] if len(fields) == 2 else 'O'
            pairs.append((fields[0], tag))
    return pairs

def bio_tagging(ner_entities, tfidf_entities, threshold=50):
    """Label each TF-IDF entity using its best fuzzy match among NER entities.

    For every TF-IDF entity, each NER entity is scored with
    ``fuzz.partial_ratio`` (``fuzz`` must already be in scope; presumably
    the fuzzywuzzy/thefuzz module with 0-100 scores - confirm). The first
    NER entity with the highest score that reaches ``threshold`` is the
    best match and supplies the label:

    * ``"B-<label>"`` when the TF-IDF entity is identical to the best match,
    * ``"I-<label>"`` when it only matches fuzzily,
    * ``"O"`` when no NER entity reaches the threshold.

    Args:
        ner_entities: Iterable of (entity, label) pairs from the NER step.
        tfidf_entities: Iterable of (entity, label) pairs; only the entity
            text is used, the second element is ignored.
        threshold: Minimum score for a NER entity to count as a match.
            Defaults to 50.

    Returns:
        A list of (tfidf_entity, tag) tuples, one per TF-IDF entity, in
        input order.
    """
    combined_entities = []
    for tfidf_entity, _ in tfidf_entities:
        matched = False
        best_match_score = 0
        best_match_entity = None
        best_match_label = None
        for ner_entity, ner_label in ner_entities:
            match_score = fuzz.partial_ratio(tfidf_entity, ner_entity)
            if match_score < threshold:
                continue
            # Always record the first qualifying match, then only strictly
            # better ones, so earlier entries win ties.
            if not matched or match_score > best_match_score:
                matched = True
                best_match_score = match_score
                best_match_entity = ner_entity
                best_match_label = ner_label

        if not matched:
            combined_entities.append((tfidf_entity, "O"))
            continue

        # Compare against the entity that actually won the match, not
        # whichever NER entity the inner loop happened to visit last.
        prefix = "B-" if tfidf_entity == best_match_entity else "I-"
        combined_entities.append((tfidf_entity, prefix + best_match_label))
    return combined_entities



def write_biotagged_output(biotagged_entities, output_file):
    """Write tagged entities to a file, one "entity<TAB>label" per line.

    The target file is opened in write mode (existing content is replaced)
    with the platform default encoding.

    Args:
        biotagged_entities: Iterable of (entity, label) pairs.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(
            f"{token}\t{tag}\n" for token, tag in biotagged_entities
        )

def main():
    """Run the tagging pipeline on the fixed dataset paths.

    Loads the NER entities and the TF-IDF entities from their files under
    ``cleaned_datasets/``, combines them with ``bio_tagging`` and writes
    the tagged result to ``cleaned_datasets/bio_annotations.txt``. The
    paths are relative, so they resolve against the current working
    directory.
    """
    ner_path = "cleaned_datasets/NER_output_entities.txt"
    tfidf_path = "cleaned_datasets/TD_IDF_Analysis_Output.txt"
    annotations_path = "cleaned_datasets/bio_annotations.txt"

    # The NER file is read first, then the TF-IDF file.
    tagged = bio_tagging(load_entities(ner_path), load_entities(tfidf_path))
    write_biotagged_output(tagged, annotations_path)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
