In [62]:
!pip install fuzzywuzzy



In [61]:
from fuzzywuzzy import fuzz

def load_entities(file_path):
    """Read entity names from a text file, one record per line.

    Each non-blank line is expected to look like ``<key>: <entity>[, ...]``.
    Only the first comma-separated field is inspected, and the entity is the
    text after the first ':' in that field, with surrounding whitespace
    removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(entity, 'O')`` tuples in file order.

    NOTE(review): the second tuple element is always the placeholder ``'O'``;
    anything after the first comma on a line (e.g. a label or score) is
    discarded here, so callers that need it will not get it from this
    function.
    """
    entities = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            first_field = line.split(',', 1)[0]
            # Split on the FIRST colon only, so values that contain ':'
            # themselves (URLs, times, ...) are kept whole.
            _key, colon, value = first_field.partition(':')
            if not colon:
                # No "key: value" separator in the first field (e.g. a header
                # line): skip it rather than abort the load with IndexError.
                continue
            entities.append((value.strip(), 'O'))
    return entities

def _has_positive_score(score):
    """Return True when *score* converts to a float strictly greater than 0."""
    try:
        return float(score) > 0.0
    except (TypeError, ValueError):
        # Covers the "Not Found" marker, other non-numeric text, and
        # non-string values such as None.
        return False


def bio_tagging(ner_entities, tfidf_entities):
    """Tag each TF-IDF entity with ``B-<label>`` or ``O``.

    An entity receives ``"B-" + label`` only when both conditions hold:
    it is exactly equal to an entity in ``ner_entities``, and its score
    converts to a float greater than zero. Every other entity is tagged
    ``"O"``.

    Args:
        ner_entities: Iterable of ``(entity, label)`` pairs. When the same
            entity appears more than once, the first label wins. Entities
            must be hashable (strings in practice).
        tfidf_entities: Iterable of ``(entity, score)`` pairs. ``score`` may
            be a number or a string; ``"Not Found"`` and other non-numeric
            values count as "no score".

    Returns:
        A list of ``(entity, tag)`` tuples, one per item of
        ``tfidf_entities`` and in the same order.
    """
    # Build the lookup once instead of rescanning the NER list per entity;
    # setdefault keeps the first label seen for a repeated entity.
    label_by_entity = {}
    for ner_entity, ner_label in ner_entities:
        label_by_entity.setdefault(ner_entity, ner_label)

    combined_entities = []
    for tfidf_entity, tfidf_score in tfidf_entities:
        if tfidf_entity in label_by_entity and _has_positive_score(tfidf_score):
            tag = "B-" + label_by_entity[tfidf_entity]
        else:
            tag = "O"
        combined_entities.append((tfidf_entity, tag))
    return combined_entities



def write_biotagged_output(biotagged_entities, output_file):
    """Write ``(entity, label)`` pairs to *output_file*, one per line.

    Each line is the entity, a tab character, the label, and a newline.
    An existing file at that path is overwritten.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(
            f"{name}\t{tag}\n" for name, tag in biotagged_entities
        )

def main():
    """Run the tagging pipeline on the fixed files under ``cleaned_datasets/``.

    Loads the NER and TF-IDF entity lists, combines them with
    ``bio_tagging`` and writes the tagged result to the annotations file.
    """
    ner_path = "cleaned_datasets/NER_output_entities.txt"
    tfidf_path = "cleaned_datasets/TD_IDF_Analysis_Output.txt"
    annotations_path = "cleaned_datasets/bio_annotations.txt"

    # Arguments are evaluated left to right: NER file first, then TF-IDF.
    tagged = bio_tagging(load_entities(ner_path), load_entities(tfidf_path))
    write_biotagged_output(tagged, annotations_path)

# Run the pipeline only when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()
