In [62]:
# !pip install fuzzywuzzy



In [82]:
from fuzzywuzzy import fuzz

def load_ner_entities(file_path):
    """Read (entity, type) pairs from a NER output text file.

    Each line is stripped and split on ``': '``. The entity is the text of
    the second field up to its first comma; the type is the third field.
    Lines presumably look like ``Entity: marc cruz, Type: PERSON`` -- this
    layout is inferred from the parsing; confirm against the file producer.

    Blank lines and lines with fewer than three ``': '``-separated fields
    are skipped rather than raising ``IndexError``, which matches how
    ``load_TFIDF_entities`` treats malformed lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(entity, type_value)`` string tuples in file order.
        Duplicates are kept.
    """
    entities = []
    with open(file_path, 'r') as file:
        for line in file:
            # Split once and reuse the fields for both entity and type.
            parts = line.strip().split(': ')
            if len(parts) < 3:
                # Blank or malformed line: nothing reliable to extract.
                continue
            # Everything after the first comma in the second field is the
            # label of the next key (e.g. ", Type"), so cut it off.
            entity = parts[1].split(',')[0].strip()
            type_value = parts[2].strip()
            entities.append((entity, type_value))
    return entities

def load_TFIDF_entities(file_path):
    """Read (entity, score) pairs from a TF-IDF analysis text file.

    Each line is stripped and split on ``': '``; lines that yield fewer than
    three fields (including blank lines) are ignored. The entity is the
    second field up to its first comma. The score comes from the third
    field: the literal string ``"Not Found"`` if that text appears in it,
    otherwise the field converted with ``float``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(entity, score)`` tuples in file order, where ``score``
        is either a float or the string ``"Not Found"``.

    Raises:
        ValueError: If a score field is neither ``"Not Found"`` nor
            parseable as a float.
    """
    results = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(': ')
            if len(fields) < 3:
                continue
            name = fields[1].split(',')[0].strip()
            raw_score = fields[2]
            # fields[2] cannot contain ': ' (it came from that split), so it
            # can be converted directly.
            score = "Not Found" if "Not Found" in raw_score else float(raw_score.strip())
            results.append((name, score))
    return results



def bio_tagging(ner_entities, tfidf_entities):
    """Assign a tag to every TF-IDF entity using the NER labels.

    An entity is tagged ``"B-<label>"`` only when all of the following hold:
    it equals some entity in ``ner_entities`` (exact match), its score is not
    the string ``"Not Found"``, and the score converts to a float greater
    than 0.0. Every other entity is tagged ``"O"``.

    When the same entity appears more than once in ``ner_entities`` with
    different labels, the label of its first occurrence is used.

    Note: the whole entity string receives a single ``B-`` tag; no ``I-``
    tags are produced for multi-word entities.

    Args:
        ner_entities: Iterable of ``(entity, label)`` pairs. Entities must
            be hashable (they are strings when produced by
            ``load_ner_entities``).
        tfidf_entities: Iterable of ``(entity, score)`` pairs, where score
            is a number, a numeric string, or ``"Not Found"``.

    Returns:
        A list of ``(entity, tag)`` tuples, one per item of
        ``tfidf_entities`` and in the same order.
    """
    # Build the lookup once instead of rescanning ner_entities for every
    # TF-IDF entry. setdefault keeps the first label seen for an entity,
    # which is what a first-match linear scan would return.
    label_by_entity = {}
    for ner_entity, ner_label in ner_entities:
        label_by_entity.setdefault(ner_entity, ner_label)

    combined_entities = []
    for tfidf_entity, tfidf_score in tfidf_entities:
        tag = "O"
        if tfidf_entity in label_by_entity and tfidf_score != "Not Found":
            try:
                has_weight = float(tfidf_score) > 0.0
            except (TypeError, ValueError):
                # Missing or non-numeric score: treat as carrying no weight.
                has_weight = False
            if has_weight:
                tag = "B-" + label_by_entity[tfidf_entity]
        combined_entities.append((tfidf_entity, tag))
    return combined_entities



def write_biotagged_output(biotagged_entities, output_file):
    """Write tagged entities to a text file, one per line.

    Each ``(entity, label)`` pair becomes a line holding the entity, a tab
    character, and the label. The file is opened in write mode, so existing
    content is replaced.

    Args:
        biotagged_entities: Iterable of ``(entity, label)`` pairs.
        output_file: Path of the file to write.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(
            f"{entity}\t{label}\n" for entity, label in biotagged_entities
        )



In [83]:
# Input and output locations, relative to the notebook's working directory.
ner_file = "cleaned_datasets/NER_output_entities.txt"
tfidf_file = "cleaned_datasets/TD_IDF_Analysis_Output.txt"
output_file = "cleaned_datasets/bio_annotations.txt"

# Parse both inputs into lists of tuples: (entity, type) for the NER file and
# (entity, score) for the TF-IDF file.
ner_entities = load_ner_entities(ner_file)
tfidf_entities = load_TFIDF_entities(tfidf_file)

In [84]:
# Sanity check: show the (entity, type) pairs parsed from the NER file.
print(ner_entities)

[('2023', 'CARDINAL'), ('marc cruz', 'PERSON'), ('abdul kalam', 'PERSON'), ('max gross joshua estrada', 'PERSON'), ('jason mar josh', 'PERSON'), ('ng ethan', 'PERSON'), ('sarkis gafayan rubayet', 'ORG'), ('david jackson', 'PERSON'), ('21 2023', 'DATE'), ('1', 'CARDINAL'), ('2 2 fall 2023', 'DATE'), ('3 21', 'CARDINAL'), ('4 23', 'CARDINAL'), ('august', 'DATE'), ('231', 'CARDINAL'), ('4 232', 'CARDINAL'), ('5 24', 'CARDINAL'), ('5 25', 'CARDINAL'), ('7 251', 'CARDINAL'), ('7 2511', 'DATE'), ('7 2511', 'DATE'), ('8 252', 'CARDINAL'), ('9 26', 'CARDINAL'), ('october', 'DATE'), ('261', 'CARDINAL'), ('10 262', 'CARDINAL'), ('12 263', 'CARDINAL'), ('october', 'DATE'), ('13 271', 'CARDINAL'), ('13', 'CARDINAL'), ('15', 'CARDINAL'), ('17 4', 'CARDINAL'), ('17 5', 'CARDINAL'), ('18 1', 'CARDINAL'), ('2024', 'CARDINAL'), ('2025', 'CARDINAL'), ('2024', 'DATE'), ('four', 'CARDINAL'), ('two', 'CARDINAL'), ('85', 'CARDINAL'), ('11', 'CARDINAL'), ('1', 'CARDINAL'), ('semicircle quarter', 'DATE'), ('t

In [85]:
# Sanity check: show the (entity, score) pairs parsed from the TF-IDF file;
# a score is either a float or the string "Not Found".
print(tfidf_entities)

[('2023', 0.009765367732305392), ('marc cruz', 'Not Found'), ('abdul kalam', 'Not Found'), ('max gross joshua estrada', 'Not Found'), ('jason mar josh', 'Not Found'), ('ng ethan', 'Not Found'), ('sarkis gafayan rubayet', 'Not Found'), ('david jackson', 'Not Found'), ('21 2023', 'Not Found'), ('1', 'Not Found'), ('2 2 fall 2023', 'Not Found'), ('3 21', 'Not Found'), ('4 23', 'Not Found'), ('august', 0.017089393531534436), ('231', 0.004882683866152696), ('4 232', 'Not Found'), ('5 24', 'Not Found'), ('5 25', 'Not Found'), ('7 251', 'Not Found'), ('7 2511', 'Not Found'), ('7 2511', 'Not Found'), ('8 252', 'Not Found'), ('9 26', 'Not Found'), ('october', 0.021972077397687133), ('261', 0.004882683866152696), ('10 262', 'Not Found'), ('12 263', 'Not Found'), ('october', 0.021972077397687133), ('13 271', 'Not Found'), ('13', 0.009765367732305392), ('15', 0.002441341933076348), ('17 4', 'Not Found'), ('17 5', 'Not Found'), ('18 1', 'Not Found'), ('2024', 0.009765367732305392), ('2025', 0.00244

In [86]:
# Tag every TF-IDF entity using the NER labels.
biotagged_entities = bio_tagging(ner_entities, tfidf_entities)

# Persist the result as tab-separated "entity<TAB>label" lines.
write_biotagged_output(biotagged_entities, output_file)