In [18]:
import pandas as pd
import random
import json

fda_df = pd.read_csv('FDA_pills.csv')

fake_addresses = [
    "36th Floor, The Finance Center, 26th Street, BGC, Taguig", 
    "No. 66 United Street, Mandaluyong City, Metro Manila", 
    "UNILAB Pharma Campus, Barangay Mamplasan, Biñan Laguna, Philippines",
    "Unit 12A, Ayala Triangle Gardens Tower 2, Makati City",
    "15th Floor, Net Park Building, 5th Avenue, Bonifacio Global City",
    "789 Sampaloc Avenue, Manila",
    "22 Acacia Street, Quezon City",
    "Lot 10, Industrial Park, Calamba, Laguna",
    "Building 5, TechnoHub, UP Diliman, Quezon City",
    "456 Rizal Avenue Extension, Pasay City",
    "901 Ortigas Center, Pasig City",
    "123 Barangay Poblacion, San Juan",
    "Suite 8B, Rockwell Center, Makati",
    "Km. 25, South Superhighway, Muntinlupa",
    "Warehouse 7, Laguna Technopark, Santa Rosa",
    "111 Congressional Avenue, Quezon City",
    "202 Shaw Boulevard, Mandaluyong",
    "333 Alabang-Zapote Road, Las Piñas",
    "444 C5 Road, Taguig",
    "555 EDSA, Cubao, Quezon City",
    "666 Visayas Avenue, Quezon City",
    "777 Mindanao Avenue, Quezon City",
    "888 Aurora Boulevard, Quezon City",
    "999 España Boulevard, Manila",
    "1010 Taft Avenue, Manila",
    "1212 Roxas Boulevard, Manila",
    "1313 Makati Avenue, Makati City",
    "1414 Ayala Avenue, Makati City",
    "1515 Paseo de Roxas, Makati City",
    "1616 BGC High Street, Taguig",
    "1717 Eastwood City, Quezon City",
    "1818 Araneta Center, Quezon City",
    "1919 Greenbelt, Makati City"
]

# Sample warning texts
warnings = [
    "Do not accept if seal is broken",
    "Not recommended for pregnant women",
    "Keep out of reach of children",
    "Store in a cool, dry place away from sunlight"
]

new_warnings = [
    "consult your doctor before use.",
    "for external use only.",
    "discontinue use if irritation occurs.",
    "avoid contact with eyes.",
    "may cause drowsiness.",
    "do not use with other medications.",
    "keep away from direct sunlight.",
    "use only as directed.",
    "do not exceed recommended dosage.",
    "seek medical advice if symptoms persist.",
    "not for children under 12 years.",
    "may interact with alcohol.",
    "store at room temperature.",
    "shake well before using.",
    "do not freeze.",
    "may cause allergic reactions.",
    "read the label carefully.",
    "for professional use only.",
    "may impair ability to drive or operate machinery.",
    "do not use if pregnant or breastfeeding.",
    "report any side effects to your doctor.",
    "may cause skin sensitivity.",
    "do not crush or chew.",
    "swallow whole.",
    "may cause gastrointestinal upset.",
    "use with caution in elderly patients.",
    "may cause changes in blood pressure.",
    "do not use if you have a history of seizures.",
    "may increase risk of bleeding.",
    "may affect blood sugar levels."
]

warnings.extend([w.upper() if random.random() < 0.3 else w.lower() if random.random() < 0.3 else w.capitalize() for w in new_warnings])


In [23]:
# Define templates for synthetic OCR-like text
TEMPLATES = [
    "{warning} {generic} {brand} {dosage} {form} {classification} R Manufactured by {manufacturer}, {address}. {packaging} {importer}",
    "{generic} {brand} {dosage} {form} {classification} Manufactured by {manufacturer} at {address}. {packaging}. Imported by {importer}.",
    "{generic} {dosage} {form} {classification} {packaging}. Distributed by {distributor}, {address}.",
    "{warning} {generic} {brand} {dosage} {form} {packaging}. Distributor: {distributor}, {address}. Manufacturer: {manufacturer}.",
    "{brand}  {generic} {dosage} {form}. {classification}. {packaging}. {warning} Imported by {importer} at {address}."
]

# Generate synthetic NER data
TRAIN_DATA = []

for _, row in fda_df.iterrows():
    # Select random address and warning
    address = random.choice(fake_addresses)
    warning = random.choice(warnings)
    
    # Fill template with medication details
    sentence = random.choice(TEMPLATES).format(
        generic=row["Generic Name"],
        brand=row["Brand Name"],
        dosage=row["Dosage Strength"],
        form=row["Dosage Form"],
        classification=row["Classification"],
        packaging=row["Packaging"],
        manufacturer=row["Manufacturer"],
        importer=row["Importer"],
        distributor=row["Distributor"],
        address=address,
        warning=warning
    )
    
    

    # Convert all values to strings and replace NaNs with empty strings
    row = row.fillna("").astype(str)
    # Track used spans to prevent overlapping entities
    used_spans = set()

    # Define entity spans
    entities = []
    for col, label in zip(
        ["Generic Name", "Brand Name", "Dosage Strength", "Dosage Form", "Classification", "Packaging", "Manufacturer", "Importer", "Distributor"],
        ["GENERIC_NAME", "BRAND_NAME", "DOSAGE_STRENGTH", "DOSAGE_FORM", "CLASSIFICATION", "PACKAGING", "MANUFACTURER", "IMPORTER", "DISTRIBUTOR"]
    ):
        value = row[col]
        if value:  # Ensure the value is not empty
            start = sentence.find(value)
            end = start + len(value)
            
            # Check if the span overlaps with any existing ones
            if start != -1 and not any(s <= start < e or s < end <= e for s, e, _ in entities):
                entities.append((start, end, label))
                used_spans.add((start, end))  # Track this span

        
    # Append to training data
    TRAIN_DATA.append((sentence, {"entities": entities}))

# Save the dataset
with open("synthetic_ner_ocr.json", "w") as f:
    json.dump(TRAIN_DATA, f, indent=4)

# Print sample output
for sample in TRAIN_DATA[:5]:
    print(sample)


("Lactobacillus gasseri + Lactobacillus rhamnosus Ecovag Formulation: Each hard vaginal capsule contains: Lactobacillus gasseri EB01 strain........Min. 10^8 cfu Lactobacillus rhamnosus PB01 strain..........Min. 10^8 cfu Hard Vaginal Capsule Prescription Drug (RX) Manufactured by Deerland Priobiotics &amp; Enzymes A/S at 333 Alabang-Zapote Road, Las Piñas. Aluminum tube with polyethylene desiccant stopper x 10's (Box of 1's and 4's). Imported by Zeus Resources, Inc..", {'entities': [(0, 47, 'GENERIC_NAME'), (48, 54, 'BRAND_NAME'), (55, 217, 'DOSAGE_STRENGTH'), (218, 238, 'DOSAGE_FORM'), (239, 261, 'CLASSIFICATION'), (356, 433, 'PACKAGING'), (278, 316, 'MANUFACTURER'), (447, 467, 'IMPORTER')]})
("Not recommended for pregnant women Affinity-Purified Antibodies To Human Interferon Gamma Anaferon 3 mg Orodispersible Tablet Alu/PVC blister pack x 20's (Box of 20's). Distributor: Metro Drug, Inc., 666 Visayas Avenue, Quezon City. Manufacturer: “NPF” Materia Medica Holding.", {'entities': [(35

In [24]:
import spacy
from spacy.training.example import Example
import json

# Load synthetic data
with open("synthetic_ner_ocr.json", "r") as f:
    TRAIN_DATA = json.load(f)

# Load a blank model
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Add entity labels
for _, annotations in TRAIN_DATA:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

# Training loop
optimizer = nlp.begin_training()

for epoch in range(10):
    for text, annotations in TRAIN_DATA:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], drop=0.2)

# Save trained model
nlp.to_disk("medication_ner")




KeyboardInterrupt: 

In [None]:

# Test the model
nlp = spacy.load("medication_ner")
doc = nlp("Levothyroxine sodium 50 mcg Tablet is manufactured by Merck Inc. and is available in 100 Tablets packaging.")

for ent in doc.ents:
    print(ent.text, ent.label_)
