In [2]:
import re
import unicodedata

import numpy as np
import pandas as pd
from rapidfuzz import fuzz, process

pd.set_option('display.max_colwidth', None)  # Show full content of each cell
pd.set_option('display.expand_frame_repr', False)  # Avoid line wrapping

# medicine list

In [26]:
medicine_list = [
    # English Medicines
    "Cyclosporine", "Mebendazole", "Pilocarpine", "Calcium", "Hydrocortisone", "Memantine",
    "Hydroquinone", "Loratadine", "Guaifenesin", "Retinol", "Hydroxyzine", "Carbocisteine",
    "Montelukast", "Dexamethasone", "Niacinamide", "Cefdinir", "Albendazole", "Gabapentin",
    "Levetiracetam", "Zinc", "Chlorhexidine", "Diclofenac", "Prednisolone", "Botox",
    "Dextromethorphan", "Lidocaine", "Metronidazole", "Acetylcysteine", "Ciprofloxacin",
    "Clindamycin", "Probiotics", "Erythromycin", "Paracetamol", "Omeprazole",
    "Fluorometholone", "Azithromycin", "Nystatin", "Valproate", "Pramipexole",
    "Carbamazepine", "Tretinoin", "Ofloxacin", "Tobramycin", "Timolol", "Ropinirole",
    "Brimonidine", "Pregabalin", "Rivastigmine", "Amantadine", "Ranitidine",
    "Salbutamol", "Domperidone", "Vitamin", "Dorzolamide", "Cetirizine", "Cefixime",
    "Topiramate", "Iron", "Latanoprost", "Multivitamins", "Donepezil", "Hyaluronic",
    "Ibuprofen", "Ondansetron", "Mupirocin", "Amoxicillin", "Doxycycline","Carbachol", 
    "Oral Amoxicillin",
    
    # Arabic Medicines
    "حمض الجليكوليك", "حمض الساليسيليك", "الببتيدات",
    "أملاح", "إريثرومايسين", "باراسيتامول", "دوكسيسيكلين", "ميبيندازول",
    "الزنك", "كاربوسيستين", "رانيتيدين", "أوندانسيترون", "مونتيلوكاست",
    "كلورهيكسيدين", "ديكلوفيناك", "ليدوكايين", "بريجابالين", "أزيثروميسين",
    "فيتامين", "نياسيناميد", "غوايفينيسين", "فالبروات", "أموكسيسيلين",
    "هيدروكسيزين", "البوتوكس", "لاتانوبروست", "توبراميسين", "جابابنتين",
    "ديكساميثازون", "ديكستروميثورفان", "سيبروفلوكساسين", "ليفيتيراسيتام",
    "الكالسيوم", "تيمولول", "روبينيرول", "الهيالورونيك", "كاربامازيبين",
    "نيستاتين", "إيبوبروفين", "بريمونيدين", "ألبيندازول", "هيدروكورتيزون",
    "ريفاستجمين", "موبيروسين", "كليندامايسين", "أسيتيل", "هيدروكينون",
    "البروبيوتيك", "ميترونيدازول", "دونيبيزيل", "أمانتادين", "بيلوكاربين",
    "سيكلوسبورين", "تريتينوين", "ريتينول", "الحديد", "أوفلوكساسين",
    "فلوروميثولون", "بريدنيزولون", "سيفيكسيم", "سيفدينير", "كربونات", "كارباشول", 
]

# frequency list

In [27]:
frequency_patterns = [
    ## 🔹 English Frequencies
    r"\bevery\s\d+\shours?\b",            # "every 6 hours"
    r"\bEvery\shours\b",
    r"\bevery\s\d+\s?-\s?\d+\shours?\b",  # "every 4-6 hours"
    r"\bonce\sdaily\b",                   # "once daily"
    r"\btwice\sdaily\b",                  # "twice daily"
    r"\bthree\stimes\sa\sday\b",          # "three times a day"
    r"\bfour\stimes\sa\sday\b",           # "four times a day"
    r"\b\d+\stimes\sa\sday\b",            # "5 times a day"
    
    ## ⏳ Time-Based
    r"\bevery\sother\sday\b",             # "every other day"
    r"\bevery\s\d+\sdays?\b",             # "every 3 days"
    r"\bevery\s\d+\sweeks?\b",            # "every 2 weeks"
    r"\bevery\s\d+\smonths?\b",           # "every 6 months"
    
    ## 🌙 Morning/Evening
    r"\bin\sthe\smorning\b",              # "in the morning"
    r"\bin\sthe\sevening\b",              # "in the evening"
    r"\bin\sthe\safternoon\b",            # "in the afternoon"
    r"\bin\sthe\snight\b",                # "in the night"
    r"\bdaily\sat\snoon\b",               # "daily at noon"
    
    ## 🍽 Meal-Based
    r"\bbefore\smeals?\b",                # "before meals"
    r"\bbefore\sbreakfast?\b",            # "before breakfast"
    r"\bafter\smeals?\b",                 # "after meals"
    r"\bafter\sbreakfast?\b",             # "after breakfast"
    r"\bbefore\sfood\b",                  # "before food"
    r"\bafter\sfood\b",                   # "after food"
    r"\bon\san\sempty\sstomach\b",        # "on an empty stomach"
    
    ## 🌙 Sleep
    r"\bbefore\sbedtime\b",               # "before bedtime"
    r"\bat\sbedtime\b",                   # "at bedtime"
    r"\bbefore\sgoing\sto\sbed\b",        # "before going to bed"
    
    ## 🔄 PRN (As Needed)
    r"\bas\sneeded\b",                    # "as needed"
    r"\bif\sneeded\b",                    # "if needed"
    r"\bwhen\snecessary\b",               # "when necessary"
    r"\bwhen\srequired\b",                # "when required"
    r"\bwhen\sfeeling\spain\b",           # "when feeling pain"
    
    ## 🚑 Perioperative
    r"\bbefore\ssurgery\b",               # "before surgery"
    r"\bafter\ssurgery\b",                # "after surgery"
    r"\bbefore\san\soperation\b",         # "before an operation"
    r"\bafter\san\soperation\b",          # "after an operation"

    ## 🔹 Arabic Frequencies
    r"\bكل\s\d+\sساعة\b",              # "كل 8 ساعات" (every X hours)
    r"\bكل\s\d+\s?-\s?\d+\sساعات?\b",  # "كل 4-6 ساعات" (every X-Y hours)
    r"\bمرة\sيوميا\b",                 # "مرة يوميًا" (once daily)
    r"\bمرة\sكل\sيوم\b",               # "مرة كل يوم" (once per day)
    r"\bمرة\sيوميا\b",                 # "مرة يوميا" (once per day)
    r"\bمرة\sأسبوعيا\b",               # "مرة أسبوعيًا" (once weekly)
    r"\bمرة\sكل\sأسبوع\b",             # "مرة كل أسبوع" (once per week)
    r"\bمرة\sشهريا\b",                 # "مرة شهريًا" (once monthly)
    r"\bمرة\sكل\sشهر\b",               # "مرة كل شهر" (once per month)
    r"\bمرتين\sيوميا\b",               # "مرتين يوميًا" (twice daily)
    r"\b\d+\sمرات?\sيوميا\b",          # "3 مرات يوميًا" (multiple times daily)
    
    ## ⏳ Time-Based
    r"\bكل\sيومين\b",                   # "كل يومين" (every other day)
    r"\bكل\s\d+\sأيام\b",               # "كل 3 أيام" (every X days)
    r"\bكل\s\d+\sأسابيع\b",             # "كل 2 أسابيع" (every X weeks)
    r"\bكل\s\d+\sشهور\b",               # "كل 6 شهور" (every X months)

   ## Arabic (Word-Based Numbers)
    r"\bكل\sساعة\b",                     # "كل واحدة ساعة" (every one hour)
    r"\bكل\sساعتين\b",                   # "كل اثنتين ساعة" (every two hours)
    r"\bكل\sثلاث\sساعات\b",               # "كل ثلاث ساعات" (every three hours)
    r"\bكل\sأربع\sساعات\b",              # "كل أربع ساعات" (every four hours)
    r"\bكل\sخمس\sساعات\b",               # "كل خمس ساعات" (every five hours)
    r"\bكل\sست\sساعات\b",                # "كل ست ساعات" (every six hours)
    r"\bكل\sسبع\sساعات\b",               # "كل سبع ساعات" (every seven hours)
    r"\bكل\sثماني\sساعات\b",             # "كل ثماني ساعات" (every eight hours)
    r"\bكل\sتسع\sساعات\b",               # "كل تسع ساعات" (every nine hours)
    r"\bكل\sعشر\sساعات\b",               # "كل عشر ساعات" (every ten hours)
    r"\bكل\sإحدى\sعشرة\sساعة\b",         # "كل إحدى عشرة ساعة" (every 11 hours)
    r"\bكل\sاثنتي\sعشرة\sساعة\b",        # "كل اثنتي عشرة ساعة" (every 12 hours)
    
    ## 🌙 Morning/Evening
    r"\bفي\sالصباح\b",                  # "في الصباح" (in the morning)
    r"\bفي\sالمساء\b",                  # "في المساء" (in the evening)
    r"\bفي\sالظهيرة\b",                 # "في الظهيرة" (at noon)
    r"\bفي\sالليل\b",                   # "في الليل" (at night)
    
    ## 🍽 Meal-Based
    r"\bقبل\sالأكل\b",                  # "قبل الأكل" (before meals)
    r"\bبعد\sالأكل\b",                  # "بعد الأكل" (after meals)
    r"\bقبل\sالطعام\b",                 # "قبل الطعام" (before food)
    r"\bبعد\sالطعام\b",                 # "بعد الطعام" (after food)
    r"\bعلى\sمعدة\sفارغة\b",            # "على معدة فارغة" (on an empty stomach)
    r"\bعلى\sالريق\b",                   # "على الريق" (fasting)
    
    ## 🌙 Sleep
    r"\bقبل\sالنوم\b",                 # "قبل النوم" (before sleep)
    r"\bعند\sالنوم\b",                 # "عند النوم" (at bedtime)
    
    ## 🔄 PRN (As Needed)
    r"\bعند\sاللزوم\b",                # "عند اللزوم" (as needed)
    r"\bحسب\sالحاجة\b",                # "حسب الحاجة" (as required)
    r"\bإذا\sاستدعت\sالحاجة\b",         # "إذا استدعت الحاجة" (if necessary)
    r"\bعند\sالشعور\sبالألم\b",          # "عند الشعور بالألم" (when feeling pain)
    
    ## 🚑 Perioperative
    r"\bقبل\sالعملية\b",              # "قبل العملية" (before surgery)
    r"\bبعد\sالعملية\b",              # "بعد العملية" (after surgery)
    r"\bقبل\sالتدخل\sالجراحي\b",     # "قبل التدخل الجراحي" (before an operation)
    r"\bبعد\sالتدخل\sالجراحي\b",     # "بعد التدخل الجراحي" (after an operation)
]

In [5]:
# Load dataset (Assuming CSV)
df = pd.read_excel("Train.xlsx")

# Specify the column containing extracted prescription text
target_column = "Prescription"

In [6]:
df[target_column].head

<bound method NDFrame.head of 0                                                                                Carbachol Every 8 hours
1         Ofloxacin كل ست  ساعات , Brimonidine كل ٦ ساعات , كارباشول عند اللزوم , Latanoprost مرة يومياً
2                                                                           Brimonidine Before breakfast
3          Carbachol عند اللزوم , Tobramycin عند اللزوم , Pilocarpine كل ٦ ساعات , لاتانوبروست قبل النوم
4                                                      Fluorometholone As needed , Latanoprost As needed
                                                     ...                                                
618    Doxycycline مرة يومياً , كليندامايسين قبل النوم , موبيروسين مرتين يومياً , كلورهيكسيدين قبل النوم
619                                                                              ديكلوفيناك مرتين يومياً
620                                                                                  Nystatin بعد الغداء
621                      

In [7]:
def extract_medicine_and_frequency(text):
    """
    Extracts medicine names and dosage frequency while maintaining the original order.
    Returns a list of dictionaries [{medicine: "name", frequency: "value"}].
    """
    if not isinstance(text, str):  # Ensure text is a string
        return []
    
    structured_output = []
    
    # Combine medicine names and frequency patterns into a single regex pattern
    combined_pattern = r"|".join(
        [re.escape(med) for med in medicine_list] + frequency_patterns
    )

    # Find all matches in the order they appear in the text
    matches = re.findall(combined_pattern, text, re.IGNORECASE)
    
    current_medicine = None  # Track last detected medicine
    
    for match in matches:
        if match in medicine_list:  
            current_medicine = match  # Update the latest detected medicine
            structured_output.append({"medicine": match, "frequency": ""})  # Default frequency
        
        elif current_medicine:  
            # Assign the frequency to the last detected medicine **only if it does not have one yet**
            if structured_output[-1]["frequency"] == "":
                structured_output[-1]["frequency"] = match  
            else:
                # If previous medicine already has a frequency, store this frequency separately
                structured_output.append({"medicine": current_medicine, "frequency": match})
    
    return structured_output

In [8]:
# Apply extraction function to dataset
df["structured_prescriptions"] = df[target_column].apply(extract_medicine_and_frequency)

# Convert the entire dataset into a list of lists of dictionaries
structured_output = df["structured_prescriptions"].tolist()

# Print the final structured output
df["structured_prescriptions"].head()

0                                                                                                                                                             [{'medicine': 'Carbachol', 'frequency': 'Every 8 hours'}]
1                [{'medicine': 'Ofloxacin', 'frequency': ''}, {'medicine': 'Brimonidine', 'frequency': ''}, {'medicine': 'كارباشول', 'frequency': 'عند اللزوم'}, {'medicine': 'Latanoprost', 'frequency': 'مرة يوميا'}]
2                                                                                                                                                        [{'medicine': 'Brimonidine', 'frequency': 'Before breakfast'}]
3    [{'medicine': 'Carbachol', 'frequency': 'عند اللزوم'}, {'medicine': 'Tobramycin', 'frequency': 'عند اللزوم'}, {'medicine': 'Pilocarpine', 'frequency': ''}, {'medicine': 'لاتانوبروست', 'frequency': 'قبل النوم'}]
4                                                                                                    [{'medicine': 'Fluorometholone', 'f

In [9]:
def extract_medicine_and_frequency(text):
    if not isinstance(text, str):  # Ensure text is a string
        return []
    
    structured_output = []
    
    # Combine medicine names and frequency patterns into a single regex pattern
    combined_pattern = r"|".join(
        [re.escape(med) for med in medicine_list] + frequency_patterns
    )

    # Find all matches in the order they appear in the text
    matches = re.findall(combined_pattern, text, re.IGNORECASE)
    
    current_medicine = None  # Track last detected medicine
    
    for match in matches:
        if match in medicine_list:  
            current_medicine = match  # Update the latest detected medicine
            structured_output.append({"medicine": match, "frequency": ""})  # Default frequency
        
        elif current_medicine:  
            # Assign the frequency to the last detected medicine **only if it does not have one yet**
            if structured_output[-1]["frequency"] == "":
                structured_output[-1]["frequency"] = match  
            else:
                # If previous medicine already has a frequency, store this frequency separately
                structured_output.append({"medicine": current_medicine, "frequency": match})
    
    return structured_output

def remove_diacritics(text):
    # Normalize the text to decompose characters into base characters and diacritics
    normalized_text = unicodedata.normalize('NFKD', text)
    
    # Filter out combining characters (diacritics)
    cleaned_text = ''.join(
        char for char in normalized_text
        if not unicodedata.combining(char)
    )
    return cleaned_text

In [10]:
# text = "Ofloxacin كل ست ساعات , Brimonidine كل ٦ ساعات , كارباشول عند اللزوم , Latanoprost مرة يومياً"
# text = "Carbachol عند اللزوم , Tobramycin عند اللزوم , Pilocarpine كل ٦ ساعات , لاتانوبروست قبل النوم"
text = "Doxycycline مرة يومياً , كليندامايسين قبل النوم , موبيروسين مرتين يومياً , كلورهيكسيدين قبل النوم"

text = remove_diacritics(text)
result = extract_medicine_and_frequency(text)
result

[{'medicine': 'Doxycycline', 'frequency': 'مرة يوميا'},
 {'medicine': 'كليندامايسين', 'frequency': 'قبل النوم'},
 {'medicine': 'موبيروسين', 'frequency': 'مرتين يوميا'},
 {'medicine': 'كلورهيكسيدين', 'frequency': 'قبل النوم'}]

# extract function

In [None]:
DEFAULT_FREQUENCY = "Every 6 hours"

def extract_medicine_and_frequency(sentence, medicine_list, frequency_patterns):
    extracted_medicines = []
    extracted_frequencies = []
    medicine_positions = {}
    frequency_positions = {}

    # Step 1: Extract medicines and their positions
    for med in medicine_list:
        match = re.search(rf"\b{re.escape(med)}\b", sentence)
        if match:
            extracted_medicines.append(med)
            medicine_positions[med] = match.start()

    # Step 2: Extract frequencies and their positions
    for pattern in frequency_patterns:
        pattern_re = re.compile(pattern)
        matches = pattern_re.finditer(sentence)
        for match in matches:
            extracted_frequencies.append(match.group())
            frequency_positions[match.group()] = match.start()

    # Step 3: Sort medicines and frequencies by their position
    sorted_medicines = sorted(medicine_positions.items(), key=lambda x: x[1])
    sorted_frequencies = sorted(frequency_positions.items(), key=lambda x: x[1])

    # Step 4: Pair medicines with frequencies
    extracted_results = []
    freq_index = 0
    last_known_frequency = None
    
    for i, (med, med_pos) in enumerate(sorted_medicines):
        extracted_frequency = "Unknown"
        next_med_pos = sorted_medicines[i + 1][1] if i + 1 < len(sorted_medicines) else len(sentence)

        # Check if there are words between this medicine and the next
        words_between = sentence[med_pos + len(med):next_med_pos].strip()
        if words_between and len(words_between.split()) <= 3:
            extracted_frequency = words_between
        else:
            # Try to match frequency after the medicine (if ordered correctly)
            while freq_index < len(sorted_frequencies):
                freq, freq_pos = sorted_frequencies[freq_index]
                if freq_pos > med_pos and freq_pos < next_med_pos:
                    extracted_frequency = freq
                    last_known_frequency = freq
                    freq_index += 1
                    break
                freq_index += 1

            # If no direct match, use last known frequency
            if extracted_frequency == "Unknown" and last_known_frequency:
                extracted_frequency = last_known_frequency

            # If still unknown, use default
            if extracted_frequency == "Unknown":
                extracted_frequency = DEFAULT_FREQUENCY

        extracted_results.append((med, extracted_frequency))
    
    return extracted_results

[('Doxycycline', 'مرة يومياً ,'), ('كليندامايسين', 'قبل النوم ,'), ('موبيروسين', 'مرتين يومياً ,'), ('كلورهيكسيدين', 'قبل النوم')]
[('Paracetamol', 'hours Every'), ('Oral Amoxicillin', 'Twice daily'), ('Amoxicillin', 'other lunch After'), ('Erythromycin', 'Twice daily'), ('Nystatin', 'Twice daily')]


In [None]:
# text = "Doxycycline مرة يومياً , كليندامايسين قبل النوم , موبيروسين مرتين يومياً , كلورهيكسيدين قبل النوم"
text = "Paracetamol hours Every Oral Amoxicillin other lunch After Erythromycin Nystatin"

print(text)
text = remove_diacritics(text)
result = extract_medicine_and_frequency(text, medicine_list, frequency_patterns)
result

الببتيدات ساعة كل الساليسيليك كربونات اللاكتيك الجليكوليك حمض اللزوم عند


[('الببتيدات', 'ساعة كل الساليسيليك'), ('كربونات', 'Twice daily')]