In [8]:
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np

# Load the FDA registry and the RX reference tables in one pass.
fda_df, rx_df = (pd.read_csv(csv_name) for csv_name in ('FDA_ALL.csv', 'RX_ALL.csv'))


def match_with_fda(match_string, category=None, indexes=None):
    """Rank rows of the module-level ``fda_df`` by fuzzy similarity to a query.

    Every candidate row is turned into a lower-cased text entry and scored
    against the lower-cased ``match_string`` with ``fuzz.token_set_ratio``.

    Args:
        match_string: Free text to match against the table.
        category: Name of a single ``fda_df`` column to compare against. When
            ``None``, the Generic Name, Brand Name, Manufacturer, Dosage
            Strength, Packaging and Importer columns are combined.
        indexes: Iterable of positional row numbers (as accepted by ``iloc``)
            to restrict the search to. ``None`` scans every row.

    Returns:
        A ``(matches, scores)`` pair of equal-length tuples ordered by
        descending score, where ``matches`` holds the rows' ``INDEX`` column
        values. Both tuples are empty when there are no candidate rows.
    """
    default_fields = ('Generic Name', 'Brand Name', 'Manufacturer',
                      'Dosage Strength', 'Packaging', 'Importer')
    fields = default_fields if category is None else (category,)

    if indexes is None:
        indexes = range(len(fda_df))

    # Normalise the query once instead of on every row.
    query = match_string.lower()

    scored = []
    for i in indexes:
        row = fda_df.iloc[i]  # single positional lookup per row
        # Skip missing cells so the literal text "nan" never takes part in the
        # match, and join with spaces so tokens from neighbouring columns do
        # not fuse into one unmatchable word.
        parts = [str(row[field]) for field in fields if pd.notna(row[field])]
        entry = " ".join(parts).lower()
        scored.append((row['INDEX'], fuzz.token_set_ratio(entry, query)))

    if not scored:
        # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
        return (), ()

    # list.sort is stable, so equally scored rows keep their scan order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    matches, scores = zip(*scored)
    return matches, scores

In [44]:
from fuzzywuzzy import fuzz

def match_with_fda_hierarchical(match_string, threshold=70):
    """Rank rows of the module-level ``fda_df`` against a query in four stages.

    The query is lower-cased and stripped, then every comparison uses
    ``fuzz.token_set_ratio`` on lower-cased, stripped cell text (missing
    cells count as an empty string).

    Stages:
        1. Generic Name + Brand Name: rows scoring at least ``threshold`` are
           kept. If no row qualifies, every row is kept instead, scored on
           Generic Name alone with no threshold applied.
        2. Pharmacologic Category: a row survives only if its category score
           is at least ``threshold``; running score becomes
           ``0.7 * previous + 0.3 * category``.
        3. Manufacturer / Dosage Strength: a row survives if either score is
           at least ``threshold``; running score becomes
           ``0.6 * previous + 0.15 * manufacturer + 0.25 * dosage``.
        4. Packaging / Importer: no filtering; final score is
           ``0.8 * previous + 0.1 * packaging + 0.1 * importer``.

    Args:
        match_string: Free text to match against the table.
        threshold: Minimum score used by the filters in stages 1-3.

    Returns:
        A ``(matches, scores)`` pair of equal-length tuples ordered by
        descending final score, where ``matches`` holds the surviving rows'
        ``INDEX`` column values. Both tuples are empty when no row survives.
    """
    match_string = match_string.lower().strip()

    def safe_str(value):
        """ Convert NaN or None to an empty string """
        return str(value).lower().strip() if pd.notna(value) else ""

    def field_score(row, column):
        """Score one column of ``row`` against the normalised query."""
        return fuzz.token_set_ratio(safe_str(row[column]), match_string)

    row_count = len(fda_df)

    # Stage 1: Match by Generic Name + Brand Name (High Priority)
    first_pass = []
    for i in range(row_count):
        row = fda_df.iloc[i]  # one positional lookup per row
        generic_brand = f"{safe_str(row['Generic Name'])} {safe_str(row['Brand Name'])}"
        score = fuzz.token_set_ratio(generic_brand, match_string)
        if score >= threshold:
            first_pass.append((i, score))

    if not first_pass:
        # Fallback: nothing cleared the threshold, so carry every row forward
        # scored on Generic Name alone (the threshold is not applied here).
        first_pass = [(i, field_score(fda_df.iloc[i], 'Generic Name'))
                      for i in range(row_count)]

    # Stage 2: Filter by Pharmacologic Category
    second_pass = []
    for i, prev_score in first_pass:
        category_score = field_score(fda_df.iloc[i], 'Pharmacologic Category')
        final_score = (prev_score * 0.7) + (category_score * 0.3)
        # Only the category score gates survival; the blend is carried forward.
        if category_score >= threshold:
            second_pass.append((i, final_score))

    # Stage 3: Manufacturer & Dosage Strength
    third_pass = []
    for i, prev_score in second_pass:
        row = fda_df.iloc[i]
        manufacturer_score = field_score(row, 'Manufacturer')
        dosage_score = field_score(row, 'Dosage Strength')

        final_score = (prev_score * 0.6) + (manufacturer_score * 0.15) + (dosage_score * 0.25)

        if dosage_score >= threshold or manufacturer_score >= threshold:
            third_pass.append((i, final_score))

    # Stage 4: Final Refinement with Packaging & Importer
    ranked = []
    for i, prev_score in third_pass:
        row = fda_df.iloc[i]
        packaging_score = field_score(row, 'Packaging')
        importer_score = field_score(row, 'Importer')

        final_score = (prev_score * 0.8) + (packaging_score * 0.1) + (importer_score * 0.1)
        ranked.append((row['INDEX'], final_score))

    if not ranked:
        # Same container types as the non-empty case.
        return (), ()

    # Sort by final score (stable, so ties keep table order).
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    matches, scores = zip(*ranked)
    return matches, scores


## OCR

In [16]:
import ocr_module

[2025/02/21 15:38:25] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\Jandrik/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\Jandrik/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_

In [33]:
# Extract the text from the sample image 'test6.jpg' and keep it in `ocr_text`;
# the bare name on the last line displays the extracted string.
ocr_text = ocr_module.extract_image_text('test6.jpg')
ocr_text

[2025/02/21 15:47:49] ppocr DEBUG: dt_boxes num : 16, elapsed : 0.13587570190429688
[2025/02/21 15:47:49] ppocr DEBUG: cls num  : 16, elapsed : 0.07323408126831055
[2025/02/21 15:47:50] ppocr DEBUG: rec_res num  : 16, elapsed : 1.2540688514709473


'100 Tablets MEDIZOLE 500 mg Film-Coated Tablet ANTIPROTOZOAL R Manufactured by: Imported & Distributed by MEDICO REMEDIES LIMITED I.E.Medica Inc. 8 &9,Dewan &Sons Udyog Nagar Lokmanya Nagar, MEDICO 5/F RFM Corporate Center Pioneer St. Palghar,Dist.Thane-401404Maharashtra India Mandaluyong City,Philippines'

In [48]:
# Display the OCR text currently held in the kernel before it is passed to the matcher.
ocr_text

'100 Tablets MEDIZOLE 500 mg Film-Coated Tablet ANTIPROTOZOAL R Manufactured by: Imported & Distributed by MEDICO REMEDIES LIMITED I.E.Medica Inc. 8 &9,Dewan &Sons Udyog Nagar Lokmanya Nagar, MEDICO 5/F RFM Corporate Center Pioneer St. Palghar,Dist.Thane-401404Maharashtra India Mandaluyong City,Philippines'

In [49]:
# Alternative sample input kept for manual testing:
#ocr = "Levothyroxine sodium Euthyrox 50 mcg Tablet Thyroid Hormone Replacement CK R Imported by Merck Inc. 36th Floor,The Finance Center 26th Street corner 9th Avenue, Bonifacio Global City,Taguig 100 Tablets MERCK"

# Print the three best hierarchical matches for the OCR text, each followed by its score.
m, s = match_with_fda_hierarchical(ocr_text, 70)
for fda_index, match_score in zip(m[:3], s[:3]):
    print('-'*50)
    # `m` holds values of the INDEX column, not row positions, so select the
    # record by that column instead of feeding the value to `iloc`.
    print(fda_df.loc[fda_df['INDEX'] == fda_index].iloc[0])
    print(match_score)

--------------------------------------------------
INDEX                                                                5521
Registration Number                                              DRP-6688
Generic Name                                                Metronidazole
Brand Name                                                       Medizole
Dosage Strength                                                    500 mg
Dosage Form                                            Film Coated Tablet
Classification                                     Prescription Drug (RX)
Packaging                 Alu/Clear PVC Blister pack x 10's (Box of 30's)
Pharmacologic Category                                      Antiprotozoal
Manufacturer                                    Medico Remedies Pvt. Ltd.
Country of Origin                                                   India
Trader                                                                NaN
Importer                                                   Me

## Testing

In [35]:
import medication_matching

In [55]:
# Run the packaged matcher on the OCR text. Per the output shown below, it returns
# a list of FDA record dicts, each carrying a 'match_score' and a nested 'rx_info' entry.
medication_matching.get_info(ocr_text)

[{'INDEX': 5521,
  'Registration Number': 'DRP-6688',
  'Generic Name': 'Metronidazole',
  'Brand Name': 'Medizole',
  'Dosage Strength': '500 mg',
  'Dosage Form': 'Film Coated Tablet',
  'Classification': 'Prescription Drug (RX)',
  'Packaging': "Alu/Clear PVC Blister pack x 10's (Box of 30's)",
  'Pharmacologic Category': 'Antiprotozoal',
  'Manufacturer': 'Medico Remedies Pvt. Ltd.',
  'Country of Origin': 'India',
  'Trader': nan,
  'Importer': 'MedEthix, Inc.',
  'Distributor': 'MedEthix, Inc.',
  'Application Type': 'Renewal',
  'Issuance Date': '19-Dec-23',
  'Expiry Date': '20-May-29',
  'match_score': 52.903999999999996,
  'rx_info': {'INDEX': 3106,
   'Drug_Name': 'Flagyl (Metronidazole)',
   'URL': 'https://www.rxlist.com/flagyl-drug.htm'}},
 {'INDEX': 10857,
  'Registration Number': 'DR-XY47670',
  'Generic Name': 'Metronidazole',
  'Brand Name': 'Sydenzole',
  'Dosage Strength': '500 mg',
  'Dosage Form': 'Tablet',
  'Classification': 'Prescription Drug (Rx)',
  'Packagin

In [40]:
# Count the missing values in each column of the FDA table.
fda_df.isna().sum(axis=0)

INDEX                         0
Registration Number           0
Generic Name                 18
Brand Name                 2100
Dosage Strength              39
Dosage Form                   0
Classification                5
Packaging                    20
Pharmacologic Category        0
Manufacturer                 12
Country of Origin           119
Trader                    13498
Importer                   6380
Distributor                4536
Application Type              4
Issuance Date                 2
Expiry Date                   0
dtype: int64