In [1]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from transformers import TextStreamer
import os
import torch

# Identifier handed to both `from_pretrained` calls below.
# NOTE(review): 'llm' looks like a local checkpoint directory — confirm the path exists
# relative to the notebook's working directory.
model_id = 'llm'

# Tokenizer and model are loaded from the same identifier and kept as
# module-level globals; later cells read `tokenizer` and `model` directly.
tokenizer = AutoTokenizer.from_pretrained(model_id)
print("Loading Model... \n\n")
model = AutoModelForCausalLM.from_pretrained(
    model_id
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading Model... 




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [125]:
def extract_info_from_ocr(ocr_text):
    """Ask the loaded chat model to pull medication fields out of OCR text.

    Builds a system + user chat, renders it with the tokenizer's chat template
    (including the assistant generation prompt), runs greedy generation and
    returns only the newly generated text.

    Relies on the module-level `tokenizer` and `model` globals.

    Args:
        ocr_text: Raw OCR string read from a medicine package.

    Returns:
        The model's answer as a stripped string. The instruction asks for the
        fields to be separated by '|'.
    """
    inst = f"""Extract the medication name from this OCR text of a medicine package:
    {ocr_text} 

    Return only the generic medication name, manufacturer/laboratory, importer, dosage, and packaging quantity of the OCR text, don't add addresses or any labeling. Write NOT FOUND if any info is missing. Return them seperated by |."""

    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": inst},
    ]

    # The generation prompt appends the assistant header so the model answers
    # the user turn. The previous code built this string but then fed the model
    # a version without it.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Tokenize and place the tensors on the same device as the model weights.
    inputs = tokenizer([prompt], return_tensors='pt').to(model.device)
    prompt_token_count = inputs['input_ids'].shape[1]

    # Greedy decoding; tokens are streamed to stdout as they are produced.
    out = model.generate(
        **inputs,
        max_new_tokens=96,
        streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False
    )

    # Slice by token count rather than by character length of the prompt:
    # decoded text is not guaranteed to reproduce the prompt string exactly,
    # so character offsets can cut into (or leave behind) part of the answer.
    generated_tokens = out[0][prompt_token_count:]
    extracted_query = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return extracted_query.strip()


In [None]:
# Sample OCR string used to exercise the extractor. The literal itself begins
# and ends with a single-quote character inside the double quotes.
result = "'DONOTACCEPT IF SEAL 100 TABLETS IS BROKEN Allopurinol Llanol@ 100 mg Tablet Antigout R Manufactured by AMHERST LABORATORIES, INC. UNILAB Pharma Campus, Barangay Mamplasan Binan Laguna Philippines for UNILAB, Inc. No.66 United Street Mandaluyong CityMetro ManilaPhilippines'"
# NOTE(review): the saved output under this cell shows a different OCR text and a
# KeyboardInterrupt, so it appears to be stale — re-run to refresh.
extracted_result = extract_info_from_ocr(result)
extracted_result

system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
Extract the medication name from this OCR text of a medicine package:
    Levothyroxine sodium Euthyrox 50 mcg Tablet Thyroid Hormone Replacement CK R Imported by Merck Inc. 36th Floor,The Finance Center 26th Street corner 9th Avenue, Bonifacio Global City,Taguig 100 Tablets MERCK 

    Return only the generic medication name, manufacturer/laboratory, importer, dosage, and packaging quantity of the OCR text, don't add addresses or any labeling. Write NOT FOUND if any info is missing. Return them seperated by |.




KeyboardInterrupt: 

## Matching

In [85]:
# Display the extraction string currently held in the kernel.
extracted_result

'\nAllopurinol|Amherst Laboratories, Inc.|100 mg|100 tablets'

In [123]:
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np
import re

# Reference tables used by the matching functions in this cell.
# fda_df: read below via the columns 'INDEX', 'Generic Name', 'Brand Name',
#         'Manufacturer', 'Dosage Strength' and 'Packaging'.
# rx_df:  read below via the columns 'INDEX' and 'Drug_Name'.
# NOTE(review): the matching code passes 'INDEX' values to `.iloc`, so it assumes
# 'INDEX' equals the row position in each file — confirm against the CSVs.
fda_df = pd.read_csv('FDA_ALL.csv')
rx_df = pd.read_csv('RX_ALL.csv')


def match_with_fda(match_string, category=None, indexes=None):
    """Score rows of `fda_df` against `match_string` using fuzzy token-set matching.

    Args:
        match_string: Text to look for.
        category: Column of `fda_df` to compare against. When None, the
            generic name, brand name, manufacturer, dosage strength and
            packaging columns are joined into one entry.
        indexes: Row positions of `fda_df` to consider. Defaults to every row.

    Returns:
        A pair `(matches, scores)` of equally long tuples sorted by descending
        score: `matches` holds the rows' 'INDEX' values, `scores` the
        corresponding `fuzz.token_set_ratio` results. Both are empty tuples
        when there is nothing to score.
    """
    if indexes is None:
        indexes = range(len(fda_df))

    combined_columns = ['Generic Name', 'Brand Name', 'Manufacturer', 'Dosage Strength', 'Packaging']
    query = match_string.lower()  # loop-invariant, so computed once

    scored = []
    for i in indexes:
        row = fda_df.iloc[i]

        if category is None:
            # Join with spaces so the last word of one column and the first
            # word of the next stay separate tokens; missing cells are skipped
            # instead of contributing the literal text 'nan'.
            medication_entry = ' '.join(
                str(row[column]) for column in combined_columns if pd.notna(row[column])
            )
        else:
            value = row[category]
            medication_entry = '' if pd.isna(value) else str(value)

        score = fuzz.token_set_ratio(medication_entry.lower(), query)
        scored.append((row['INDEX'], score))

    # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
    if not scored:
        return (), ()

    # list.sort is stable, so ties keep their original row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    matches, scores = zip(*scored)
    return matches, scores

def match_with_rx(match_string):
    """Score every row of `rx_df` against `match_string` by plain fuzzy ratio.

    Args:
        match_string: Text compared with each row's 'Drug_Name'.

    Returns:
        A pair `(matches, scores)` of equally long tuples sorted by descending
        score: `matches` holds the rows' 'INDEX' values, `scores` the
        corresponding `fuzz.ratio` results. Both are empty tuples when
        `rx_df` has no rows.
    """
    query = match_string.lower()  # loop-invariant, so computed once

    scored = []
    for i in range(len(rx_df)):
        row = rx_df.iloc[i]
        medication_entry = str(row['Drug_Name'])
        scored.append((row['INDEX'], fuzz.ratio(medication_entry.lower(), query)))

    # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
    if not scored:
        return (), ()

    # list.sort is stable, so ties keep their original row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    matches, scores = zip(*scored)
    return matches, scores

def get_info(match_string, limit=3):
    """Return the best FDA rows for `match_string`, each paired with an RX entry.

    The whole of `fda_df` is scored via `match_with_fda` (combined-columns
    mode). For each of the first `limit` hits the row is turned into a dict,
    annotated with its 'match_score', and given an 'rx_info' dict holding the
    top `match_with_rx` hit for the row's 'Generic Name'.

    Args:
        match_string: Free-text description to search for.
        limit: Number of leading matches to return.

    Returns:
        A list of dicts, best match first.
    """
    ranked_ids, ranked_scores = match_with_fda(match_string=match_string)
    top_ids = ranked_ids[:limit]
    top_scores = ranked_scores[:limit]

    results = []
    for rank, fda_index in enumerate(top_ids):
        record = dict(fda_df.iloc[fda_index])
        record['match_score'] = top_scores[rank]

        # Only the ranked ids are needed from the RX lookup; its scores are discarded.
        rx_ranked_ids = match_with_rx(record['Generic Name'])[0]
        record['rx_info'] = dict(rx_df.iloc[rx_ranked_ids[0]])

        results.append(record)

    return results

def get_info2(match_string, top_n=10, limit=3):
    """Match a '|'-separated extraction string against `fda_df`, field by field.

    The string is split on '|' and the pieces are paired, in order, with the
    columns 'Generic Name', 'Manufacturer', 'Dosage Strength' and 'Packaging'.
    Each round scores the current candidate rows on one column via
    `match_with_fda`, keeps the best `top_n`, and adds the weighted score to a
    running total kept per row. Rows are finally ranked by that total.

    NOTE(review): the extraction prompt in this notebook asks the model for
    five fields (including importer) while only four columns are matched
    here — confirm the field order the model actually returns.

    Args:
        match_string: Fields separated by '|'. Extra fields beyond the four
            categories are ignored; missing trailing fields are simply not scored.
        top_n: Number of candidate rows kept after each round.
        limit: Number of final matches to return.

    Returns:
        A list of dicts (best first), one per match, holding the FDA row plus
        'match_score' (weighted total) and 'rx_info' (top `match_with_rx` hit
        for the row's 'Generic Name').
    """
    search_terms = match_string.split('|')
    search_categories = ['Generic Name', 'Manufacturer', 'Dosage Strength', 'Packaging']
    weights = [0.3, 0.3, 0.2, 0.2]  # Weights for each category

    candidates = list(range(len(fda_df)))

    # Totals are keyed by row index. Accumulating by rank position would mix
    # different rows together, because every round re-sorts the candidates.
    weighted_totals = {}

    for weight, term, cat in zip(weights, search_terms, search_categories):
        if not candidates:
            break  # nothing left to score (empty table or top_n == 0)

        ranked_ids, ranked_scores = match_with_fda(term, cat, candidates)
        kept_ids = ranked_ids[:top_n]
        kept_scores = ranked_scores[:top_n]

        for fda_index, score in zip(kept_ids, kept_scores):
            weighted_totals[fda_index] = weighted_totals.get(fda_index, 0.0) + weight * score

        candidates = list(kept_ids)

    # Rank by the combined weighted score rather than by the last category
    # alone; the sort is stable, so ties keep the order of the final round.
    candidates.sort(key=lambda fda_index: weighted_totals.get(fda_index, 0.0), reverse=True)

    matches = []
    for fda_index in candidates[:limit]:
        match_entry = dict(fda_df.iloc[fda_index])
        match_entry['match_score'] = weighted_totals.get(fda_index, 0.0)
        generic_name = match_entry['Generic Name']
        rx_matches, _ = match_with_rx(generic_name)
        best_rx_match = dict(rx_df.iloc[rx_matches[0]])
        match_entry['rx_info'] = best_rx_match
        matches.append(match_entry)

    return matches

#print(get_info('Allopurinol Llanole 100 mg Tablet Amherst Laboratories, Inc. Allopurinol'))

In [None]:
# Calls get_info2 with an empty match string, top_n=5 and limit=5.
# NOTE(review): the saved output below lists Allopurinol rows with a non-zero
# match_score, which looks like it came from an earlier non-empty query —
# re-run (ideally with `extracted_result`) to confirm.
get_info2('', 5, 5)

[{'INDEX': 69,
  'Registration Number': 'DR-X8671',
  'Generic Name': 'Allopurinol',
  'Brand Name': 'Llanol',
  'Dosage Strength': '300 mg',
  'Dosage Form': 'Tablet',
  'Classification': 'Prescription Drug (RX)',
  'Packaging': "Aluminum foil strip x 4's (Box of 100's)",
  'Pharmacologic Category': '-',
  'Manufacturer': 'Amherst Laboratories, Inc.',
  'Country of Origin': 'Philippines',
  'Trader': 'UNILAB, Inc.',
  'Importer': nan,
  'Distributor': nan,
  'Application Type': '-',
  'Issuance Date': '11-May-20',
  'Expiry Date': '30-May-25',
  'match_score': 88.6,
  'rx_info': {'INDEX': 313,
   'Drug_Name': 'Allopurinol (Zyloprim)',
   'URL': 'https://www.rxlist.com/zyloprim-drug.htm'}},
 {'INDEX': 67,
  'Registration Number': 'DR-X7781',
  'Generic Name': 'Allopurinol',
  'Brand Name': 'Purinase',
  'Dosage Strength': '300 mg',
  'Dosage Form': 'Tablet',
  'Classification': 'Prescription Drug (RX)',
  'Packaging': "Foil strip x 10's (Box of 50's)",
  'Pharmacologic Category': '-',
