* Import Libraries

In [1]:
import pytesseract
from PIL import Image
import re
import pandas as pd
import cv2

* Extract Text from an Image using pytesseract

<pre>
To demonstrate text extraction from an image, we'll load the image,
 convert it to grayscale, and then threshold the grayscale image to black and white before performing OCR.
</pre>




In [None]:
# Load image (replace the placeholder below with the path to the report image)
img = cv2.imread('Load path of img ')

# cv2.imread does not raise on a missing or unreadable file; it returns None,
# which would otherwise surface later as an opaque cv2.error in cvtColor.
if img is None:
    raise FileNotFoundError(
        "Could not read the image; check the path passed to cv2.imread"
    )

# Convert the image to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Convert the grayscale image to black and white (fixed threshold at 127)
(thresh, black_and_white) = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)

# Perform OCR on the black and white image
text = pytesseract.image_to_string(black_and_white)

# Lowercase the extracted text and split it into whitespace-separated words
txt = text.lower()
words = txt.split()

* Dictionary containing patterns for extracting feature values 

<pre>
The dictionaries below map each Complete Blood Count (CBC) and Liver Function Test (LFT) feature to the keywords used to find its value in the extracted text:
</pre>




In [None]:
# Complete Blood Count (CBC) features.
# Maps each feature's display name to the list of lowercase keywords/aliases
# used to find that feature in the extracted text.
# NOTE(review): some aliases contain spaces (e.g. 'mean corpuscular volume'),
# so they span several whitespace-separated words - confirm the matcher
# that consumes this dict handles multi-word aliases.
cbc_feature = {
    'Hemoglobin (HGB)': ['hgb', 'haemoglobin', 'hemoglobin'],
    'Mean Corpuscular Volume (MCV)': ['mcv', 'mean corpuscular volume', 'm.c.v'],
    'Mean Corpuscular Hemoglobin (MCH)': ['mch', 'mean corpuscular hemoglobin', 'm.c.h'],
    'Mean Corpuscular Hemoglobin Concentration (MCHC)': ['mchc', 'mean corpuscular hemoglobin concentration', 'm.c.h.c'],
}

# Liver Function Test (LFT) features.
# Maps each feature's display name to the list of lowercase keywords/aliases
# used to find that feature in the extracted text.
# NOTE(review): 'ag ratio' and 'total protein' contain a space, so they span
# two whitespace-separated words - confirm the matcher that consumes this
# dict handles multi-word aliases.
lft_feature = {
    # Liver enzymes
    'Alanine Aminotransferase (ALT)': ['alt', 'sgpt'],
    'Aspartate Aminotransferase (AST)': ['ast', 'sgot'],
    'Alkaline Phosphatase (ALP)': ['alp'],

    # Bilirubin, proteins and clotting
    'Total Bilirubin': ['tbil', 'tb'],
    'Direct Bilirubin': ['dbil', 'db'],
    'Albumin': ['alb'],
    'Prothrombin Time (PT)': ['pt'],
    'A/G Ratio': ['ag ratio'],
    'Total Proteins': ['tp', 'total protein']
}

* Function for extracting the values
  related to Complete Blood Count (CBC) and Liver Function Test (LFT) features:

  

In [None]:
def extract_ValuesOf_features(text, feature_keywords):
    """Collect the numeric value that follows each feature keyword in the text.

    For every feature in ``feature_keywords`` (in dict order), every alias of
    that feature, and every place the alias occurs in ``text``, the first
    number found in the tokens after the alias is appended to the result.

    Parameters
    ----------
    text : list of str or str
        Tokens of the OCR output (e.g. ``txt.split()``). A plain string is
        also accepted and is split on whitespace.
    feature_keywords : dict
        Maps a feature name to a list of aliases. An alias may consist of
        several whitespace-separated words; it then has to match the same
        number of consecutive tokens.

    Returns
    -------
    list of float
        The extracted values, in the order they were found. Aliases that do
        not occur, or that are not followed by any number, contribute nothing.
    """
    import re  # local import keeps the function self-contained

    # Integer or decimal number; the fractional part is optional and greedy so
    # "13.55" is read as 13.55 (not truncated) and "45" is accepted as well.
    number_pattern = re.compile(r'[0-9]+(?:\.[0-9]+)?')

    tokens = text.split() if isinstance(text, str) else list(text)
    feature_values = []

    for feature_name, alternative_names in feature_keywords.items():
        for alias in alternative_names:
            alias_tokens = alias.split()
            width = len(alias_tokens)
            if width == 0:
                continue  # an empty alias can never identify a feature

            # Slide a window of the alias' width over the tokens so that
            # multi-word aliases such as 'total protein' can match.
            for start in range(len(tokens) - width + 1):
                if tokens[start:start + width] != alias_tokens:
                    continue

                # Take the first number that appears after the alias.
                for candidate in tokens[start + width:]:
                    match = number_pattern.search(candidate)
                    if match:
                        feature_values.append(float(match.group()))
                        break

    return feature_values

* Detect medical test types 

In [None]:
def detect_medical_test(list_text):
    """Detect whether the text belongs to a CBC or an LFT report and extract its values.

    The tokens are scanned in order; the first token that contains a CBC or
    an LFT keyword (substring match, case-insensitive) decides the test type.
    Within a single token the CBC keywords are checked before the LFT ones.

    Parameters
    ----------
    list_text : list of str or str
        Tokens of the OCR output. A plain string is also accepted and is
        split on whitespace.

    Returns
    -------
    list or str
        The result of ``extract_ValuesOf_features`` for the detected test
        type, or the string ``"This type of test is not supported"`` when no
        keyword is found.
    """
    # Keywords are stored without surrounding spaces: tokens produced by
    # str.split() never contain whitespace, so a keyword such as 'blood '
    # could never match.
    cbc_keywords = ['cbc', 'complete', 'blood', 'hematology']
    lft_keywords = ['lft', 'liver', 'function', 'biochemistry']

    raw_tokens = list_text.split() if isinstance(list_text, str) else list(list_text)
    # Lowercase once so both keyword detection and alias matching are case-insensitive.
    tokens = [token.lower() for token in raw_tokens]

    for token in tokens:
        if any(cbc_item in token for cbc_item in cbc_keywords):
            print("CBC test detected")
            return extract_ValuesOf_features(tokens, cbc_feature)

        if any(lft_item in token for lft_item in lft_keywords):
            print("LFT test detected")
            return extract_ValuesOf_features(tokens, lft_feature)

    # No matching test type was found
    return "This type of test is not supported"
