## Importing Libraries, Loading Data and Verifying

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import json
from PIL import Image


In [None]:
# Load table.csv from the rximage data directory; the raw string keeps the
# Windows backslashes literal.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
df_main = pd.read_csv(r'D:\rximage\table.csv')

### Checking Size of the Dataframe and Looking at Some of the Relevant Columns

In [None]:
df_main.shape

In [None]:
# Keep only the image-path and pill-name columns, under shorter names.
df_images = df_main[['RXNAV 1024', 'name']].rename(
    columns={'RXNAV 1024': 'image_path', 'name': 'label'}
)

In [None]:
df_main.columns

In [None]:
# Display the first few rows
print(df_images.head())

## Checking For Duplicates

I think this may be an error: after going through all of the pictures, I believe several of the pills are the same medication but the actual pill itself is different. I will find an example and verify.

In [None]:
# The row count of df_images is taken as the number of images.
total_images = df_images.shape[0]

# Distinct values in the label column (nunique skips missing labels by default).
unique_pills = df_images['label'].nunique()

print(
    f"Total number of images: {total_images}",
    f"Number of unique pills: {unique_pills}",
    sep="\n",
)

## Display a Few Pictures of the Images

Just to get an idea of what we are working with

In [None]:
# Function to show a few images from the dataset
def show_sample_images(df, num_samples=5, image_root='D:\\rximage'):
    """Display a random handful of images from ``df`` in a single row.

    Args:
        df: DataFrame with an ``image_path`` column (joined onto
            ``image_root``) and a ``label`` column used as the title.
        num_samples: Maximum number of images to draw; capped at ``len(df)``
            so small frames do not make ``DataFrame.sample`` raise.
        image_root: Directory that ``image_path`` values are relative to.

    Returns:
        None. Images that cannot be read are reported and shown as an
        empty, titled panel instead of aborting the whole figure.
    """
    count = min(num_samples, len(df))
    if count <= 0:
        # Nothing to draw; avoid creating an empty figure.
        return

    samples = df.sample(count)
    plt.figure(figsize=(15, 5))
    for i, row in enumerate(samples.itertuples(), 1):
        plt.subplot(1, count, i)
        # cv2.imread signals failure by returning None rather than raising.
        img = cv2.imread(os.path.join(image_root, row.image_path))
        if img is None:
            print(f"Could not read image: {row.image_path}")
        else:
            # OpenCV loads pixels as BGR; matplotlib expects RGB.
            plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.title(str(row.label)[:30])  # Shorten label for display
        plt.axis('off')
    plt.show()

show_sample_images(df_images)

## Get All of the Pill Information into a DataFrame

This has turned out to be a problem-riddled task, as I am not the most skilled at navigating JSON formatting at the moment. So I am taking a systematic approach in which I add a single column at a time, in order to identify where the vast number of duplicated rows is coming from when I load it in its entirety.

First I'm going to load each pill's name in a row and check the total

In [None]:

# Top-level fields copied from each record, mapped to the default used when
# the key is absent. The order here (main fields, then image details) is the
# resulting column order.
record_defaults = {
    'ndc11': '',
    'rxcui': '',
    'name': '',
    'labeler': '',
    'acqDate': '',
    'attribution': '',
    'ingredientsAvailable': False,
    'rxnavImageObjectId': '',
    'rxnavImageFileName': '',
    'rxnavImageSize': 0,
    'nlmImageObjectId': '',
    'nlmImageFileName': '',
    'nlmImageSize': 0,
}

# Fields nested under 'mpc' (pill properties), with their defaults.
mpc_defaults = {
    'shape': '',
    'size': 0,
    'color': '',
    'imprint': '',
    'imprintType': '',
    'symbol': False,
    'score': 0,
}

# The export is read one JSON document per line. The encoding is explicit so
# the platform default codec (e.g. cp1252 on Windows) cannot garble text.
# NOTE(review): assumes the export is UTF-8 — confirm against the export tool.
with open('D:/rximage/MONGOexport/rximagesAll.json', 'r', encoding='utf-8') as file:
    raw_data = file.readlines()

# parse each line as a separate JSON object, ignoring blank lines
rximages_data = [json.loads(line) for line in raw_data if line.strip()]

# Flatten the data: one flat dict per record
flattened_data = []

for entry in rximages_data:
    pill_info = {key: entry.get(key, default) for key, default in record_defaults.items()}

    # 'mpc' may be missing or explicitly null; treat both as "no properties".
    mpc = entry.get('mpc') or {}
    pill_info.update((key, mpc.get(key, default)) for key, default in mpc_defaults.items())

    flattened_data.append(pill_info)

# Convert the flattened data into a DataFrame
df_rximages = pd.DataFrame(flattened_data)

# Display the first few rows
print(df_rximages.head())

### Active and Inactive Ingredients

It appears these were stored as lists, since they can potentially contain multiple unique ingredients, so the solution I found was to unpack these separately and append them onto the dataframe. I achieved this by checking each record's ingredients field for active and inactive entries and unpacking anything found, or defaulting to an empty list if there were none.

In [None]:
# Initialize the lists (one entry is appended per parsed record)
active_ingredients = []
inactive_ingredients = []

# Re-read the line-delimited export with an explicit encoding so the platform
# default codec cannot garble ingredient names.
# NOTE(review): assumes the export is UTF-8 — confirm against the export tool.
with open('D:/rximage/MONGOexport/rximagesAll.json', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # blank lines are not records
        record = json.loads(line)

        # 'ingredients' may be absent or explicitly null; both mean "none listed".
        ingredients = record.get("ingredients") or {}

        # Extract each ingredient list or default to an empty list
        active_ingredients.append(ingredients.get("active", []))
        inactive_ingredients.append(ingredients.get("inactive", []))

# Add the new columns to the existing DataFrame. pandas raises ValueError if
# the list lengths do not match the number of rows in df_rximages.
df_rximages['active_ingredients'] = active_ingredients
df_rximages['inactive_ingredients'] = inactive_ingredients

# Display the first few rows of the updated DataFrame
print(df_rximages.head())


#### Sanity Check

just looking at the newly created data frame and inspecting it for errors

##### checking random samples to see if the active_ingredients and inactive_ingredients fields are filled

In [None]:
df_rximages['active_ingredients'].sample(20)

In [None]:
df_rximages['inactive_ingredients'].sample(20)

##### Verifying that there are still 4392 rows

In [None]:
df_rximages.shape[0]

##### Saving Progress to CSV

In [None]:
df_rximages.to_csv('rximage_data.csv', index=False)

##### Verifying that it loads correctly from newly saved file

In [None]:
# NOTE(review): CSV stores only text, so the active_ingredients /
# inactive_ingredients columns (assigned Python lists earlier) are read back
# as plain strings, and read_csv re-infers every dtype — confirm identifier
# columns such as ndc11 still look as expected after the round trip.
df = pd.read_csv("rximage_data.csv")

In [None]:
df.sample(1)

In [None]:
df.shape[0]

In [None]:
df.sample(20)

#### Unique Columns in Dataset

In [None]:
df_rximages.columns

## Finding the Size of the Crop Box

I want to remove the label on the bottom of the image and ideally the markers for measurement from each picture. This will hopefully make it easier to manipulate the images to create an artificial dataset by rotating the pills and pasting them on different backgrounds.

In [None]:


def show_crop_area(image, bottom_crop_box, left_crop_box):
    """Show ``image`` with the two candidate crop regions outlined.

    Each box is indexed as ``(x0, y0, x1, y1)``. The bottom box is outlined
    in red and the left box in blue.
    """
    # Display the image
    plt.imshow(image)

    # Outline each region: bottom first, then left.
    for box, edge in ((bottom_crop_box, 'r'), (left_crop_box, 'b')):
        origin = (box[0], box[1])
        box_width = box[2] - box[0]
        box_height = box[3] - box[1]
        outline = plt.Rectangle(
            origin, box_width, box_height,
            linewidth=2, edgecolor=edge, facecolor='none'
        )
        plt.gca().add_patch(outline)

    plt.show()


# Preview both candidate crop regions on one sample image.
# NOTE(review): calculate_crop_boxes is not defined in any visible cell of this
# notebook — confirm it exists (or restore its cell) before running this one.
with Image.open('D:/rximage/image/images/gallery/1024/00002-3228-30_RXNAVIMAGE10_391E1C80.jpg') as img:
    bottom_crop_box, left_crop_box = calculate_crop_boxes(img)  # Get the two crop boxes
    show_crop_area(img, bottom_crop_box, left_crop_box)  # Show the crop areas

After looking at several of the images, the label on the bottom is indeed universal for every picture, but the marks on the side for measuring appear in different locations; therefore I will only be cropping out the label at this moment.

## Cropping Out the Labels on the Bottom of the Images

In [None]:
import os
from PIL import Image

def calculate_top_crop_box(image, top_percentage=0.8):
    """Return the crop box covering the top ``top_percentage`` of ``image``.

    Args:
        image: Object exposing ``size`` as ``(width, height)``, such as a
            PIL image.
        top_percentage: Fraction of the height to keep, measured from the top.

    Returns:
        ``(0, 0, width, crop_height)`` where ``crop_height`` is
        ``height * top_percentage`` truncated to an int.
    """
    width, height = image.size
    return (0, 0, width, int(height * top_percentage))

def crop_and_save_images(input_folder, output_folder, top_percentage=0.8):
    """Crop every image in ``input_folder`` to its top portion and save it.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.png``,
            ``.jpg`` and ``.jpeg`` files, matched case-insensitively.
        output_folder: Directory the cropped copies are written to under
            their original file names; created if missing.
        top_percentage: Fraction of each image's height to keep, from the top.
    """
    # exist_ok avoids the check-then-create race and matches how the other
    # helpers in this notebook create their output folders.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue  # Process only image files

        with Image.open(os.path.join(input_folder, filename)) as img:
            # Keep only the top portion of the image.
            top_img = img.crop(calculate_top_crop_box(img, top_percentage))

            top_output_path = os.path.join(output_folder, filename)
            top_img.save(top_output_path)
            print(f"Saved cropped top image: {top_output_path}")


input_folder = 'D:/rximage/image/images/gallery/1024'
output_folder = 'D:/rximage/image/images/cropped'
crop_and_save_images(input_folder, output_folder, top_percentage=0.8)

## Image Splitting

I want to split each picture in half to isolate each side of each pill. I don't think we should be training a model on each pill placed side by side in exactly the same fashion as any deviation in the real world could create errors. Splitting the pills also allows me to remove the background and augment the dataset to produce more images from the one we have.

In order to do this, I noticed that the pictures all fall into one of two categories:

They are either pictured vertically, with one side located above the other

Or they are pictured horizontally, with one side depicted adjacent to the other

I was unable to come up with a programmatic approach that would successfully split all of the images, so I manually sorted all 4,300 or so images and separated them

pills that are stacked vertically are stored in D:\rximage\image\images\vertical

pills that are stacked horizontally are stored in D:\rximage\image\images\horizontal

In [None]:
def split_and_show(image_path, split_offset=22):
    """Plot an image next to its left and right halves.

    Args:
        image_path: Path of the image to open.
        split_offset: Pixels added to the horizontal midpoint to locate the
            cut. The default of 22 is the padding this notebook uses so the
            cut does not go through a pill.
    """
    # Open the image
    with Image.open(image_path) as img:
        width, height = img.size
        split_x = (width // 2) + split_offset

        # Titles paired with the picture shown in each of the three panels.
        panels = (
            ('Original Image', img),
            ('Left Half', img.crop((0, 0, split_x, height))),
            ('Right Half', img.crop((split_x, 0, width, height))),
        )

        # Plot the original and split images side by side
        plt.figure(figsize=(15, 5))
        for position, (title, panel) in enumerate(panels, 1):
            plt.subplot(1, 3, position)
            plt.imshow(panel)
            plt.title(title)
            plt.axis('off')

        plt.show()

# Path to the image you want to test
image_path = 'D:/rximage/image/images/horizontal/00049-2330-45_RXNAVIMAGE10_3C1C9E04.jpg'

# Run the function to visualize the split
split_and_show(image_path)

## Testing on Random Samples of Images to Make Sure it Works

In [None]:
import random

def test_random_images(folder_path, num_samples=10):
    """Visualise the split for a random selection of images in ``folder_path``.

    At most ``num_samples`` files ending in .png/.jpg/.jpeg (matched
    case-insensitively) are chosen, and each is passed to ``split_and_show``.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    candidates = [
        entry
        for entry in os.listdir(folder_path)
        if entry.lower().endswith(image_extensions)
    ]

    # Never request more files than the folder actually holds.
    sample_size = min(num_samples, len(candidates))

    for image_file in random.sample(candidates, sample_size):
        print(f"Showing splits for: {image_file}")
        split_and_show(os.path.join(folder_path, image_file))

# Set the folder path and run the test
folder_path = 'D:/rximage/image/images/horizontal'
test_random_images(folder_path)

## Splitting Horizontal Images and Updating the DataFrame

After running the sampling function a few dozen times and verifying that none of the actual pill was cropped out of any of the images, I think it's safe to split them all and update the dataframe. Since all of the vertically stacked images were manually removed, we are able to do so without any issues.

### Splitting and Saving the Images

In [None]:
def split_and_save_images(input_folder, output_folder, split_offset=22):
    """Split every image in ``input_folder`` into left and right JPEG halves.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.png``,
            ``.jpg`` and ``.jpeg`` files, matched case-insensitively.
        output_folder: Directory receiving ``<name>_left.jpg`` and
            ``<name>_right.jpg``; created if missing.
        split_offset: Pixels added to the horizontal midpoint to locate the
            cut. The default of 22 is the padding this notebook uses so the
            cut does not go through a pill.
    """
    # Ensure the output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through all image files in the input folder
    for filename in os.listdir(input_folder):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        # Construct new filenames
        base_name = os.path.splitext(filename)[0]
        left_filename = f"{base_name}_left.jpg"
        right_filename = f"{base_name}_right.jpg"

        with Image.open(os.path.join(input_folder, filename)) as img:
            # The halves are always written as JPEG, which cannot store
            # alpha or palette modes (common in PNG inputs), so convert first.
            source = img if img.mode in ('L', 'RGB', 'CMYK') else img.convert('RGB')

            width, height = source.size
            split_x = (width // 2) + split_offset

            # Split the image into left and right halves and save them
            source.crop((0, 0, split_x, height)).save(
                os.path.join(output_folder, left_filename)
            )
            source.crop((split_x, 0, width, height)).save(
                os.path.join(output_folder, right_filename)
            )

        print(f"Saved: {left_filename} and {right_filename}")

# Set the input and output directories
input_folder = 'D:/rximage/image/images/horizontal'
output_folder = 'D:/rximage/image/images/split'

# Run the function
split_and_save_images(input_folder, output_folder)

In [None]:
def split_and_update(df, input_folder, output_folder, file_column='rxnavImageFileName', split_offset=22):
    """Split each image referenced by ``df`` and return one row per half.

    Args:
        df: DataFrame whose ``file_column`` holds image file names relative
            to ``input_folder``.
        input_folder: Directory containing the source images.
        output_folder: Directory receiving ``<name>_left.jpg`` and
            ``<name>_right.jpg``; created if missing.
        file_column: Name of the column holding the image file name.
        split_offset: Pixels added to the horizontal midpoint to locate the
            cut. The default of 22 is the padding this notebook uses so the
            cut does not go through a pill.

    Returns:
        A new DataFrame with two rows per image that was found: copies of
        the source row with ``file_column`` pointing at the saved half and a
        ``position`` column of ``'left'`` or ``'right'``. Rows whose file
        name is missing, is not an image, or does not exist on disk are
        skipped. The columns are present even when no row qualifies.
    """
    os.makedirs(output_folder, exist_ok=True)

    updated_rows = []

    for _, row in df.iterrows():
        image_path = row[file_column]

        # Missing values arrive as NaN (a float), which cannot be used as a path.
        if not isinstance(image_path, str):
            continue
        if not image_path.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        full_image_path = os.path.join(input_folder, image_path)
        if not os.path.exists(full_image_path):
            continue

        base_name = os.path.splitext(os.path.basename(image_path))[0]

        with Image.open(full_image_path) as img:
            # The halves are always written as JPEG, which cannot store
            # alpha or palette modes (common in PNG inputs), so convert first.
            source = img if img.mode in ('L', 'RGB', 'CMYK') else img.convert('RGB')

            width, height = source.size
            split_x = (width // 2) + split_offset

            halves = (
                ('left', source.crop((0, 0, split_x, height))),
                ('right', source.crop((split_x, 0, width, height))),
            )

            for position, half in halves:
                half_path = os.path.join(output_folder, f"{base_name}_{position}.jpg")
                half.save(half_path)

                # One output row per half, pointing at the new file.
                new_row = row.copy()
                new_row[file_column] = half_path
                new_row['position'] = position
                updated_rows.append(new_row)

    # Passing the columns explicitly keeps the schema when nothing was split.
    columns = list(df.columns)
    if 'position' not in columns:
        columns.append('position')

    return pd.DataFrame(updated_rows, columns=columns)

# NOTE(review): these are relative paths, unlike the absolute D:/rximage/...
# paths used by the other cells — they only resolve when the notebook's
# working directory is the rximage root. Confirm.
input_folder = 'image/images/horizontal'
output_folder = 'image/images/split'
# df is read back from rximage_data.csv, which has no 'file_path' column
# (that lookup raises KeyError); the image file names are stored in
# 'rxnavImageFileName'.
file_column = 'rxnavImageFileName'

df_updated = split_and_update(df, input_folder, output_folder, file_column)


print(df_updated.head())

In [2]:
from PIL import Image, ImageDraw
import os
import matplotlib.pyplot as plt

# Define the input and output directories
input_dir = 'D:/rximage/image/images/vertical'
output_dir = 'D:/rximage/image/images/split'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Loop through all the images in the input directory
for filename in os.listdir(input_dir):
    # Case-insensitive match on the same extensions the horizontal split
    # accepts, so files such as .JPG or .jpeg are not silently skipped.
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    # Open via a context manager so the file handle is released per image.
    with Image.open(os.path.join(input_dir, filename)) as img:
        # Get dimensions
        width, height = img.size

        # Calculate the midpoint for splitting
        midpoint = height // 2

        # Draw a line at the boundary for visual confirmation.
        # NOTE(review): img_with_line is never displayed or saved in this
        # cell — confirm whether the preview is still wanted.
        img_with_line = img.copy()
        draw = ImageDraw.Draw(img_with_line)
        draw.line([(0, midpoint), (width, midpoint)], fill="red", width=3)

        # The halves are always written as JPEG, which cannot store alpha or
        # palette modes (common in PNG inputs), so convert first.
        source = img if img.mode in ('L', 'RGB', 'CMYK') else img.convert('RGB')

        # Split the image into top and bottom
        top = source.crop((0, 0, width, midpoint))
        bottom = source.crop((0, midpoint, width, height))

        # Save the split images with modified filenames
        base_name = os.path.splitext(filename)[0]
        top.save(os.path.join(output_dir, f"{base_name}_top.jpg"))
        bottom.save(os.path.join(output_dir, f"{base_name}_bottom.jpg"))

    print(f"Processed and saved: {filename}")

print("All images have been split and saved.")

Processed and saved: 00002-3228-30_RXNAVIMAGE10_391E1C80.jpg
Processed and saved: 00002-3229-30_RXNAVIMAGE10_3E1E1F50.jpg
Processed and saved: 00002-3235-60_RXNAVIMAGE10_1B158D9C.jpg
Processed and saved: 00002-3238-30_RXNAVIMAGE10_361E1B30.jpg
Processed and saved: 00002-3239-30_RXNAVIMAGE10_3F1E1F80.jpg
Processed and saved: 00002-3240-30_RXNAVIMAGE10_A91354BA.jpg
Processed and saved: 00002-3250-30_RXNAVIMAGE10_431E21C1.jpg
Processed and saved: 00002-3251-30_RXNAVIMAGE10_451E2281.jpg
Processed and saved: 00002-3270-30_RXNAVIMAGE10_A91354EA.jpg
Processed and saved: 00002-4165-02_RXNAVIMAGE10_EF3AF7C7.jpg
Processed and saved: 00002-4462-30_RXNAVIMAGE10_B215591A.jpg
Processed and saved: 00002-4463-30_RXNAVIMAGE10_B5155ACA.jpg
Processed and saved: 00002-4464-30_RXNAVIMAGE10_B8155C1A.jpg
Processed and saved: 00002-5123-30_RXNAVIMAGE10_E848F457.jpg
Processed and saved: 00003-0894-21_RXNAVIMAGE10_3D491EB8.jpg
Processed and saved: 00003-4222-16_RXNAVIMAGE10_F215F93F.jpg
Processed and saved: 000