In [1]:
import pandas as pd

# Load pillbox_train_test.csv into `df`; the sampling cell below splits this frame.
df = pd.read_csv(filepath_or_buffer="../data/pillbox_train_test.csv")

In [2]:
# Draw a reproducible 500-row subset of `df` (random_state fixed at 42).
df_sample = df.sample(n=500, random_state=42)

# The remainder is every row whose index label was not drawn into the sample.
df_remaining = df.loc[~df.index.isin(df_sample.index)]

print(f"Sample size: {len(df_sample)}")
print(f"Remaining size: {len(df_remaining)}")

Sample size: 500
Remaining size: 7335


In [6]:
# Preview the first three sampled rows.
df_sample.iloc[:3]

Unnamed: 0.1,Unnamed: 0,ID,splcolor_text,splshape_text,product_code,medicine_name,dosage_form,splimage,splimprint,original_name,num_colors,num_imprints,image_path,shape_label,color_label
1142,2645,23254,BROWN,ROUND,51293-802,PHENAZOPYRIDINE HYDROCHLORIDE,C42998,51293-0802-01_NLMIMAGE10_814140FA,AN;2,51293-0802-01_NLMIMAGE10_814140FA.jpg,1,2.0,data/pillbox_production_images_full_202008/512...,10,2
6978,1551,35079,BLUE,ROUND,0378-0047,METOPROLOL TARTRATE,C42931,003780047,M;47,003780047.jpg,1,2.0,data/pillbox_production_images_full_202008/003...,10,1
3765,1001,42579,WHITE,ROUND,0054-0052,ZIDOVUDINE,C42998,000540052,54;777,000540052.jpg,1,2.0,data/pillbox_production_images_full_202008/000...,10,10


In [4]:
# Persist both partitions: the 500-row sample and the remaining rows.
# index=False keeps the DataFrame index out of the files, matching the other
# to_csv calls in this notebook. The source table already shows
# "Unnamed: 0" / "Unnamed: 0.1" columns (presumably left by earlier index
# round-trips), and writing the index again would add one more on the next read.
# Rows still carry their ID column.
df_sample.to_csv("manual_label.csv", index=False)
df_remaining.to_csv("auto_label.csv", index=False)

## Merging JSON files (after training and vetting)

In [4]:
import json

# Read the vetted label entries; the same image may be listed more than once.
with open('labels/imprint_labels_vetted_duplicates.json', 'r') as f:
    data = json.load(f)

# Walk the entries in file order and keep only the first one seen per image name.
seen = set()
deduplicated = []

for entry in data:
    image_name = entry['image']
    if image_name in seen:
        continue  # a later repeat of an image we already kept
    seen.add(image_name)
    deduplicated.append(entry)

# Order the survivors by image name so the written file is consistent between runs.
deduplicated.sort(key=lambda entry: entry['image'])

# Write the deduplicated list next to the original file.
with open('labels/imprint_labels_vetted.json', 'w') as f:
    json.dump(deduplicated, f, indent=2)

print(f"Deduplication complete:")
print(f"  Original entries: {len(data)}")
print(f"  Unique images: {len(deduplicated)}")
print(f"  Duplicates removed: {len(data) - len(deduplicated)}")

Deduplication complete:
  Original entries: 773
  Unique images: 342
  Duplicates removed: 431


In [5]:
import json

# Read the automatically generated labels and the vetted labels.
with open('labels/auto_labels_cleaned.json', 'r') as f:
    auto_labels = json.load(f)

with open('labels/imprint_labels_vetted.json', 'r') as f:
    vetted_labels = json.load(f)

# Index entries by image filename. Vetted entries are inserted first, so an
# auto label can never replace them.
images_dict = {entry['image']: entry for entry in vetted_labels}

# Auto labels only fill in images that have no vetted entry. Each of their
# labels is copied without its 'confidence' key, for consistency with the
# vetted entries.
for entry in auto_labels:
    image_name = entry['image']
    if image_name in images_dict:
        continue
    stripped_labels = []
    for label in entry['labels']:
        stripped_labels.append({k: v for k, v in label.items() if k != 'confidence'})
    images_dict[image_name] = {'image': image_name, 'labels': stripped_labels}

# Flatten back to a list ordered by image name.
merged = list(images_dict.values())
merged.sort(key=lambda entry: entry['image'])

# Write the combined label set.
with open('labels/imprint_labels_expanded.json', 'w') as f:
    json.dump(merged, f, indent=2)

print(f"Merged labels created:")
print(f"  Vetted labels: {len(vetted_labels)}")
print(f"  Auto labels: {len(auto_labels)}")
print(f"  Total unique images: {len(merged)}")
print(f"  New images from auto labels: {len(merged) - len(vetted_labels)}")


Merged labels created:
  Vetted labels: 342
  Auto labels: 813
  Total unique images: 1145
  New images from auto labels: 803


In [7]:
import json

# Re-read the merged file from disk so the check covers what was actually written.
with open('labels/imprint_labels_expanded.json') as f:
    data = json.load(f)

# Collect every image name (with repeats) and the set of distinct names;
# any difference in their sizes means an image appears more than once.
image_names = []
unique_images = set()
for entry in data:
    image_names.append(entry['image'])
    unique_images.add(entry['image'])

print(f"Total entries: {len(data)}")
print(f"Unique images: {len(unique_images)}")
print(f"Duplicates: {len(data) - len(unique_images)}")
print(f"Total character labels: {sum(len(item['labels']) for item in data)}")

Total entries: 1145
Unique images: 1145
Duplicates: 0
Total character labels: 5939


## Create final dataset split

In [10]:
# Build the filtered dataset: keep only the CSV rows whose image has an entry
# in the expanded label file, and write them to pillbox_train_test_F.csv.
#
# `json` is imported here as well as pandas so the cell does not depend on an
# earlier cell having imported it (json.load below would otherwise raise
# NameError when this section is run on its own).
import json

import pandas as pd

# Read the CSV file
df = pd.read_csv('../data/pillbox_train_test.csv')

# Read the JSON file
with open('labels/imprint_labels_expanded.json', 'r') as f:
    labels_data = json.load(f)

# Extract image filenames from JSON
labeled_images = {item['image'] for item in labels_data}

# Filter the dataframe to keep only rows where original_name is in the labeled images.
# Several rows can share one original_name, so the filtered row count may be
# larger than the number of labeled images.
df_filtered = df[df['original_name'].isin(labeled_images)]

# Save the filtered dataframe
df_filtered.to_csv('../data/pillbox_train_test_F.csv', index=False)

# Print statistics
print(f"Original dataset size: {len(df)}")
print(f"Number of labeled images: {len(labeled_images)}")
print(f"Filtered dataset size: {len(df_filtered)}")
print(f"Rows removed: {len(df) - len(df_filtered)}")

Original dataset size: 7835
Number of labeled images: 1145
Filtered dataset size: 1377
Rows removed: 6458


In [11]:
import pandas as pd
import json

# Load the full train/test table.
df = pd.read_csv('../data/pillbox_train_test.csv')

# Load the merged label entries.
with open('labels/imprint_labels_expanded.json', 'r') as f:
    labels_data = json.load(f)

# Distinct image filenames that have a label entry.
labeled_images = set()
for entry in labels_data:
    labeled_images.add(entry['image'])

# Keep the rows whose original_name is one of the labeled images.
is_labeled = df['original_name'].isin(labeled_images)
df_filtered = df.loc[is_labeled]

# Rows per image name (most frequent first), then only the names that occur more than once.
duplicates = df_filtered['original_name'].value_counts()
images_with_duplicates = duplicates.loc[duplicates > 1]

print(f"Original dataset size: {len(df)}")
print(f"Number of labeled images: {len(labeled_images)}")
print(f"Filtered dataset size: {len(df_filtered)}")
print(f"Number of unique images in filtered data: {df_filtered['original_name'].nunique()}")
print(f"\nImages with multiple rows: {len(images_with_duplicates)}")
if not images_with_duplicates.empty:
    print(f"\nTop 10 images with most duplicates:")
    print(images_with_duplicates.head(10))

# Write the filtered rows without the DataFrame index.
df_filtered.to_csv('../data/pillbox_train_test_F.csv', index=False)

Original dataset size: 7835
Number of labeled images: 1145
Filtered dataset size: 1377
Number of unique images in filtered data: 1145

Images with multiple rows: 76

Top 10 images with most duplicates:
original_name
50111064801.jpg    13
00093738656.jpg    13
50111033401.jpg    13
50111078766.jpg    12
00172572860.jpg    12
00093227534.jpg    10
50111064701.jpg     9
00093171201.jpg     8
00093221098.jpg     8
00093738456.jpg     7
Name: count, dtype: int64


In [16]:
# Count the distinct (non-null) ID values among the filtered rows; comparing
# this with len(df_filtered) shows whether any ID repeats.
df_filtered['ID'].nunique(dropna=True)

1377

In [19]:
# Load the cleaned table that the held-out set is cut from.
df_cleaned = pd.read_csv('../data/pillbox_cleaned.csv')

# IDs of the rows that went into the filtered dataset.
filtered_ids = df_filtered['ID'].values

# Held-out rows are the cleaned rows whose ID does NOT appear in the filtered dataset.
# NOTE(review): exclusion is by ID only. The filtered data has several rows per
# image name, so if the cleaned table also carries image names, confirm no
# held-out row shares an image with the filtered set.
in_filtered = df_cleaned['ID'].isin(filtered_ids)
df_heldout = df_cleaned.loc[~in_filtered]

# Write the held-out rows without the DataFrame index.
df_heldout.to_csv('../data/pillbox_heldout_F.csv', index=False)

# Report sizes; the last line is how many cleaned rows were excluded.
print(f"Filtered dataset size: {len(df_filtered)}")
print(f"Cleaned dataset size: {len(df_cleaned)}")
print(f"Heldout dataset size: {len(df_heldout)}")
print(f"Rows removed from cleaned: {len(df_cleaned) - len(df_heldout)}")

Filtered dataset size: 1377
Cleaned dataset size: 83925
Heldout dataset size: 82548
Rows removed from cleaned: 1377
