In [None]:
# Character set analysis
def _collect_charset(paths):
    """Return the set of distinct characters found across the given text files.

    Each file is decoded as UTF-8 with errors='ignore', so bytes that cannot
    be decoded are dropped and never appear in the result. The text is NOT
    stripped, so leading/trailing whitespace characters are counted too.
    """
    charset = set()
    for path in paths:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            # Updating a set with a string adds each character individually.
            charset.update(f.read())
    return charset


all_chars_prescription = _collect_charset(prescriptions_texts)
all_chars_lab = _collect_charset(lab_report_texts)
all_chars = all_chars_prescription | all_chars_lab

print(f"Unique characters in prescriptions: {len(all_chars_prescription)}")
print(f"Unique characters in lab reports: {len(all_chars_lab)}")
print(f"Total unique characters: {len(all_chars)}")

# Display character set
print(f"\nCharacter set: {''.join(sorted(all_chars))}")

# Whitespace and control characters are invisible (or break the line) in the
# joined string above, so list them separately in escaped form.
non_printable = [ch for ch in sorted(all_chars) if not ch.isprintable()]
if non_printable:
    print(f"Non-printable characters: {non_printable}")

In [None]:
# Plot text length distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One (axis, data, title, colour) entry per panel so both histograms are
# drawn by the same code and cannot drift apart in styling.
panels = [
    (axes[0], prescription_lengths, 'Prescription Text Length Distribution', 'blue'),
    (axes[1], lab_lengths, 'Lab Report Text Length Distribution', 'green'),
]

plotted_any = False
for ax, lengths, title, color in panels:
    ax.set_title(title)
    if not lengths:
        # Label the empty panel instead of leaving an anonymous blank axis.
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        continue
    ax.hist(lengths, bins=30, edgecolor='black', alpha=0.7, color=color)
    ax.set_xlabel('Text Length')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)
    plotted_any = True

plt.tight_layout()
plt.show()

# Only claim success when at least one histogram was actually drawn.
if plotted_any:
    print("Distribution plotted successfully!")
else:
    print("No text lengths available; nothing was plotted.")

In [None]:
# Dataset Statistics
def _text_lengths(paths):
    """Return the character count of each file in `paths` after stripping.

    Files are decoded as UTF-8 with errors='ignore'; surrounding whitespace
    is removed before measuring, so a whitespace-only file has length 0.
    """
    lengths = []
    for path in paths:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            lengths.append(len(f.read().strip()))
    return lengths


def _print_length_stats(title, lengths):
    """Print min/max/mean/median of `lengths` under a titled header.

    When `lengths` is empty a short notice is printed instead, so the header
    is never left without content.
    """
    print(f"\n--- {title} ---")
    if not lengths:
        print("No text files found.")
        return
    print(f"Min length: {min(lengths)}")
    print(f"Max length: {max(lengths)}")
    print(f"Mean length: {np.mean(lengths):.2f}")
    print(f"Median length: {np.median(lengths):.2f}")


print("="*60)
print("DATASET STATISTICS")
print("="*60)

# Count all files (sorted so the order is deterministic and matches the
# loader cells, which also sort their glob results)
prescriptions_texts = sorted(prescriptions_output.glob('*.txt')) if prescriptions_output.exists() else []
lab_report_texts = sorted(lab_output.glob('*.txt')) if lab_output.exists() else []

print(f"\nPrescription text files: {len(prescriptions_texts)}")
print(f"Lab report text files: {len(lab_report_texts)}")
print(f"Total text files: {len(prescriptions_texts) + len(lab_report_texts)}")

# Text length statistics
prescription_lengths = _text_lengths(prescriptions_texts)
lab_lengths = _text_lengths(lab_report_texts)

_print_length_stats("Prescription Text Statistics", prescription_lengths)
_print_length_stats("Lab Report Text Statistics", lab_lengths)

In [None]:
# Load lab reports data
lab_output = lab_reports_dir / 'Output'
lab_texts = []

if lab_output.exists():
    txt_files = sorted(lab_output.glob('*.txt'))
    print(f"Found {len(txt_files)} lab report text files")

    # Keep only the first five files as preview samples
    for sample_path in txt_files[:5]:
        with open(sample_path, 'r', encoding='utf-8', errors='ignore') as handle:
            content = handle.read().strip()
        lab_texts.append({
            'file': sample_path.name,
            'text': content,
            'length': len(content),
        })

    # Show the first two samples, truncating each text to 100 characters
    print("\nSample lab report texts:")
    for sample in lab_texts[:2]:
        print(f"\nFile: {sample['file']}")
        print(f"Length: {sample['length']}")
        print(f"Text: {sample['text'][:100]}...")

In [None]:
# Load prescriptions data
prescriptions_output = prescriptions_dir / 'Output'
prescriptions_input = prescriptions_dir / 'Input'

prescription_texts = []
if prescriptions_output.exists():
    txt_files = sorted(prescriptions_output.glob('*.txt'))
    print(f"Found {len(txt_files)} prescription text files")

    # Keep only the first five files as preview samples
    for sample_path in txt_files[:5]:
        with open(sample_path, 'r', encoding='utf-8', errors='ignore') as handle:
            content = handle.read().strip()
        prescription_texts.append({
            'file': sample_path.name,
            'text': content,
            'length': len(content),
        })

    # Show the first two samples, truncating each text to 100 characters
    print("\nSample prescription texts:")
    for sample in prescription_texts[:2]:
        print(f"\nFile: {sample['file']}")
        print(f"Length: {sample['length']}")
        print(f"Text: {sample['text'][:100]}...")

In [None]:
# Define data paths
# Both datasets live under <base_path>/data.
data_root = Path(base_path) / 'data'
prescriptions_dir = data_root / 'data1'
lab_reports_dir = data_root / 'lbmaske'

print(f"Prescriptions directory: {prescriptions_dir}")
print(f"Lab reports directory: {lab_reports_dir}")

# Check if directories exist
prescriptions_found = prescriptions_dir.exists()
lab_reports_found = lab_reports_dir.exists()
print(f"\nPrescriptions dir exists: {prescriptions_found}")
print(f"Lab reports dir exists: {lab_reports_found}")

## 2. Data Loading and Exploration

In [None]:
# Import libraries
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import warnings

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Set style for plots: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))

print("Libraries imported successfully!")

In [None]:
# Install required packages (Colab only)
import subprocess
import sys

packages = [
    'opencv-python',
    'Pillow',
    'numpy',
    'pandas',
    'matplotlib',
    'seaborn',
]

# The google.colab module is only present in sys.modules inside a Colab
# runtime; everywhere else this cell installs nothing.
running_on_colab = 'google.colab' in sys.modules
if running_on_colab:
    pip_install = [sys.executable, '-m', 'pip', 'install', '-q']
    for package in packages:
        # One pip invocation per package; check_call raises on failure.
        subprocess.check_call(pip_install + [package])
    print("Packages installed successfully!")

In [None]:
# Mount Google Drive (if running on Colab)
import sys

if 'google.colab' not in sys.modules:
    # Outside Colab: project files are expected one level above the notebook.
    base_path = '..'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    base_path = '/content/drive/MyDrive/new_EMR'

print(f"Base path: {base_path}")

## 1. Setup and Installation

# OCR Model for EMR System - Data Exploration
## Analyzing Prescriptions and Lab Reports Dataset

This notebook explores and analyzes the medical documents dataset for OCR model training.