In [4]:

# Location of the raw EasyOCR text dump
path='/content/easyocr_output.txt'

# Pull the whole dump into memory as UTF-8 text
with open(path, "r", encoding="utf-8") as ocr_file:
    data = ocr_file.read()

# Cut the dump at every "--- " marker. The chunk at index 0 is whatever
# precedes the first marker, so it is left out of the reported count.
blocks = data.split('--- ')
block_count = len(blocks) - 1
print("Total blocks found:", block_count)



Total blocks found: 129


In [5]:
# Step 2: Extract filename and text from each block
#
# Each block is expected to look like "1.jpg ---\n<recognised text>": the first
# line holds the filename followed by a trailing "---", and the remaining
# lines are the OCR text for that image.

ocr_data = []

# blocks[0] is whatever preceded the first "--- " marker, so skip it
for block in blocks[1:]:
    # str.split always yields at least one element, so lines[0] is safe even
    # for an empty block. The former try/except could never fire and would
    # only have hidden genuine faults, so it is gone.
    lines = block.strip().split('\n')
    filename = lines[0].strip().replace('---', '').strip()
    # Flatten the multi-line OCR text into one space-separated string
    text = ' '.join(lines[1:]).strip()

    ocr_data.append({
        "filename": filename,
        "text": text
    })

print(f" Parsed {len(ocr_data)} entries.")
print("\n Sample:")
# Guard the preview: with no parsed entries, ocr_data[0] would raise IndexError
if ocr_data:
    print(ocr_data[0])
else:
    print("(no entries parsed)")


 Parsed 129 entries.

 Sample:
{'filename': '1.jpg', 'text': 'Dr B;, Who Farmstrcct 12 Kirkvillc tel, 3876 uate Lav 1994 ttriw 0.12 m% theoh ~ M0,7 44 hlt Lm Ms/Mr Pehuiof 30 address; 7v Ye+ Agc:'}


In [6]:
import re

# Define regex patterns
# All patterns are consumed with re.findall. When a pattern contains exactly
# one capturing group, findall returns that group's text instead of the whole
# match, so groups below either wrap the entire pattern or are non-capturing.
doctor_pattern = r'(Dr\.?\s+[A-Z][a-zA-Z]*)'  # "Dr"/"Dr." followed by one capitalized word
medicine_pattern = r'\b[A-Z][a-z]{3,}\b'  # crude pattern for capitalized words (will refine)
# The unit alternation is non-capturing on purpose: as a capturing group it
# made findall return only the unit ("mg") and silently drop the amount.
dosage_pattern = r'\b\d{1,4}\s?(?:mg|ml|tab|tablet|caps|g|mcg)\b'
age_pattern = r'(Age[:\s]*\d{1,3})'  # literal "Age", optional colon/whitespace, 1-3 digits
date_pattern = r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'  # numeric date with "/" or "-" separators

def _first_or_blank(matches):
    """Return the first element of `matches`, or "" when it is empty."""
    return matches[0] if matches else ""


# One structured record per OCR entry, built in a single pass.
# Every field comes from re.findall over the entry's text:
#   - doctor_name / age / date keep only the first hit (blank if none)
#   - medicines keeps the first three capitalized words (rough guess)
#   - dosage joins every hit with ", "
structured_data = [
    {
        "filename": item["filename"],
        "doctor_name": _first_or_blank(re.findall(doctor_pattern, item["text"])),
        "medicines": re.findall(medicine_pattern, item["text"])[:3],
        "dosage": ', '.join(re.findall(dosage_pattern, item["text"])),
        "age": _first_or_blank(re.findall(age_pattern, item["text"])),
        "date": _first_or_blank(re.findall(date_pattern, item["text"])),
    }
    for item in ocr_data
]

record_count = len(structured_data)
print("✅ Extracted structured fields for", record_count, "prescriptions")
print("\n🔍 Sample:")
print(structured_data[0])


✅ Extracted structured fields for 129 prescriptions

🔍 Sample:
{'filename': '1.jpg', 'doctor_name': 'Dr B', 'medicines': ['Farmstrcct', 'Kirkvillc', 'Pehuiof'], 'dosage': '', 'age': '', 'date': ''}


In [7]:
import csv
import json
from google.colab import files

# Save as CSV
csv_file = "structured_prescription_data.csv"
csv_fields = ["filename", "doctor_name", "medicines", "dosage", "age", "date"]

# Flatten the medicines list into a comma-separated string on COPIES of the
# rows. Overwriting row["medicines"] inside structured_data made this cell
# non-idempotent: a second run would ', '.join the already-joined string
# character by character and corrupt both output files.
export_rows = [
    {**row, "medicines": ', '.join(row["medicines"])}
    for row in structured_data
]

with open(csv_file, mode="w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_fields)
    writer.writeheader()
    writer.writerows(export_rows)

# Also save as JSON (optional)
# The flattened rows are written here too, so the file content matches what a
# first run of the previous in-place version produced.
json_file = "structured_prescription_data.json"
with open(json_file, "w", encoding="utf-8") as jf:
    json.dump(export_rows, jf, indent=2)

# Download files
files.download(csv_file)
files.download(json_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>