In [19]:
# Imports and environment setup
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import fitz  # PyMuPDF


# Base directories: every folder used by the notebook hangs off one project
# root inside the current user's home directory.
BASE_DIR = os.path.expanduser("~/Sterilization-data-analysis")
DATA_DIR, PROCESSED_DIR, RESULTS_DIR = (
    os.path.join(BASE_DIR, subfolder)
    for subfolder in ("data", "processed", "results")
)

# Make sure both output folders exist (DATA_DIR itself is not created here).
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Devices
devices = [
    "StatimA",
    "StatimB",
    "Ritter1",
    "Ritter2",
]

# Echo the resolved locations so the reader can confirm where files will go.
print(
    "Environment ready. Directories:",
    f"DATA_DIR: {DATA_DIR}",
    f"PROCESSED_DIR: {PROCESSED_DIR}",
    f"RESULTS_DIR: {RESULTS_DIR}",
    sep="\n",
)

Environment ready. Directories:
DATA_DIR: /home/ben/Sterilization-data-analysis/data
PROCESSED_DIR: /home/ben/Sterilization-data-analysis/processed
RESULTS_DIR: /home/ben/Sterilization-data-analysis/results


In [None]:
import os
import re
import pandas as pd

# Device sub-folders that hold Statim ``.txt`` printouts.
STATIM_DIRS = ["StatimA", "StatimB"]

# Resolve the output folder from the current user's home instead of a
# hard-coded /home/ben path; for that account the location is unchanged.
# NOTE(review): the setup cell binds PROCESSED_DIR under
# ~/Sterilization-data-analysis, and this cell rebinds it to a different
# project folder -- confirm which location is intended.
PROCESSED_DIR = os.path.expanduser("~/data-science-portfolio/processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

def _statim_metrics(text):
    """Pull temperature, pressure and duration metrics out of one printout's text."""
    # ``\b`` after the unit stops a bare number followed by a word starting
    # with "C" (e.g. "7 Conditioning") from being counted as a temperature.
    temps = [float(t) for t in re.findall(r'(\d+\.?\d*) C\b', text)]
    # Accept an optional fractional part; the integer-only pattern captured
    # just the digits after the decimal point (e.g. "5" from "101.5 kPa").
    pressures = [float(p) for p in re.findall(r'(\d+(?:\.\d+)?)\s*kPa', text)]

    metrics = {
        'MaxTemp': max(temps) if temps else None,
        'MeanTemp': sum(temps) / len(temps) if temps else None,
        'MaxPressure': max(pressures) if pressures else None,
        'MeanPressure': sum(pressures) / len(pressures) if pressures else None,
    }

    # Total cycle duration in minutes: sum of the first "<phase> <a>:<b>"
    # reading of each phase, with <a> counted as minutes and <b> as seconds.
    phases = ['Conditioning', 'Pressurizing', 'Sterilizing', 'Venting', 'Air Drying']
    total_min = 0
    for ph in phases:
        match = re.search(rf'{re.escape(ph)}\s+(\d+):(\d+)', text)
        if match:
            total_min += int(match.group(1)) + int(match.group(2)) / 60
    metrics['TotalCycle_min'] = total_min if total_min > 0 else None
    return metrics


def process_statim(device, data_root=None, out_dir=None):
    """Parse every ``.txt`` printout of one Statim device into a per-cycle CSV.

    The folder ``<data_root>/<device>`` is walked recursively. Each ``.txt``
    file becomes one row with the maximum/mean of the values matched as
    ``<number> C`` and ``<number> kPa`` and the summed phase durations.
    The table is written to ``<out_dir>/<device>_cycles.csv`` and a one-line
    summary is printed.

    Parameters
    ----------
    device : str
        Name of the device sub-folder; also used in the CSV file name.
    data_root : str, optional
        Folder containing the device sub-folders. Defaults to
        ``~/data-science-portfolio/data``.
    out_dir : str, optional
        Folder for the CSV. Defaults to the notebook-level ``PROCESSED_DIR``.

    Returns
    -------
    pandas.DataFrame
        The table that was written (one row per printout).
    """
    if data_root is None:
        data_root = os.path.expanduser("~/data-science-portfolio/data")
    if out_dir is None:
        out_dir = PROCESSED_DIR
    os.makedirs(out_dir, exist_ok=True)

    data_dir = os.path.join(data_root, device)
    columns = ['Filename', 'Device', 'MaxTemp', 'MeanTemp',
               'MaxPressure', 'MeanPressure', 'TotalCycle_min']
    all_records = []

    for root, dirs, files in os.walk(data_dir):
        dirs.sort()  # deterministic traversal -> reproducible row order
        for f in sorted(files):
            if not f.lower().endswith(".txt"):
                continue
            path = os.path.join(root, f)
            # Undecodable bytes are replaced rather than raised, so one odd
            # file cannot abort the whole batch.
            with open(path, 'r', encoding="utf-8", errors="replace") as file:
                text = file.read()
            record = {"Filename": f, "Device": device}
            record.update(_statim_metrics(text))
            all_records.append(record)

    # Explicit columns guarantee a header row even when no file was found;
    # a header-less CSV makes a later pd.read_csv raise EmptyDataError.
    df = pd.DataFrame(all_records, columns=columns)
    csv_path = os.path.join(out_dir, f"{device}_cycles.csv")
    df.to_csv(csv_path, index=False)
    print(f"{device}: {len(df)} cycles processed. CSV saved to {csv_path}")
    return df

# Run the Statim extraction for every device listed in STATIM_DIRS.
# Each call writes that device's "<device>_cycles.csv" and prints a one-line
# summary. Note: the loop variable `device` stays bound in the notebook
# namespace after the loop finishes.
for device in STATIM_DIRS:
    process_statim(device)

In [None]:
import os
import re
import pandas as pd
import pdfplumber  # or PyMuPDF if pdfplumber fails

# Device sub-folders that hold Ritter ``.pdf`` printouts.
RITTER_DIRS = ["Ritter1", "Ritter2"]

# Resolve the output folder from the current user's home instead of a
# hard-coded /home/ben path; for that account the location is unchanged.
# NOTE(review): the setup cell binds PROCESSED_DIR under
# ~/Sterilization-data-analysis, and this cell rebinds it to a different
# project folder -- confirm which location is intended.
PROCESSED_DIR = os.path.expanduser("~/data-science-portfolio/processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

def _ritter_pdf_text(pdf_path):
    """Return the text of all pages of a PDF, one page per line block, via pdfplumber."""
    with pdfplumber.open(pdf_path) as pdf:
        # ``or ""`` guards against a page yielding None instead of a string
        # (seen for text-less pages in some pdfplumber versions -- TODO confirm
        # for the installed version). Pages are joined with a newline so the
        # last token of one page cannot fuse with the first token of the next.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def process_ritter(device, data_root=None, out_dir=None,
                   pressure_pattern=r'(\d+(?:\.\d+)?)\s*(?:PSI|kPa)\b',
                   extract_text=None):
    """Parse every ``.pdf`` printout of one Ritter device into a per-cycle CSV.

    The folder ``<data_root>/<device>`` is walked recursively. Each ``.pdf``
    file becomes one row with the maximum/mean of the values matched as
    ``<number> F``, the maximum/mean of the values matched by
    ``pressure_pattern`` and the duration read from a
    ``TOTAL CYCLE: h:mm:ss`` line. The table is written to
    ``<out_dir>/<device>_cycles.csv`` and a one-line summary is printed.

    Parameters
    ----------
    device : str
        Name of the device sub-folder; also used in the CSV file name.
    data_root : str, optional
        Folder containing the device sub-folders. Defaults to
        ``~/data-science-portfolio/data``.
    out_dir : str, optional
        Folder for the CSV. Defaults to the notebook-level ``PROCESSED_DIR``.
    pressure_pattern : str, optional
        Case-insensitive regex whose first group captures one pressure
        reading. The previous pattern matched *every* number in the document
        (temperatures, times, dates), so the pressure columns were
        meaningless. NOTE(review): the default assumes readings carry a
        PSI or kPa suffix -- confirm against a real printout and adjust.
    extract_text : callable, optional
        ``extract_text(pdf_path) -> str``. Defaults to a pdfplumber-based
        extractor; pass another callable to use a different PDF library.

    Returns
    -------
    pandas.DataFrame
        The table that was written (one row per printout).
    """
    if data_root is None:
        data_root = os.path.expanduser("~/data-science-portfolio/data")
    if out_dir is None:
        out_dir = PROCESSED_DIR
    if extract_text is None:
        extract_text = _ritter_pdf_text
    os.makedirs(out_dir, exist_ok=True)

    data_dir = os.path.join(data_root, device)
    columns = ['Filename', 'Device', 'MaxTemp', 'MeanTemp',
               'MaxPressure', 'MeanPressure', 'TotalCycle_min']
    pressure_re = re.compile(pressure_pattern, re.IGNORECASE)
    all_records = []

    for root, dirs, files in os.walk(data_dir):
        dirs.sort()  # deterministic traversal -> reproducible row order
        for f in sorted(files):
            if not f.lower().endswith(".pdf"):
                continue
            text = extract_text(os.path.join(root, f))

            # TotalCycle_min is always present so every row has the same keys.
            record = {"Filename": f, "Device": device, "TotalCycle_min": None}

            # ``\b`` after the unit stops a bare number followed by a word
            # starting with "F" (e.g. "5 FILLING") from counting as a reading.
            temps = [float(t) for t in re.findall(r'(\d+\.?\d*) F\b', text)]
            record['MaxTemp'] = max(temps) if temps else None
            record['MeanTemp'] = sum(temps) / len(temps) if temps else None

            pressures = [float(p) for p in pressure_re.findall(text)]
            record['MaxPressure'] = max(pressures) if pressures else None
            record['MeanPressure'] = sum(pressures) / len(pressures) if pressures else None

            # Total cycle time, converted from h:mm:ss to minutes.
            match = re.search(r'TOTAL CYCLE:\s*(\d+):(\d+):(\d+)', text)
            if match:
                h, m, s = map(int, match.groups())
                record['TotalCycle_min'] = h * 60 + m + s / 60

            all_records.append(record)

    # Explicit columns guarantee a header row even when no file was found;
    # a header-less CSV makes a later pd.read_csv raise EmptyDataError.
    df = pd.DataFrame(all_records, columns=columns)
    csv_path = os.path.join(out_dir, f"{device}_cycles.csv")
    df.to_csv(csv_path, index=False)
    print(f"{device}: {len(df)} cycles processed. CSV saved to {csv_path}")
    return df

# Run the Ritter extraction for every device listed in RITTER_DIRS.
# Each call writes that device's "<device>_cycles.csv" and prints a one-line
# summary. Note: the loop variable `device` stays bound in the notebook
# namespace after the loop finishes.
for device in RITTER_DIRS:
    process_ritter(device)
    

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Paths ---
# Resolved from the current user's home instead of a hard-coded /home/ben
# path; for that account the location is unchanged.
PROCESSED_DIR = os.path.expanduser("~/data-science-portfolio/processed")

csv_files = [
    "StatimA_cycles.csv",
    "StatimB_cycles.csv",
    "Ritter1_cycles.csv",
    "Ritter2_cycles.csv"
]

# --- Load the per-device CSVs ---
dfs = []

for f in csv_files:
    path = os.path.join(PROCESSED_DIR, f)
    if not os.path.exists(path):
        print(f"Skipping {f}: not found in {PROCESSED_DIR}")
        continue
    try:
        df = pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # A CSV written from a DataFrame with no rows and no columns has no
        # header line; read_csv raises instead of returning an empty frame.
        print(f"Skipping {f}: file has no header or rows")
        continue
    if df.empty:
        print(f"Skipping {f}: no cycles recorded")
        continue
    # Add Device column if missing
    if 'Device' not in df.columns:
        df['Device'] = f.split('_')[0]  # e.g., StatimA, Ritter1
    dfs.append(df)

if not dfs:
    raise RuntimeError("No CSV files found in processed folder.")

all_data = pd.concat(dfs, ignore_index=True)

# One shared device order (first appearance) so every plot lists the devices
# in the same sequence; groupby alone would sort them alphabetically.
device_order = list(all_data['Device'].unique())

# --- Attempt to calculate efficiency if columns exist ---
# NOTE(review): the extraction cells in this notebook do not write a
# 'Sterilizing' column, so neither branch runs unless the CSVs come from
# another source -- confirm whether this metric is still wanted.
if 'Sterilizing' in all_data.columns and 'TotalCycle_min' in all_data.columns:
    all_data['Efficiency'] = all_data['Sterilizing'] / all_data['TotalCycle_min']
elif 'Sterilizing' in all_data.columns:
    # Fallback if TotalCycle_min missing: use Sterilizing only
    all_data['Efficiency'] = all_data['Sterilizing']

# --- Summary table ---
# NOTE(review): the Statim parser collects temperatures matched as "<n> C"
# and the Ritter parser collects "<n> F", so the temperature columns (and
# presumably the pressure columns) are not in the same units across devices.
# Convert before comparing devices against each other.
summary_cols = ['Device']
metrics = ['MeanTemp', 'MaxTemp', 'MeanPressure', 'MaxPressure', 'Efficiency']

for col in metrics:
    if col in all_data.columns:
        summary_cols.append(col)

summary = all_data.groupby('Device')[summary_cols[1:]].mean().reset_index()
print("=== Device Performance Summary ===")
display(summary)

# --- Plots ---
sns.set(style="whitegrid")

# 1) Total cycles per device
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=all_data, x="Device", order=device_order, ax=ax)
ax.set_title("Number of Cycles per Device")
ax.set_ylabel("Number of Cycles")
plt.show()

# 2) One bar chart per available metric
for metric in metrics:
    if metric not in summary.columns or not summary[metric].notna().any():
        continue  # metric missing or entirely empty: nothing to draw
    fig, ax = plt.subplots(figsize=(6, 4))
    # Assigning hue explicitly avoids seaborn's deprecation of `palette`
    # without `hue`; dodge=False keeps one full-width bar per device.
    sns.barplot(
        data=summary, x='Device', y=metric,
        hue='Device', order=device_order, hue_order=device_order,
        dodge=False, palette="coolwarm", ax=ax,
    )
    # The hue legend only repeats the x-axis labels; drop it when present
    # (whether one is drawn depends on the seaborn version).
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(f"Mean {metric} per Device")
    plt.show()