<a href="https://colab.research.google.com/github/Leorasaharia/hospital-schema/blob/main/hdfs_mapreducer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# synthetic hospital device logs (like files stored in HDFS)

# Each entry is one log line made of two space-separated tokens:
# a device name followed by a status, which is either "ERROR" or "OK".
device_logs = [
    "ECG ERROR",
    "MRI OK",
    "ECG ERROR",
    "VENTILATOR ERROR",
    "ECG OK",
    "MRI ERROR",
    "VENTILATOR ERROR",
    "VENTILATOR ERROR",
    "XRAY OK",
    "XRAY ERROR",
    "ECG ERROR"
]

# Echo the raw input, one log line per output line, so later stages can be
# checked against it by eye.
print("HDFS-like Input Data:")
for line in device_logs:
    print(line)

HDFS-like Input Data:
ECG ERROR
MRI OK
ECG ERROR
VENTILATOR ERROR
ECG OK
MRI ERROR
VENTILATOR ERROR
VENTILATOR ERROR
XRAY OK
XRAY ERROR
ECG ERROR


In [2]:
# mapper: emits (device, 1) only for ERROR events

def mapper(logs):
    """Map phase: emit a ``(device, 1)`` pair for every ERROR log line.

    Each non-blank line must consist of exactly two whitespace-separated
    fields, ``"<DEVICE> <STATUS>"``. Lines whose status is anything other
    than the exact string ``"ERROR"`` produce no output.

    Args:
        logs: Iterable of log-line strings.

    Returns:
        A list of ``(device, 1)`` tuples, one per ERROR line, in input order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            fields. Malformed records are surfaced rather than silently
            dropped so that counts are never quietly wrong.
    """
    mapped = []
    for line in logs:
        # Blank / whitespace-only lines (e.g. a trailing newline when the
        # logs come from a file) carry no record; skip them instead of
        # letting the tuple-unpack below abort the whole job.
        if not line.strip():
            continue
        device, status = line.split()
        if status == "ERROR":
            mapped.append((device, 1))
    return mapped

In [3]:
# Run the map phase over the in-memory log lines.
mapped_output = mapper(device_logs)

# Print every emitted pair on its own line, in the order the mapper produced it.
print("Mapper Output:")
for item in mapped_output:
    print(item)

Mapper Output:
('ECG', 1)
('ECG', 1)
('VENTILATOR', 1)
('MRI', 1)
('VENTILATOR', 1)
('VENTILATOR', 1)
('XRAY', 1)
('ECG', 1)


In [4]:
# shuffle phase: group values by key

def shuffle(mapped_data):
    """Shuffle phase: collect every value emitted for the same key.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs, as produced by the
            map phase.

    Returns:
        A dict mapping each key to the list of its values. Keys appear in
        first-seen order and each list keeps the values in input order.
    """
    grouped = {}
    for key, value in mapped_data:
        # setdefault creates the bucket on first sight of a key, then
        # returns the (new or existing) list to append to.
        grouped.setdefault(key, []).append(value)
    return grouped

In [5]:
# Group the mapper's (device, 1) pairs by device name.
shuffled_output = shuffle(mapped_output)

# k is the device name, v is the list of counts collected for that device.
print("Shuffled Output:")
for k, v in shuffled_output.items():
    print(k, ":", v)

Shuffled Output:
ECG : [1, 1, 1]
VENTILATOR : [1, 1, 1]
MRI : [1]
XRAY : [1]


In [6]:
# reducer: sum error counts per device

def reducer(shuffled_data):
    """Reduce phase: collapse each key's list of counts into a single total.

    Args:
        shuffled_data: Dict mapping a key to a list of numeric counts, as
            produced by the shuffle phase.

    Returns:
        A new dict with the same keys, in the same order, where each value
        is the sum of that key's counts.
    """
    return {name: sum(values) for name, values in shuffled_data.items()}

In [7]:
# Sum the grouped counts to get one error total per device.
final_output = reducer(shuffled_output)

# Print one "device → total" line per device, in the dict's iteration order.
print("Final Reduced Output (Device → Error Count):")
for device, count in final_output.items():
    print(device, "→", count)

Final Reduced Output (Device → Error Count):
ECG → 3
VENTILATOR → 3
MRI → 1
XRAY → 1


In [8]:
# identify most critical device

# Pick the device with the highest error total. `default=None` keeps this
# cell from raising ValueError when the logs contained no ERROR events at all
# (final_output is then an empty dict).
# NOTE: on a tie, max() returns the first maximal key it encounters, i.e. the
# tied device that appears first in final_output; other devices with the same
# count are not reported.
critical_device = max(final_output, key=final_output.get, default=None)

print("Most Critical Device:")
if critical_device is None:
    print("None (no ERROR events recorded)")
else:
    print(critical_device, "with", final_output[critical_device], "errors")

Most Critical Device:
ECG with 3 errors
