In [1]:
!pip install pandas numpy opencv-python pillow tqdm


Defaulting to user installation because normal site-packages is not writeable


In [1]:
!pip install pyarrow


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import os
import cv2
from sklearn.preprocessing import LabelEncoder
import pytesseract
from PIL import Image
import matplotlib.pyplot as plt
import re


In [3]:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


In [5]:
csv_path = "patient.csv"   # This file must be in SAME folder as your notebook
df = pd.read_csv(csv_path)

print("Patient Metadata Loaded Successfully!")
df.head()


Patient Metadata Loaded Successfully!


Unnamed: 0,Patient_ID,Age,Gender,Stroke_Type,Num_Images,Folder_Path,Date_of_Scan,Ward_ID
0,Patient_001,65,F,Normal,120,Stroke_classification/Normal/Patient_001/,07-08-2025,2
1,Patient_002,41,F,Normal,60,Stroke_classification/Normal/Patient_002/,16-05-2021,2
2,Patient_003,64,M,Haemorrhagic,66,Stroke_classification/Haemorrhagic/Patient_003/,08-04-2025,1
3,Patient_004,62,M,Haemorrhagic,102,Stroke_classification/Haemorrhagic/Patient_004/,23-05-2023,1
4,Patient_005,77,M,Ischemic,128,Stroke_classification/Ischemic/Patient_005/,12-11-2021,3


In [6]:
def clean_text(x):
    if isinstance(x, str):
        x = x.lower()
        x = re.sub(r'[^a-z0-9 ]+', '', x)
    return x

df["Gender"] = df["Gender"].apply(clean_text)
df["Stroke_Type"] = df["Stroke_Type"].apply(clean_text)
df["Folder_Path"] = df["Folder_Path"].apply(clean_text)

print("Cleaned Metadata:")
df.head()


Cleaned Metadata:


Unnamed: 0,Patient_ID,Age,Gender,Stroke_Type,Num_Images,Folder_Path,Date_of_Scan,Ward_ID
0,Patient_001,65,f,normal,120,strokeclassificationnormalpatient001,07-08-2025,2
1,Patient_002,41,f,normal,60,strokeclassificationnormalpatient002,16-05-2021,2
2,Patient_003,64,m,haemorrhagic,66,strokeclassificationhaemorrhagicpatient003,08-04-2025,1
3,Patient_004,62,m,haemorrhagic,102,strokeclassificationhaemorrhagicpatient004,23-05-2023,1
4,Patient_005,77,m,ischemic,128,strokeclassificationischemicpatient005,12-11-2021,3


In [7]:
df["Stroke_Type"] = df["Stroke_Type"].replace({
    "haemorrhagic": "Haemorrhagic",
    "hemorrhagic": "Haemorrhagic",
    "ischemic": "Ischemic",
    "normal": "Normal"
})


In [8]:
image_dir = "Stroke_classification"

classes = ["Haemorrhagic", "Ischemic", "Normal"]

images = []
labels = []
IMG_SIZE = 224

for cls in classes:
    path = os.path.join(image_dir, cls)

    if not os.path.exists(path):
        print(f"❌ Missing folder: {path}")
        continue

    for img_name in os.listdir(path):
        img_path = os.path.join(path, img_name)

        try:
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

            images.append(img)
            labels.append(cls)
        except:
            print("Error loading:", img_path)

print("Total images loaded:", len(images))


Total images loaded: 297


In [9]:
X = np.array(images) / 255.0

le = LabelEncoder()
y = le.fit_transform(labels)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Label Mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


X shape: (297, 224, 224, 3)
y shape: (297,)
Label Mapping: {'Haemorrhagic': 0, 'Ischemic': 1, 'Normal': 2}


In [11]:
print("\n----- FINAL OUTPUT SUMMARY -----")
print("✔ Patient metadata cleaned")
print("✔ Images loaded:", len(X))
print("✔ Labels encoded:", set(labels))
print("✔ CNN-ready dataset prepared")
print("✔ OCR extraction available")
print("✔ Ready for CNN / ViT / GenAI training")



----- FINAL OUTPUT SUMMARY -----
✔ Patient metadata cleaned
✔ Images loaded: 297
✔ Labels encoded: {'Normal', 'Ischemic', 'Haemorrhagic'}
✔ CNN-ready dataset prepared
✔ OCR extraction available
✔ Ready for CNN / ViT / GenAI training
