Data Loader

In [1]:
import os
import pandas as pd

def data_loader(base_path:str) -> pd.DataFrame:
    folders = ['normal', 'lung_opacity', 'covid', 'pneumonia']
    data = []

    for folder in folders:
        folder_path = os.path.join(base_path, folder)
        for filename in os.listdir(folder_path):
            if filename.endswith('.jpg') or filename.endswith('.png'):
                file_path = os.path.join(folder_path, filename)
                data.append((file_path, folder))

    df_tuples = pd.DataFrame(data, columns=['image_path', 'label'])

    return df_tuples

Data pre-processing

In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img

def data_preprocessor(df):
    def preproc(image_path, target_size=(64, 64)):
        image = load_img(image_path, target_size=target_size, color_mode='grayscale')  # Resize and convert to grayscale
        image = img_to_array(image)
        image = image.flatten()  # Flatten the image
        image /= 255.0  # Normalize pixel values to [0, 1]
        return image

    df['features'] = df['image_path'].apply(preproc)
    return df

Split Data

In [3]:
import numpy as np

from sklearn.model_selection import train_test_split

def split_data(df: pd.DataFrame):
    # Convert the features into a numpy array and stack them into a matrix
    X = np.stack(df['features'].values)

    # Encode the labels as integers
    y = df['label'].astype('category').cat.codes

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

Train Model

In [4]:
from sklearn.ensemble import RandomForestClassifier


def train_model(X_train, y_train):
    
    # Initialize the Logistic Regression model
    model = RandomForestClassifier(n_estimators=100, random_state=42)  # Increase max_iter if needed for convergence
    # Train the model on the training data
    model.fit(X_train, y_train)

    return model

Prediction of model

In [5]:
from sklearn.metrics import accuracy_score, classification_report


def predict_model(df_preprocessed, X_test, y_test, model):
    # Predict the labels for the test set
    y_pred = model.predict(X_test)

    # Calculate the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Generate a classification report
    report = classification_report(y_test, y_pred, target_names=df_preprocessed['label'].astype('category').cat.categories)
    print(report)

    return report

Execute

In [6]:
df_tuples = data_loader(base_path='../data')
df_preprocessed = data_preprocessor(df_tuples)
X_train, X_test, y_train, y_test = split_data(df_preprocessed)
model = train_model(X_train, y_train)
report = predict_model(df_preprocessed, X_test, y_test, model)
print(report)

Accuracy: 83.44%
              precision    recall  f1-score   support

       covid       0.90      0.69      0.78       724
lung_opacity       0.79      0.77      0.78      1191
      normal       0.83      0.92      0.87      2056
   pneumonia       0.94      0.85      0.90       262

    accuracy                           0.83      4233
   macro avg       0.87      0.81      0.83      4233
weighted avg       0.84      0.83      0.83      4233

              precision    recall  f1-score   support

       covid       0.90      0.69      0.78       724
lung_opacity       0.79      0.77      0.78      1191
      normal       0.83      0.92      0.87      2056
   pneumonia       0.94      0.85      0.90       262

    accuracy                           0.83      4233
   macro avg       0.87      0.81      0.83      4233
weighted avg       0.84      0.83      0.83      4233

