<a href="https://colab.research.google.com/github/Joha1262/Pneumonia-Disease-using-MobileNet-Random-Forest/blob/main/Pneumonia_Disease_using_MobileNet_%26_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Dataset Collection
# Image dataset: Kaggle Chest X-ray Pneumonia
# Tabular dataset: Synthetic clinical data

import kagglehub
import os
import pandas as pd
import numpy as np

# Fetch the Kaggle chest X-ray dataset and report the directory kagglehub resolved it to
DATASET_HANDLE = "paultimothymooney/chest-xray-pneumonia"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Dataset downloaded to:", path)

# The image splits are read from the "chest_xray" sub-folder; list it as a sanity check
base_dir = os.path.join(path, "chest_xray")
dataset_entries = os.listdir(base_dir)
print(dataset_entries)

# Build a small synthetic clinical table and persist it as CSV.
# NOTE: every column, including the "Pneumonia" label, is sampled independently of the
# others, so the label carries no signal from the features.
np.random.seed(42)
n_samples = 800
yes_no = ["Yes", "No"]

# Columns are drawn one at a time, in table order, so the seeded random stream is
# consumed in a fixed, reproducible sequence.
age_col = np.random.randint(1, 90, size=n_samples)
gender_col = np.random.choice(["Male", "Female"], size=n_samples)
fever_col = np.random.choice(yes_no, size=n_samples)
cough_col = np.random.randint(0, 10, size=n_samples)
chest_pain_col = np.random.choice(yes_no, size=n_samples)
label_col = np.random.choice([0, 1], size=n_samples)

df_tabular = pd.DataFrame({
    "Age": age_col,
    "Gender": gender_col,
    "Fever": fever_col,
    "Cough_Duration": cough_col,
    "Chest_Pain": chest_pain_col,
    "Pneumonia": label_col,
})

# index=False keeps the row index out of the file so it is not read back as a feature
df_tabular.to_csv("pneumonia_tabular.csv", index=False)
print("Tabular data created:", df_tabular.shape)
df_tabular.head()


Using Colab cache for faster access to the 'chest-xray-pneumonia' dataset.
Dataset downloaded to: /kaggle/input/chest-xray-pneumonia
['chest_xray', '__MACOSX', 'val', 'test', 'train']
Tabular data created: (800, 6)


Unnamed: 0,Age,Gender,Fever,Cough_Duration,Chest_Pain,Pneumonia
0,52,Male,Yes,0,No,0
1,15,Male,Yes,8,No,1
2,72,Male,Yes,5,No,1
3,61,Female,No,3,No,0
4,21,Female,No,8,No,1


In [2]:
# Step 2: Data Preprocessing

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Image preprocessing: directory iterators for the train / validation / test splits
train_path, test_path = (os.path.join(base_dir, split) for split in ("train", "test"))

img_size = (224, 224)

# Settings shared by all three iterators
flow_kwargs = dict(target_size=img_size, batch_size=32, class_mode='binary')

# A single generator serves both subsets of the training folder: pixel values are
# multiplied by 1/255 and 10% of the images are reserved for validation.
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.1)

train_gen = train_datagen.flow_from_directory(train_path, subset='training', **flow_kwargs)
val_gen = train_datagen.flow_from_directory(train_path, subset='validation', **flow_kwargs)

# Test images get the same rescaling but no validation split
test_datagen = ImageDataGenerator(rescale=1./255)
test_gen = test_datagen.flow_from_directory(test_path, **flow_kwargs)

# Tabular preprocessing
df = pd.read_csv("pneumonia_tabular.csv")
df = df.dropna()

# Encode categorical columns. LabelEncoder assigns integer codes in sorted label order,
# i.e. Female=0 / Male=1 and No=0 / Yes=1 for the values written to the CSV.
cat_cols = ["Gender", "Fever", "Chest_Pain"]
df[cat_cols] = df[cat_cols].apply(LabelEncoder().fit_transform)

# Split features and labels
X = df.drop("Pneumonia", axis=1)
y = df["Pneumonia"]

# Split BEFORE scaling so the held-out rows never influence the scaler's mean/variance
# (fitting the scaler on the full table leaks test-set statistics into training).
# The row assignment depends only on the row count and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize numeric features: fit on the training rows only, then apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Fully scaled matrix, kept for any later code that still refers to X_scaled
X_scaled = scaler.transform(X)

print("Tabular train/test shapes:", X_train.shape, X_test.shape)


Found 4695 images belonging to 2 classes.
Found 521 images belonging to 2 classes.
Found 624 images belonging to 2 classes.
Tabular train/test shapes: (640, 5) (160, 5)


In [3]:
# Step 3: Model Training

from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, models
from sklearn.ensemble import RandomForestClassifier

# --- Image model (Transfer Learning CNN)
# ImageNet-pretrained MobileNetV2 without its classification top, used as a frozen
# feature extractor.
INPUT_SHAPE = (224, 224, 3)
cnn_base = MobileNetV2(input_shape=INPUT_SHAPE, include_top=False, weights='imagenet')
cnn_base.trainable = False

# Trainable head: pool the feature maps, one hidden layer, one sigmoid output unit
head_layers = [
    layers.GlobalAveragePooling2D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
]
cnn_model = models.Sequential([cnn_base, *head_layers])

cnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
cnn_model.summary()

# Train CNN (only the head's weights are updated because the base is frozen)
cnn_history = cnn_model.fit(train_gen, epochs=3, validation_data=val_gen)

# --- Tabular model (Random Forest) ---
# fit() returns the estimator itself, so construction and training can be chained
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
print("Tabular model trained.")


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


  self._warn_if_super_not_called()


Epoch 1/3
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 793ms/step - accuracy: 0.8588 - loss: 0.3342 - val_accuracy: 0.9655 - val_loss: 0.1109
Epoch 2/3
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 346ms/step - accuracy: 0.9688 - loss: 0.0908 - val_accuracy: 0.9693 - val_loss: 0.0800
Epoch 3/3
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 344ms/step - accuracy: 0.9764 - loss: 0.0630 - val_accuracy: 0.9655 - val_loss: 0.0977
Tabular model trained.


In [5]:
# Step 4: Model Evaluation

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# Evaluate CNN on test images; evaluate() returns [loss, accuracy] for the compiled metrics
cnn_eval = cnn_model.evaluate(test_gen)
cnn_test_accuracy = cnn_eval[1]
print("CNN Test Accuracy:", cnn_test_accuracy)

# Evaluate tabular model on the held-out tabular rows
y_pred = rf_model.predict(X_test)
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

print(f"RandomForest: acc={acc:.2f}, prec={prec:.2f}, rec={rec:.2f}, f1={f1:.2f}")
print("Confusion Matrix:\n", cm)

# Save models
import joblib

# Keras model goes to HDF5; the file name is what the inference cell loads
cnn_model.save("image_model.h5")

# scikit-learn artifacts (classifier and fitted scaler) are pickled with joblib
for artifact, filename in ((rf_model, "tabular_model.pkl"), (scaler, "preprocessor.pkl")):
    joblib.dump(artifact, filename)

print("Models saved successfully.")


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 332ms/step - accuracy: 0.8674 - loss: 0.3888




CNN Test Accuracy: 0.8637820482254028
RandomForest: acc=0.50, prec=0.43, rec=0.54, f1=0.48
Confusion Matrix:
 [[43 49]
 [31 37]]
Models saved successfully.


In [6]:
# Step 5: User Input & Final Prediction (with image upload)

from google.colab import files
import tensorflow as tf
from tensorflow.keras.preprocessing import image
import joblib
import numpy as np

import pandas as pd

# Weights used to blend the two model probabilities, and the cut-off for the final label
IMAGE_WEIGHT = 0.6
TABULAR_WEIGHT = 0.4
DECISION_THRESHOLD = 0.5

# Feature columns in the order of the training table (all columns except "Pneumonia")
TABULAR_COLUMNS = ["Age", "Gender", "Fever", "Cough_Duration", "Chest_Pain"]


def _ask_number(prompt):
    """Prompt until the user enters a finite, non-negative number; return it as a float."""
    while True:
        raw = input(prompt).strip()
        try:
            value = float(raw)
        except ValueError:
            print(f"  '{raw}' is not a number, please try again.")
            continue
        # The chained comparison also rejects NaN and infinity
        if 0 <= value < float("inf"):
            return value
        print("  Please enter a finite, non-negative number.")


def _ask_choice(prompt, options):
    """Prompt until the answer (case-insensitive) is one of `options`; return it lower-cased."""
    while True:
        answer = input(prompt).strip().lower()
        if answer in options:
            return answer
        print(f"  Please answer one of: {' / '.join(options)}")


# Load trained models.
# NOTE: joblib/pickle files can execute code when loaded; only load artifacts that
# were produced by this notebook.
cnn_model = tf.keras.models.load_model("image_model.h5")
rf_model = joblib.load("tabular_model.pkl")
scaler = joblib.load("preprocessor.pkl")
print("Models loaded successfully!\n")

# Upload X-ray image
print("Upload a chest X-ray image (JPEG/PNG)")
uploaded = files.upload()
if not uploaded:
    # A cancelled upload dialog yields an empty mapping; fail with a clear message
    raise ValueError("No file was uploaded; re-run this cell and select a chest X-ray image.")
img_path = next(iter(uploaded))  # First uploaded filename
print(f"Uploaded image path: {img_path}\n")

# Collect tabular inputs. Unrecognised answers are re-prompted rather than being
# silently encoded as Female / No.
age = _ask_number("Enter Age: ")
gender = _ask_choice("Gender (Male/Female): ", ("male", "female"))
fever = _ask_choice("Fever (Yes/No): ", ("yes", "no"))
cough_dur = _ask_number("Cough Duration (days): ")
chest_pain = _ask_choice("Chest Pain (Yes/No): ", ("yes", "no"))

# Image prediction: same target size and 1/255 scaling as the training generators
img = image.load_img(img_path, target_size=(224, 224))
img_array = image.img_to_array(img) / 255.0
img_array = np.expand_dims(img_array, axis=0)  # add the batch dimension
img_pred_prob = cnn_model.predict(img_array)[0][0]

# Tabular prediction: integer codes follow the sorted-label encoding used in training
# (Female=0 / Male=1, No=0 / Yes=1)
gender_num = 1 if gender == "male" else 0
fever_num = 1 if fever == "yes" else 0
chest_pain_num = 1 if chest_pain == "yes" else 0

tab_features = pd.DataFrame(
    [[age, gender_num, fever_num, cough_dur, chest_pain_num]],
    columns=TABULAR_COLUMNS,
)
# If the scaler recorded column names when it was fitted, present the row with those
# names in that order (avoids sklearn's feature-name warning and guards against column
# mix-ups); otherwise fall back to a plain array.
fitted_names = getattr(scaler, "feature_names_in_", None)
if fitted_names is not None:
    tab_features = tab_features[list(fitted_names)]
    tab_input = tab_features
else:
    tab_input = tab_features.to_numpy()
tab_features_scaled = scaler.transform(tab_input)
tab_pred_prob = rf_model.predict_proba(tab_features_scaled)[0][1]

# Combine results
final_score = IMAGE_WEIGHT * img_pred_prob + TABULAR_WEIGHT * tab_pred_prob
final_label = "Pneumonia Detected" if final_score > DECISION_THRESHOLD else "Normal"

# Display results
print("\n=== Prediction Results ===")
print(f"Tabular Model Prediction:  {tab_pred_prob*100:.2f}% Pneumonia")
print(f"Image Model Prediction:    {img_pred_prob*100:.2f}% Pneumonia")
print(f"Final Combined Decision:   {final_label}")
print("==============")




Models loaded successfully!

Upload a chest X-ray image (JPEG/PNG)


KeyboardInterrupt: 