In [1]:
import os
import pandas as pd
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import numpy as np

In [4]:
base_path = '../data'
folders = ['normal', 'lung_opacity', 'covid', 'pneumonia']

In [5]:
data = []

for folder in folders:
    folder_path = os.path.join(base_path, folder)
    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg') or filename.endswith('.png'):
            file_path = os.path.join(folder_path, filename)
            data.append((file_path, folder))


In [6]:
df = pd.DataFrame(data, columns=['image_path', 'label'])

In [7]:
print(df.head())
print(df.count())

                        image_path   label
0      ../data\normal\Normal-1.png  normal
1     ../data\normal\Normal-10.png  normal
2    ../data\normal\Normal-100.png  normal
3   ../data\normal\Normal-1000.png  normal
4  ../data\normal\Normal-10000.png  normal
image_path    21165
label         21165
dtype: int64


In [15]:
df.to_csv('data/all_images.csv', index=False)

In [10]:

# Function to load and preprocess images
def data_preprocessor(image_path, target_size=(64, 64)):
    image = load_img(image_path, target_size=target_size, color_mode='grayscale')  # Resize and convert to grayscale
    image = img_to_array(image)
    image = image.flatten()  # Flatten the image
    image /= 255.0  # Normalize pixel values to [0, 1]
    return image

# Apply the function to each image in the DataFrame
df['features'] = df['image_path'].apply(data_preprocessor)

In [17]:
# Convert the features into a numpy array and stack them into a matrix
X = np.stack(df['features'].values)

# Encode the labels as integers
y = df['label'].astype('category').cat.codes

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
# Initialize the Logistic Regression model
model = RandomForestClassifier(n_estimators=100, random_state=42)  # Increase max_iter if needed for convergence

# Train the model on the training data
model.fit(X_train, y_train)


In [19]:
# Predict the labels for the test set
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Generate a classification report
report = classification_report(y_test, y_pred, target_names=df['label'].astype('category').cat.categories)
print(report)


Accuracy: 83.44%
              precision    recall  f1-score   support

       covid       0.90      0.69      0.78       724
lung_opacity       0.79      0.77      0.78      1191
      normal       0.83      0.92      0.87      2056
   pneumonia       0.94      0.85      0.90       262

    accuracy                           0.83      4233
   macro avg       0.87      0.81      0.83      4233
weighted avg       0.84      0.83      0.83      4233



In [38]:

"""def data_aug(images_path, nb_copy):
    datagen = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    folder_path = images_path


    # Loop through the images in the folder
    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith('.jpg') or filename.endswith('.png'):  # Check for image files
            image_path = os.path.join(folder_path, filename)
            image = load_img(image_path)  # Load the image
            x = img_to_array(image)  # Convert the image to a numpy array
            x = x.reshape((1,) + x.shape)  # Reshape the image
            
            # Generate augmented 
            i = 0
            for _ in datagen.flow(x, batch_size=1, save_to_dir=folder_path, save_prefix='aug', save_format='png'):
                i += 1
                if i >= nb_copy:
                    break  # Stop the loop after generating the desired number of augmented images

"""

In [39]:
"""data_aug('data/pneumonia', 1)"""

  5%|▍         | 123/2597 [00:04<01:37, 25.25it/s]


KeyboardInterrupt: 