**STARTING THE GPU AND TESTING IT**

In [1]:
import tensorflow as tf
tf.keras.backend.clear_session()

In [2]:

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("GPU not found")


Num GPUs Available:  1
Default GPU Device: /device:GPU:0


**IMPORTING THE NECESSARY LIBRARIES AND THIEIR MODULES**

In [3]:
import numpy as np                                                    # For numerical operations
import cv2                                                            # OpenCV for image processing
from sklearn.model_selection import train_test_split                  # For data splitting
from sklearn import svm                                               # Support Vector Machine from scikit-learn
from sklearn.metrics import classification_report, confusion_matrix   # Evaluation metrics
import matplotlib.pyplot as plt                                       # For visualization
import os                                                             # For file and directory operations


**MOUNTING THE DRIVE**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**ADDING THE DATABASE AND CHECKING ITS FILES**

In [5]:
dataset_path = '/content/drive/My Drive/Colab Notebooks/dataset'

In [6]:
# List files in the dataset directory
dataset_contents = os.listdir(dataset_path)
print(dataset_contents)


['labels.csv', 'labels.gsheet', 'images']


**IMPORTING THE DATA AND PREPROCESSING IT**


Using pickle because of again and again session crash due to overused RAM

In [7]:
import pandas as pd
import pickle

# Define a function to preprocess and save data in batches
def preprocess_and_save_data(image_folder, labels_file, batch_size, output_folder):
    batched_data = []

    # Load labels from 'labels.csv'
    labels_df = pd.read_csv(os.path.join(dataset_path, labels_file))

    # Iterate over image files in batches
    for filename in os.listdir(image_folder):
        if filename.endswith('.png'):
            image = cv2.imread(os.path.join(image_folder, filename))
            image = cv2.resize(image, (128, 128))              # Resizing

            label = labels_df.loc[labels_df['Image Index'] == filename, 'Finding Labels'].values[0]    #Mapping image with its label
            batched_data.append((image, label))

            # Check if the batch size is reached
            if len(batched_data) == batch_size:
                with open(os.path.join(output_folder, f'batch_{len(os.listdir(output_folder))}.pkl'), 'wb') as file:    #saving data
                    pickle.dump(batched_data, file)
                batched_data = []

    # Save any remaining data as the last batch
    if batched_data:
        with open(os.path.join(output_folder, f'batch_{len(os.listdir(output_folder))}.pkl'), 'wb') as file:
            pickle.dump(batched_data, file)

# Define batch size and output folder
batch_size = 50
output_folder = '/content/drive/My Drive/Colab Notebooks/output'

# Preprocess and save data in batches
preprocess_and_save_data(os.path.join(dataset_path, 'images'), 'labels.csv', batch_size, output_folder)


In [8]:
# Load labels from 'labels.csv'
labels_df = pd.read_csv(os.path.join(dataset_path, 'labels.csv'))

# Extract labels from the 'label' column
labels = labels_df['Finding Labels'].tolist()


In [9]:
# Check class distribution
from collections import Counter
class_distribution = Counter(labels)
print("Class distribution:", class_distribution)

Class distribution: Counter({'No Finding': 3044, 'Infiltration': 503, 'Effusion': 203, 'Atelectasis': 192, 'Nodule': 144, 'Pneumothorax': 114, 'Mass': 99, 'Consolidation': 72, 'Effusion|Infiltration': 69, 'Pleural_Thickening': 65, 'Atelectasis|Infiltration': 57, 'Atelectasis|Effusion': 55, 'Cardiomegaly': 50, 'Infiltration|Nodule': 44, 'Emphysema': 42, 'Edema': 41, 'Fibrosis': 38, 'Atelectasis|Effusion|Infiltration': 31, 'Cardiomegaly|Effusion': 30, 'Infiltration|Mass': 29, 'Edema|Infiltration': 21, 'Effusion|Pneumothorax': 20, 'Infiltration|Pneumothorax': 19, 'Consolidation|Infiltration': 18, 'Mass|Nodule': 17, 'Mass|Pneumothorax': 16, 'Effusion|Mass': 16, 'Emphysema|Pneumothorax': 15, 'Atelectasis|Consolidation': 15, 'Effusion|Pleural_Thickening': 14, 'Pneumonia': 14, 'Infiltration|Pleural_Thickening': 12, 'Consolidation|Effusion': 12, 'Atelectasis|Nodule': 12, 'Atelectasis|Consolidation|Effusion': 11, 'Effusion|Nodule': 11, 'Cardiomegaly|Infiltration': 10, 'Edema|Effusion|Infiltrati

**SPLITTING THE DATA INTO TEST AND TRAINIG DATA**

In [10]:

output_folder = '/content/drive/My Drive/Colab Notebooks/output'

# Load and combine batched data
combined_data = []
batch_files = os.listdir(output_folder)
for batch_file in batch_files:
    with open(os.path.join(output_folder, batch_file), 'rb') as file:
        batch_data = pickle.load(file)
        combined_data.extend(batch_data)

# Split the combined data into training (80%) and testing (20%) sets
X, y = zip(*combined_data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#X_train and y_train contain 80% of the data for training,
#X_test and y_test contain 20% of the data for testing


**DATA AUGMENTATION**

In [11]:

# Define functions for data augmentation
def rotate_image(image, angle_range=15):
    angle = np.random.uniform(-angle_range, angle_range)
    rows, cols, _ = image.shape
    M = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
    return cv2.warpAffine(image, M, (cols, rows))

def zoom_image(image, zoom_range=(0.9, 1.1)):
    scale_factor = np.random.uniform(*zoom_range)
    return cv2.resize(image, None, fx=scale_factor, fy=scale_factor)

def scale_image(image, width_range=(200, 250), height_range=(200, 250)):
    width = np.random.randint(*width_range)
    height = np.random.randint(*height_range)
    return cv2.resize(image, (width, height))

#directory to store augmented images
output_folder = '/content/drive/My Drive/Colab Notebooks/augmented_images'
os.makedirs(output_folder, exist_ok=True)

batch_size = 50  # Number of augmented images to save per batch
# Create a list to store augmented images and labels
augmented_data = []

for i, image in enumerate(X_train):

    rotated_image = rotate_image(image, angle_range=15)    #rotation within a range of -15 to 15 degrees


    zoomed_image = zoom_image(rotated_image, zoom_range=(0.9, 1.1))     #zoom within the range of 0.9 to 1.1

    #scaling within specified width and height ranges
    scaled_image = scale_image(zoomed_image, width_range=(200, 250), height_range=(200, 250))


    label = y_train[i]
    augmented_data.append((scaled_image, label))

    # Check if the batch size is reached or it's the last image
    if len(augmented_data) == batch_size or i == len(X_train) - 1:

        with open(os.path.join(output_folder, f'augmented_batch_{i // batch_size}.pkl'), 'wb') as file:
            pickle.dump(augmented_data, file)

        augmented_data = []

# Save any remaining data as the last batch
if augmented_data:
    with open(os.path.join(output_folder, f'augmented_batch_{len(X_train) // batch_size}.pkl'), 'wb') as file:
        pickle.dump(augmented_data, file)


**SVM MODEL TRAINING**

In [12]:

from sklearn.metrics import accuracy_score


# Load the preprocessed and augmented training data
training_data_folder = '/content/drive/My Drive/Colab Notebooks/augmented_images'

# Function to load data batches from pickle files
def load_data_batches(folder):
    data_batches = []
    for file_name in os.listdir(folder):
        if file_name.endswith('.pkl'):
            with open(os.path.join(folder, file_name), 'rb') as file:
                data_batches.extend(pickle.load(file))
    return data_batches

# Load the training data
training_data = load_data_batches(training_data_folder)

# Resize all images to a consistent shape
resize_shape = (128, 128)
training_data_resized = [(cv2.resize(image, resize_shape), label) for image, label in training_data]

# Split the training data into images and labels
X_train, y_train = zip(*training_data_resized)
X_train = np.array(X_train)
y_train = np.array(y_train)

# Create an SVM model with a linear kernel
svm_model = svm.SVC(kernel='linear', C=1.0)

# Train the SVM model
svm_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)

#saving the trained model
model_save_path = '/content/drive/My Drive/Colab Notebooks/model.pkl'
with open(model_save_path, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

# Make predictions on the training data (you can also evaluate on a separate test set)
y_pred = svm_model.predict(X_train.reshape(X_train.shape[0], -1))

# Calculate accuracy on the training data
accuracy = accuracy_score(y_train, y_pred)
print(f"Training Accuracy: {accuracy * 100:.2f}%")


Training Accuracy: 100.00%


**EVALUATING THE MODEL**

In [15]:
# Define a function to preprocess test images
def preprocess_test_images(test_images):
    processed_images = []

    for image in test_images:
        # Resize the image to the desired resolution
        processed_image = cv2.resize(image, (128, 128))
        processed_images.append(processed_image)

    return np.array(processed_images)  # Convert the list to a NumPy array

# Preprocess your test data
X_test_processed = preprocess_test_images(X_test)

# Now you can make predictions
y_pred_test = svm_model.predict(X_test_processed.reshape(X_test_processed.shape[0], -1))


In [23]:
# Preprocess the test data if necessary (resize, flatten, etc.)
# Ensure that all test images have the same dimensions and are properly flattened

# Initialize a list to store the flattened test images
X_test_flat = []

# Preprocess and flatten each test image
for image in X_test:
    # Preprocess the image (e.g., resize)
    preprocessed_image = cv2.resize(image, (128, 128))

    # Flatten the preprocessed image
    flattened_image = preprocessed_image.reshape(-1)

    # Append the flattened image to the list
    X_test_flat.append(flattened_image)

# Convert the list of flattened images to a NumPy array
X_test_flat = np.array(X_test_flat)

# Make predictions on the flattened test data
y_pred_test = svm_model.predict(X_test_flat)

# Calculate accuracy on the test data
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {accuracy_test * 100:.2f}%")

# Generate a classification report
report = classification_report(y_test, y_pred_test)
print("Classification Report:\n", report)

# Generate a confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:\n", confusion_mat)


Test Accuracy: 46.05%
Classification Report:
                                                                 precision    recall  f1-score   support

                                                   Atelectasis       0.16      0.33      0.21        72
                                      Atelectasis|Cardiomegaly       0.00      0.00      0.00         3
                        Atelectasis|Cardiomegaly|Consolidation       1.00      1.00      1.00         2
                                     Atelectasis|Consolidation       1.00      0.11      0.20         9
                               Atelectasis|Consolidation|Edema       0.00      0.00      0.00         1
                            Atelectasis|Consolidation|Effusion       1.00      0.17      0.29         6
                  Atelectasis|Consolidation|Effusion|Emphysema       0.00      0.00      0.00         1
Atelectasis|Consolidation|Effusion|Fibrosis|Pleural_Thickening       0.00      0.00      0.00         1
               At

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
