In [None]:
import kagglehub

# Kaggle handle (owner/dataset-slug) of the waste classification image set
dataset_handle = "techsash/waste-classification-data"

# Fetch the dataset and remember the local directory that kagglehub reports
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [3]:
# importing necessary libraries
import os
import random
import warnings
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [4]:
# importing ml libraries
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, BatchNormalization, Activation
from keras.utils import plot_model
from keras.preprocessing.image import img_to_array, load_img
from keras.callbacks import EarlyStopping

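# `from keras.preprocessing.image import ImageDataGenerator` did not work in this environment
# (presumably the installed Keras no longer exports it there), so use the legacy module instead.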
from keras.src.legacy.preprocessing.image import ImageDataGenerator
from glob import glob

In [5]:
# Define the training and test dataset paths.
# They are derived from `path` (the directory returned by
# kagglehub.dataset_download) rather than from one user's absolute cache
# location, so the notebook runs on any machine after "Restart & Run All".
dataset_root = Path(path) / 'DATASET'

# Keep plain forward-slash strings: later cells build glob patterns from them.
train_path = (dataset_root / 'TRAIN').as_posix()
test_path = (dataset_root / 'TEST').as_posix()

# Fail early with a clear message if the download layout is not what we expect.
for split_dir in (train_path, test_path):
    if not Path(split_dir).is_dir():
        raise FileNotFoundError(f"Expected dataset folder not found: {split_dir}")

In [None]:
# Load every training image into memory for exploratory analysis.
# x_data holds the decoded RGB arrays, y_data the class-folder name of each
# image, and widths/heights the original pixel dimensions.
x_data = []
y_data = []

widths = []
heights = []

# Files that OpenCV could not decode (cv2.imread returns None for those).
unreadable_files = []

for category in glob(train_path + '/*'):
    # Use the folder name as the label. basename/normpath copes with both '/'
    # and the OS-specific separator that glob inserts on Windows, where a plain
    # split('/') would return 'TRAIN\\<class>' instead of '<class>'.
    label = os.path.basename(os.path.normpath(category))
    for file in tqdm(glob(category + '/*'), desc=label):
        img_arr = cv2.imread(file)
        if img_arr is None:
            # Not an image / corrupt file: skip it instead of crashing cvtColor.
            unreadable_files.append(file)
            continue
        # OpenCV decodes to BGR; matplotlib expects RGB.
        img_arr = cv2.cvtColor(img_arr, cv2.COLOR_BGR2RGB)
        h, w = img_arr.shape[:2]
        widths.append(w)
        heights.append(h)
        x_data.append(img_arr)
        y_data.append(label)

if unreadable_files:
    print(f"Skipped {len(unreadable_files)} unreadable file(s)")

data = pd.DataFrame({'image': x_data, 'label': y_data})

In [None]:
# (rows, columns) of the DataFrame built in the loading cell
data.shape

In [None]:
# Display five randomly drawn training images side by side, titled with their labels
sample_count = 5
indices = random.sample(range(len(data)), sample_count)

fig, axes = plt.subplots(1, sample_count, figsize=(20, 4 * sample_count))
for ax, index in zip(axes, indices):
    ax.imshow(x_data[index])
    ax.set_title(y_data[index])
    ax.axis('off')
plt.show()

In [None]:
# Class balance of the training set.
label_counts = data['label'].value_counts()

# value_counts() orders its result by frequency, not by name, so the wedge
# labels must come from its index; a fixed ["Organic", "Recyclable"] list would
# silently swap the names whenever the second class is the larger one.
# Assumes the class folders are named 'O' and 'R' (TODO confirm against the
# dataset); any other name is shown unchanged.
display_names = {'O': 'Organic', 'R': 'Recyclable'}

# Reduce each label to its last path component in case it still carries a
# parent folder (e.g. 'TRAIN\\O').
folder_names = [str(key).replace('\\', '/').rsplit('/', 1)[-1] for key in label_counts.index]
pie_labels = [display_names.get(name, name) for name in folder_names]

plt.pie(label_counts, labels=pie_labels, autopct='%1.2f%%', startangle=90)
plt.show()

In [None]:
# Histograms of the original image widths and heights, one panel each
fig, (width_ax, height_ax) = plt.subplots(1, 2, figsize=(10, 5))

width_ax.hist(widths, bins=20)
width_ax.set_title('Width Distribution')

height_ax.hist(heights, bins=20)
height_ax.set_title('Height Distribution')

plt.show()

# Average dimensions, useful when picking a resize target
print(f"Mean Width: {np.mean(widths)}, Mean Height: {np.mean(heights)}")

In [19]:
# Every image is resized to this square resolution before it reaches the model
img_width = img_height = 224

# Number of images per batch produced by the data generators
batch_size = 64

In [None]:
# One generator object with an 80/20 split of the TRAIN folder; pixel values
# are rescaled from [0, 255] to [0, 1].
data_generator = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

# Options shared by both subsets so they cannot drift apart.
# target_size is (height, width) in Keras.
flow_options = dict(
    target_size=(img_height, img_width),
    batch_size=batch_size,
    color_mode="rgb",
    class_mode="binary",
)

# The training generator must read the 'training' subset. Requesting
# 'validation' here would train the model on the same 20% slice that is later
# used for validation, leaving 80% of the data unused and the validation
# metrics meaningless.
train_data_generator = data_generator.flow_from_directory(
    train_path,
    subset='training',
    **flow_options
)

validation_data_generator = data_generator.flow_from_directory(
    train_path,
    subset='validation',
    **flow_options
)

In [None]:
# Generator over the held-out TEST folder (rescaling only, no split).
test_data_generator = ImageDataGenerator(rescale=1./255)

test_generator = test_data_generator.flow_from_directory(
    test_path,
    # (height, width), taken from the shared config instead of a second literal
    target_size = (img_height, img_width),
    batch_size = batch_size,
    color_mode = "rgb",
    class_mode = "binary",
    # Keep directory order: evaluation code compares model.predict() output
    # with test_generator.classes, which is only valid if batches are not
    # shuffled (flow_from_directory shuffles by default).
    shuffle = False
)

In [41]:
# Small CNN for binary classification: three conv/ReLU/max-pool stages, then
# two dropout-regularised dense layers and a single sigmoid output unit.
# The input shape follows the img_height/img_width config used by the data
# generators, so changing the resize target does not require editing the model.
model = Sequential([
    # Convolutional feature extractor
    Conv2D(32, (3, 3), input_shape=(img_height, img_width, 3)),
    Activation('relu'),
    MaxPooling2D(),

    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(),

    Conv2D(128, (3, 3)),
    Activation('relu'),
    MaxPooling2D(),

    Flatten(),

    # Classifier head
    Dense(256),
    Activation('relu'),
    Dropout(0.5),
    Dense(64),
    Activation('relu'),
    Dropout(0.5),

    # One probability for the positive class
    Dense(1),
    Activation('sigmoid'),
])

In [None]:
# Print the layer-by-layer summary of the model defined above
model.summary()

In [43]:
# Callback that halts training once validation loss has gone `patience`
# epochs without improving, restoring the best weights seen so far
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [46]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked for reporting
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the model.
# The epoch budget is an upper bound; the early-stopping callback decides when
# to stop. With the previous budget of 5 epochs and a patience of 5, the
# callback could never trigger, because the run ended before five
# non-improving epochs could accumulate.
max_epochs = 30

hist = model.fit(
    train_data_generator,
    epochs=max_epochs,
    validation_data=validation_data_generator,
    callbacks=[early_stopping]  # stops early and restores the best weights
)

In [None]:
# Training vs. validation curves: accuracy on the left, loss on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (history key, y-axis label, train legend, validation legend, panel title)
panels = [
    ('accuracy', 'Accuracy', 'Train Accuracy', 'Validation Accuracy', 'Model Accuracy'),
    ('loss', 'Loss', 'Train Loss', 'Validation Loss', 'Model Loss'),
]

for ax, (metric, y_label, train_legend, val_legend, panel_title) in zip(axes, panels):
    ax.plot(hist.history[metric], label=train_legend)
    ax.plot(hist.history['val_' + metric], label=val_legend)
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend()

plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Sigmoid outputs for the test set, thresholded at 0.5 into 0/1 class ids
predictions = model.predict(test_generator)
predicted_classes = (predictions > 0.5).astype('int32')

# Ground truth and class names as reported by the generator.
# NOTE: `classes` is in directory order, so the comparison below is only
# meaningful if the generator yields its batches in that same order.
true_classes = test_generator.classes
class_labels = list(test_generator.class_indices)

# Confusion matrix: rows are actual classes, columns are predicted classes
cm = confusion_matrix(true_classes, predicted_classes)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1
print(classification_report(true_classes, predicted_classes, target_names=class_labels))

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points from the raw sigmoid scores, and the area under that curve
fpr, tpr, _ = roc_curve(true_classes, predictions)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line: a classifier that guesses at random
ax.plot([0, 1], [0, 1], color='red', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Analyze misclassifications: show the first ten test images whose predicted
# class differs from the true class.
misclassified_indices = np.where(predicted_classes.flatten() != true_classes)[0]

plt.figure(figsize=(15, 10))
for i, index in enumerate(misclassified_indices[:10]):  # Display first 10 misclassifications
    plt.subplot(2, 5, i+1)
    # Load the image straight from its file. `filepaths` shares its ordering
    # with `classes`, so the picture always matches `true_classes[index]`;
    # indexing into generator batches does not when the generator shuffles,
    # and it decodes a whole batch just to show one thumbnail.
    image = load_img(test_generator.filepaths[index], target_size=test_generator.target_size)
    plt.imshow(image)
    true_label = class_labels[true_classes[index]]
    # predicted_classes has one column per sample, hence the trailing [0]
    predicted_label = class_labels[predicted_classes[index][0]]
    plt.title(f'True: {true_label}\nPredicted: {predicted_label}')
    plt.axis('off')
plt.show()

In [None]:
# Score the trained model on the test generator; evaluate() returns the loss
# followed by the compiled metric (accuracy)
test_metrics = model.evaluate(test_generator)
test_loss, test_accuracy = test_metrics
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')