
# Rice Leaf Disease Classification — Ready-to-Run Notebook
**Purpose:** End-to-end local Jupyter notebook for EDA, training a CNN, augmentation experiments, and model comparison — prepared for GitHub upload.

**Files used (from your upload):**
- Dataset ZIP: `/mnt/data/riceleafdataset.zip`
- Project brief: `/mnt/data/PRCP-1001-RiceLeaf.docx`  
(These local paths are already present in this environment.)

**Instructions:**  
1. Run all cells sequentially.  
2. If you want to train faster, reduce `EPOCHS` or use a GPU runtime.  
3. The transfer-learning section first tries to load ImageNet weights for MobileNetV2; if they cannot be loaded (for example, because there is no internet access), the code catches the error and trains MobileNetV2 from randomly initialized weights instead.


In [None]:

# 1) Imports and unzip dataset
import os
import zipfile
from pathlib import Path

# Location of the uploaded dataset archive and the directory it is unpacked into.
DATA_ZIP = "/mnt/data/riceleafdataset.zip"
EXTRACT_TO = "/mnt/data/riceleafdataset"

# The target directory is created even when the archive is missing, so the
# directory walk below always has something to traverse.
os.makedirs(EXTRACT_TO, exist_ok=True)

if not os.path.exists(DATA_ZIP):
    print("Dataset zip not found at:", DATA_ZIP)
else:
    with zipfile.ZipFile(DATA_ZIP, 'r') as archive:
        archive.extractall(EXTRACT_TO)
    print("Extracted to:", EXTRACT_TO)

# Print the extracted tree: every directory is listed, indented by its depth
# below EXTRACT_TO; up to five file names are shown, but only for directories
# at depth 0 and 1.
for current_dir, _subdirs, file_names in os.walk(EXTRACT_TO):
    depth = current_dir.replace(EXTRACT_TO, '').count(os.sep)
    pad = '  ' * depth
    print(f"{pad}{Path(current_dir).name}/")
    if depth >= 2:
        continue
    for file_name in file_names[:5]:
        print(f"{pad}  - {file_name}")


In [None]:

# 2) Basic EDA - class counts, sample images, image sizes
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import random

# Try to detect image folder inside the extracted directory
def find_image_folder(base):
    """Locate the dataset root, i.e. the folder whose sub-folders hold the images.

    The rest of the notebook walks the returned folder and uses the name of
    each image's containing directory as its class label. Returning the first
    directory that directly contains images would therefore restrict the
    dataset to a single class, so the *parent* of that directory is returned
    instead.

    Args:
        base: Directory to search (recursively).

    Returns:
        ``base`` itself when images sit directly inside it, otherwise the
        parent of the first directory found that contains ``.jpg``/``.jpeg``/
        ``.png`` files, or ``None`` when no image file exists under ``base``.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    for root, dirs, files in os.walk(base):
        # Sorting in place makes the traversal order independent of the
        # filesystem, so the same folder is picked on every run.
        dirs.sort()
        if not any(name.lower().endswith(image_exts) for name in files):
            continue
        if os.path.normpath(root) == os.path.normpath(base):
            # Images live directly in `base`; never climb above the search root.
            return root
        return os.path.dirname(root)
    return None

IMG_FOLDER = find_image_folder("/mnt/data/riceleafdataset")
print("Image folder detected:", IMG_FOLDER)

from collections import defaultdict

# Per-class image counts and up to five example paths per class. The class
# name is the name of the directory that directly contains the image.
class_counts = defaultdict(int)
samples = defaultdict(list)
# (width, height) of every image that could be opened.
sizes = []
# Number of image files that could not be opened while reading their size.
unreadable = 0

# A single walk collects counts, samples and sizes. It is skipped entirely
# when no image folder was detected, because os.walk(None) would raise.
if IMG_FOLDER:
    for root, dirs, files in os.walk(IMG_FOLDER):
        cls = Path(root).name
        for f in files:
            if not f.lower().endswith(('.jpg','.jpeg','.png')):
                continue
            path = os.path.join(root, f)
            class_counts[cls] += 1
            if len(samples[cls]) < 5:
                samples[cls].append(path)
            try:
                # The context manager closes the file handle right away.
                with Image.open(path) as im:
                    sizes.append(im.size)
            except (OSError, ValueError):
                # Best effort: a corrupt file is skipped for the size
                # statistics but still counted for its class.
                unreadable += 1

print("Class counts:")
for k,v in class_counts.items():
    print(f" - {k}: {v}")
if unreadable:
    print(f"Skipped {unreadable} unreadable image file(s) in the size statistics.")

if class_counts:
    # class distribution plot
    plt.figure(figsize=(6,4))
    plt.bar(list(class_counts.keys()), list(class_counts.values()))
    plt.title("Class distribution")
    plt.ylabel("Number of images")
    plt.xlabel("Class")
    plt.tight_layout()
    plt.show()

    # sample images: the first collected example of each class, side by side
    plt.figure(figsize=(10,4))
    i = 1
    for cls, imgs in samples.items():
        if imgs:
            plt.subplot(1, len(samples), i)
            with Image.open(imgs[0]) as img:
                plt.imshow(img)
            plt.title(cls)
            plt.axis('off')
            i += 1
    plt.show()
else:
    # plt.subplot(1, 0, ...) is invalid, so nothing is plotted without classes.
    print("No images found; skipping plots.")

# image size stats
if sizes:
    sizes_arr = np.array(sizes)
    print("Image size stats (w x h):")
    print(" - min:", sizes_arr.min(axis=0))
    print(" - max:", sizes_arr.max(axis=0))
    print(" - median:", np.median(sizes_arr, axis=0))
else:
    print("No image sizes found.")


In [None]:

# 3) Prepare Keras ImageDataGenerators (train/val split) and utility functions
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# Target image size handed to the generators, batch size, and the seed used
# for the train/validation split.
IMG_SIZE = (224,224)
BATCH_SIZE = 8
RANDOM_SEED = 42

if not IMG_FOLDER:
    # Fail with a clear message instead of the TypeError from os.walk(None).
    raise FileNotFoundError(
        "No image folder was detected; extract the dataset before building the train/validation split."
    )

# build file list and labels (label = name of the directory holding the image)
filepaths = []
labels = []
for root, dirs, files in os.walk(IMG_FOLDER):
    # os.walk yields entries in filesystem order. Sorting directories and
    # files fixes the order of `filepaths`, so the seeded split below selects
    # the same files on every machine.
    dirs.sort()
    for f in sorted(files):
        if f.lower().endswith(('.jpg','.jpeg','.png')):
            filepaths.append(os.path.join(root,f))
            labels.append(Path(root).name)

print("Total images found:", len(filepaths))

if not filepaths:
    raise FileNotFoundError(f"No .jpg/.jpeg/.png files found under {IMG_FOLDER}")

# 80/20 split that keeps the class proportions equal in both parts.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    filepaths, labels, test_size=0.2, stratify=labels, random_state=RANDOM_SEED
)

# Every generator multiplies pixel values by the same factor.
_RESCALE = 1./255

# Plain generators (no augmentation) for baseline training and for validation.
train_datagen = ImageDataGenerator(rescale=_RESCALE)
val_datagen = ImageDataGenerator(rescale=_RESCALE)

# Random transformations applied only by the augmented training generator.
_AUGMENTATION = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    zoom_range=0.15,
    brightness_range=(0.7,1.3),
    fill_mode='nearest',
)
train_aug_datagen = ImageDataGenerator(rescale=_RESCALE, **_AUGMENTATION)

def paths_to_generator(paths, labels, datagen, shuffle=True, seed=None, classes=None):
    """Wrap parallel lists of image paths and labels in a Keras batch iterator.

    Args:
        paths: Image file paths.
        labels: Class name for each entry of ``paths`` (same length).
        datagen: ``ImageDataGenerator`` that supplies rescaling/augmentation.
        shuffle: Whether the iterator shuffles the samples.
        seed: Seed for shuffling and random transformations. ``None``
            (default) uses the notebook-wide ``RANDOM_SEED`` so that runs are
            repeatable.
        classes: Optional explicit list of class names. Passing the same list
            for every split guarantees identical label indices even if a
            split happens to miss a class. ``None`` lets Keras infer the
            classes from ``labels``.

    Returns:
        The iterator produced by ``datagen.flow_from_dataframe``, yielding
        batches of ``BATCH_SIZE`` images resized to ``IMG_SIZE`` with one-hot
        ("categorical") labels.
    """
    # pandas is only needed here, so it is imported locally.
    import pandas as pd

    df = pd.DataFrame({"filename": paths, "class": labels})
    return datagen.flow_from_dataframe(
        df,
        x_col="filename",
        y_col="class",
        target_size=IMG_SIZE,
        classes=classes,
        class_mode="categorical",
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        seed=RANDOM_SEED if seed is None else seed,
    )

# Training iterators: one without and one with augmentation (both shuffled by default).
train_gen = paths_to_generator(train_paths, train_labels, datagen=train_datagen)
train_aug_gen = paths_to_generator(train_paths, train_labels, datagen=train_aug_datagen)
# Validation iterator keeps the sample order fixed (no shuffling).
val_gen = paths_to_generator(val_paths, val_labels, datagen=val_datagen, shuffle=False)


In [None]:

# 4) Define a simple CNN model (baseline)
from tensorflow.keras import layers, models

def build_simple_cnn(input_shape=IMG_SIZE + (3,), num_classes=None):
    """Build the baseline CNN classifier (uncompiled).

    Architecture: three 3x3 convolution + max-pooling stages with 32, 64 and
    128 filters, then flatten, a 128-unit ReLU dense layer, dropout (0.4) and
    a softmax output layer.

    Args:
        input_shape: Shape of one input image.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras functional ``Model``.
    """
    image_in = layers.Input(shape=input_shape)

    features = image_in
    for n_filters in (32, 64, 128):
        features = layers.Conv2D(n_filters, (3,3), activation='relu')(features)
        features = layers.MaxPooling2D()(features)

    hidden = layers.Flatten()(features)
    hidden = layers.Dense(128, activation='relu')(hidden)
    hidden = layers.Dropout(0.4)(hidden)
    class_probs = layers.Dense(num_classes, activation='softmax')(hidden)

    return models.Model(image_in, class_probs)

# Derive the class count from the label-to-index mapping. `class_indices` is
# the attribute this notebook already relies on elsewhere for the iterators
# returned by flow_from_dataframe.
# NOTE(review): `num_classes` appears to be set only on directory-based
# iterators, not on DataFrameIterator — confirm against the installed Keras.
num_classes = len(train_gen.class_indices)
model_baseline = build_simple_cnn(num_classes=num_classes)
# Categorical cross-entropy matches the one-hot labels produced with
# class_mode="categorical".
model_baseline.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_baseline.summary()


In [None]:

# 5) Train baseline model (small epochs for quick testing)
# Upper bound on the number of epochs; early stopping may end training sooner.
EPOCHS = 8

# Stop when validation loss has not improved for 3 epochs and restore the
# weights from the best epoch.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
    ),
]

history_baseline = model_baseline.fit(
    train_gen,
    epochs=EPOCHS,
    validation_data=val_gen,
    callbacks=callbacks,
)

# One figure per metric, each showing the training and validation curve.
for _title, _curves in (
    ('Baseline Model Accuracy', (('accuracy', 'train_acc'), ('val_accuracy', 'val_acc'))),
    ('Baseline Model Loss', (('loss', 'train_loss'), ('val_loss', 'val_loss'))),
):
    plt.figure()
    for _metric, _label in _curves:
        plt.plot(history_baseline.history[_metric], label=_label)
    plt.title(_title)
    plt.legend()
    plt.show()


In [None]:

# 6) Train model with data augmentation (same architecture)
# Same architecture as the baseline, trained on the augmented generator.
model_aug = build_simple_cnn(num_classes=num_classes)
model_aug.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

EPOCHS = 12
history_aug = model_aug.fit(
    train_aug_gen,
    epochs=EPOCHS,
    validation_data=val_gen,
    callbacks=callbacks,
)

# One figure per metric, each showing the training and validation curve.
for _title, _curves in (
    ('Augmented Model Accuracy', (('accuracy', 'train_acc_aug'), ('val_accuracy', 'val_acc_aug'))),
    ('Augmented Model Loss', (('loss', 'train_loss_aug'), ('val_loss', 'val_loss_aug'))),
):
    plt.figure()
    for _metric, _label in _curves:
        plt.plot(history_aug.history[_metric], label=_label)
    plt.title(_title)
    plt.legend()
    plt.show()


In [None]:

# 7) Confusion matrix + classification report for best model (choose augmented model for example)
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# The augmented model is used as the "best" model for evaluation and saving.
best_model = model_aug

# Number of batches needed to cover every validation sample once.
val_steps = int(np.ceil(val_gen.samples / val_gen.batch_size))
preds = best_model.predict(val_gen, steps=val_steps)
# Predictions are compared position by position with val_gen.classes, which
# relies on val_gen yielding samples in a fixed order (it is built with
# shuffle=False).
y_true = val_gen.classes
y_pred = preds.argmax(axis=1)

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

# Class names in the order of the generator's class_indices mapping.
class_names = list(val_gen.class_indices.keys())
tick_positions = range(len(class_names))

plt.figure(figsize=(6,5))
plt.imshow(cm, interpolation='nearest')
plt.title("Confusion Matrix")
plt.colorbar()
plt.xlabel("Predicted")
plt.ylabel("True")
plt.xticks(tick_positions, class_names, rotation=45)
plt.yticks(tick_positions, class_names)
plt.tight_layout()
plt.show()

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=class_names))


In [None]:

# 8) Transfer Learning (MobileNetV2) - try to use pretrained weights; fallback if not available
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import Model

def build_mobilenetv2(num_classes, input_shape=IMG_SIZE+(3,), weights='imagenet', freeze_base=None):
    """Build a MobileNetV2-based classifier with a new softmax head (uncompiled).

    Args:
        num_classes: Number of units in the softmax output layer.
        input_shape: Shape of one input image.
        weights: Forwarded to ``MobileNetV2``; ``'imagenet'`` requests the
            pretrained weights, ``None`` a randomly initialised backbone.
        freeze_base: Whether the backbone is frozen. ``None`` (default)
            freezes it exactly when ``weights`` is not ``None``, so pretrained
            features are kept intact while the new head is trained; a randomly
            initialised backbone stays trainable.

    Returns:
        A Keras ``Model`` mapping images scaled to [0, 1] to class
        probabilities.
    """
    base = MobileNetV2(include_top=False, input_shape=input_shape, weights=weights)
    if freeze_base is None:
        freeze_base = weights is not None
    base.trainable = not freeze_base

    inputs = layers.Input(shape=input_shape)
    # The notebook's generators rescale pixels by 1/255, giving [0, 1];
    # MobileNetV2's ImageNet weights were trained on inputs in [-1, 1].
    x = layers.Rescaling(scale=2.0, offset=-1.0)(inputs)
    # A frozen backbone is called with training=False so that its batch-norm
    # layers keep using their stored statistics.
    x = base(x, training=False) if freeze_base else base(x)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.4)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    return Model(inputs=inputs, outputs=outputs)

# Build with ImageNet weights when they can be loaded; on any failure, report
# the error and build the same network without pretrained weights.
try:
    model_tl = build_mobilenetv2(num_classes=num_classes, weights='imagenet')
except Exception as err:
    print('Could not load pretrained weights (likely offline). Falling back to uninitialized MobileNetV2. Error:', err)
    model_tl = build_mobilenetv2(num_classes=num_classes, weights=None)
else:
    print('Loaded MobileNetV2 with ImageNet weights.')

model_tl.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_tl.summary()

# Trained on the augmented generator, reusing the callbacks list defined for
# the baseline run.
EPOCHS = 8
history_tl = model_tl.fit(
    train_aug_gen,
    epochs=EPOCHS,
    validation_data=val_gen,
    callbacks=callbacks,
)

# Training vs. validation accuracy per epoch.
plt.figure()
for _metric, _label in (('accuracy', 'train_acc_tl'), ('val_accuracy', 'val_acc_tl')):
    plt.plot(history_tl.history[_metric], label=_label)
plt.title('Transfer Learning Accuracy')
plt.legend()
plt.show()


In [None]:

# 9) Save the best model and label map
# Destination of the saved model; also the default model path of predict_image.
MODEL_OUT = "/mnt/data/rice_leaf_best_model.h5"
best_model.save(MODEL_OUT)
print("Saved model to:", MODEL_OUT)

import json
# Invert class_indices (name -> index) into index -> name. JSON serialisation
# turns the integer keys into strings.
labels_map = {index: name for name, index in val_gen.class_indices.items()}
with open("/mnt/data/label_map.json", "w") as label_file:
    label_file.write(json.dumps(labels_map))
print("Saved label map to /mnt/data/label_map.json")


In [None]:

# 10) Prediction helper: load model and predict on a new image
from tensorflow.keras.models import load_model
import numpy as np
from PIL import Image

def predict_image(img_path, model_path=MODEL_OUT, label_map_path="/mnt/data/label_map.json"):
    """Classify a single image file with a saved model.

    The model and the label map are loaded from disk on every call.

    Args:
        img_path: Path of the image to classify.
        model_path: Path of the saved Keras model.
        label_map_path: Path of the JSON file mapping class index (as a
            string key) to class name.

    Returns:
        A ``(class_name, confidence)`` tuple, where ``confidence`` is the
        highest predicted probability as a Python float.
    """
    import json

    model = load_model(model_path)
    with open(label_map_path, "r") as f:
        label_map = json.load(f)

    # IMG_SIZE is used as (height, width) by the Keras generators, whereas PIL
    # expects (width, height). NEAREST matches the documented default
    # interpolation of flow_from_dataframe, which the notebook calls without
    # an interpolation argument. The context manager closes the file handle;
    # the converted copy stays valid afterwards.
    with Image.open(img_path) as img:
        resized = img.convert('RGB').resize((IMG_SIZE[1], IMG_SIZE[0]), Image.NEAREST)

    # Same 1/255 scaling as the training generators, plus a batch axis.
    arr = np.asarray(resized, dtype=np.float32) / 255.0
    arr = np.expand_dims(arr, 0)

    preds = model.predict(arr)
    cls = int(np.argmax(preds, axis=1)[0])
    # JSON object keys are strings, hence the str() lookup.
    return label_map[str(cls)], float(np.max(preds))

# Example usage (uncomment and provide path):
# print(predict_image("/path/to/some_leaf.jpg"))



---

## Notes & Next Steps (for GitHub upload)
- Notebook saved as this single `.ipynb` file — suitable for local Jupyter and for pushing to GitHub.
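- To run faster, use a GPU runtime (Colab or a local GPU). For quick tests, reduce `EPOCHS` or `IMG_SIZE`; reduce `BATCH_SIZE` only if you run out of memory, since a smaller batch size lowers memory use but does not shorten training.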
- Transfer-learning with ImageNet weights needs internet. The code falls back if offline.
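- I recommend adding a small `requirements.txt` when uploading to GitHub. The list below names the required packages without versions; pin each one to the version you tested with (for example, `numpy==<version>`):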
```
tensorflow
numpy
pandas
matplotlib
scikit-learn
Pillow
```
- If you want, I can also generate a ready-made `README.md` and `requirements.txt` and package all files into a ZIP for upload.
