# Imports

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Model
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.preprocessing import image as keras_image


# Train/Validate/Test

In [None]:
# Paths
merged_dataset_path = '/Users/maggie/Desktop/project_3/Plant_ID_and_Diagnosis/Resources/plant_dr_master_dataset/master_dataset'
combined_csv_path = '/Users/maggie/Desktop/project_3/Plant_ID_and_Diagnosis/Resources/plant_dr_master_dataset/master_dataset/combined_labels.csv'

# Load the CSV file (expects at least a 'filename' and a 'label' column,
# since both are referenced below)
df = pd.read_csv(combined_csv_path)

# Print the first few rows to verify
print(df.head())

# Split the dataset into train (70%), validation (15%), and test (15%) sets.
# Both splits are stratified on the label so class proportions are preserved.
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

# Print the distribution to verify the split
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(valid_df)}")
print(f"Test set size: {len(test_df)}")

# ImageDataGenerator setup
img_size = (128, 128)

# Training generator: rescaling plus random augmentation.
datagen = ImageDataGenerator(
    rescale=1./255,
    # NOTE: validation_split only takes effect when a flow is created with
    # subset='training'/'validation'; no flow in this cell does so, because
    # the split is already done with train_test_split above.
    validation_split=0.2,
    rotation_range=20,       # Randomly rotate images in the range (degrees, 0 to 180)
    width_shift_range=0.2,   # Randomly shift images horizontally (fraction of total width)
    height_shift_range=0.2,  # Randomly shift images vertically (fraction of total height)
    shear_range=0.2,         # Shear Intensity (Shear angle in counter-clockwise direction in degrees)
    zoom_range=0.2,          # Randomly zoom image
    horizontal_flip=True,    # Randomly flip images horizontally
    fill_mode='nearest'      # Points outside the boundaries of the input are filled according to the given mode
)

# Evaluation generator: rescaling only. Validation and test images must not
# be randomly augmented, otherwise the reported metrics are measured on
# distorted inputs and change from run to run.
eval_datagen = ImageDataGenerator(rescale=1./255)

# Load training data using flow_from_dataframe
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=merged_dataset_path,
    x_col='filename',  # Column name in the CSV file that contains the image file names
    y_col='label',  # Column name in the CSV file that contains the labels
    target_size=img_size,
    batch_size=32,
    class_mode='categorical'
)

# Load validation data using flow_from_dataframe (no augmentation)
valid_generator = eval_datagen.flow_from_dataframe(
    dataframe=valid_df,
    directory=merged_dataset_path,
    x_col='filename',
    y_col='label',
    target_size=img_size,
    batch_size=32,
    class_mode='categorical'
)

# Load test data using flow_from_dataframe (no augmentation)
test_generator = eval_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=merged_dataset_path,
    x_col='filename',
    y_col='label',
    target_size=img_size,
    batch_size=32,
    class_mode='categorical',
    shuffle=False  # Keep file order so predictions line up with test_generator.classes
)


# Unsupervised Learning

In [None]:
# Unsupervised Learning - Apply only after training is complete
# (this cell reads `model`, which is built and trained in the Supervised
# Learning section further down the notebook)

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Dedicated generator for feature extraction: rescaling only, no labels and
# no shuffling. Using the augmenting, shuffled train_generator here would
# cluster randomly distorted images in a random order, so cluster ids could
# not be mapped back to files and would differ between runs.
feature_generator = ImageDataGenerator(rescale=1./255).flow_from_dataframe(
    dataframe=train_df,
    directory=merged_dataset_path,
    x_col='filename',
    target_size=img_size,
    batch_size=32,
    class_mode=None,
    shuffle=False
)

# Extract features
feature_model = Model(inputs=model.input, outputs=model.layers[-2].output)  # Using the layer before the output
features = feature_model.predict(feature_generator)  # Row i corresponds to feature_generator.filenames[i]

# Apply PCA (random_state pins the solver when a randomized one is selected)
pca = PCA(n_components=30, random_state=42)
reduced_features = pca.fit_transform(features)

# Apply K-Means Clustering
# n_init and random_state are explicit so the result does not depend on the
# installed scikit-learn's defaults or on the global NumPy RNG state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(reduced_features)

# Visualize clusters on the first two principal components
plt.figure(figsize=(10, 7))
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=clusters, cmap='viridis')
plt.colorbar()
plt.title('K-Means Clustering of Plant and Ailment Features')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()


# Supervised Learning

In [None]:
# Load the base model: VGG16 convolutional stack with ImageNet weights and
# without its original classifier head. The input shape is derived from
# img_size so it cannot drift from the size the generators resize images to.
base_model = VGG16(weights='imagenet', include_top=False, input_shape=img_size + (3,))

# Freeze base model layers so only the new head is trained at first
for layer in base_model.layers:
    layer.trainable = False

# Add custom layers
x1 = Flatten()(base_model.output)
x2 = Dense(1024, activation='relu')(x1)

# Determine the number of classes
num_classes = len(train_generator.class_indices)  # Number of unique classes

# Output layer: one softmax unit per class
x3 = Dense(num_classes, activation='softmax')(x2)

model = Model(inputs=base_model.input, outputs=x3)


In [None]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model (only the new head is trainable while the base is frozen).
# The returned History is kept, mirroring fine_tune_history in the
# fine-tuning cell, so per-epoch metrics of both stages can be inspected.
history = model.fit(train_generator, validation_data=valid_generator, epochs=7)


In [None]:
# Score the current model on the validation generator.
# evaluate() is unpacked into the loss followed by the accuracy metric.
val_loss, val_accuracy = model.evaluate(valid_generator)
print(
    f"Validation Loss: {val_loss}",
    f"Validation Accuracy: {val_accuracy}",
    sep="\n",
)


In [None]:
# Fine-tuning stage: make the final four layers of the VGG16 base trainable
for unfrozen_layer in base_model.layers[-4:]:
    unfrozen_layer.trainable = True

# The trainable flags changed, so the model is compiled again; a small
# learning rate is used for this stage.
fine_tune_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=fine_tune_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Continue training with the partially unfrozen base
fine_tune_history = model.fit(train_generator, validation_data=valid_generator, epochs=7)


In [None]:
# Final check: score the fine-tuned model on the test generator
fine_tuned_test_loss, fine_tuned_test_accuracy = model.evaluate(test_generator)
print(
    f"Fine-Tuned Test Loss: {fine_tuned_test_loss}",
    f"Fine-Tuned Test Accuracy: {fine_tuned_test_accuracy}",
    sep="\n",
)


# Misclassified Images

In [None]:
# Obtain predictions for the test set
predictions = model.predict(test_generator)
predicted_labels = np.argmax(predictions, axis=1)
# Convert explicitly so the comparison below is an element-wise array
# comparison regardless of the container type the generator exposes.
true_labels = np.asarray(test_generator.classes)

# Identify misclassified indices
misclassified_indices = np.where(predicted_labels != true_labels)[0]

# Map class indices to class labels
class_labels = list(test_generator.class_indices.keys())

# Display some misclassified images
num_misclassified_to_display = 10  # Maximum number of misclassified images to display
shown_indices = misclassified_indices[:num_misclassified_to_display]

if shown_indices.size == 0:
    # Nothing to plot; avoid showing an empty figure.
    print("No misclassified images in the test set.")
else:
    # Size the grid from the number of images actually shown, so changing
    # num_misclassified_to_display cannot overflow a fixed 5x2 layout.
    n_cols = 2
    n_rows = -(-shown_indices.size // n_cols)  # ceiling division
    plt.figure(figsize=(15, 3 * n_rows))
    for i, index in enumerate(shown_indices):
        plt.subplot(n_rows, n_cols, i + 1)

        # Load the image at the resolution the model was fed and scale the
        # pixel values to [0, 1] for display.
        img_path = test_generator.filepaths[index]
        img = keras_image.load_img(img_path, target_size=img_size)
        img = keras_image.img_to_array(img) / 255.0

        plt.imshow(img)
        plt.axis('off')

        true_label = class_labels[true_labels[index]]
        predicted_label = class_labels[predicted_labels[index]]
        plt.title(f"True: {true_label}\nPred: {predicted_label}")

    plt.tight_layout()
    plt.show()


# Predict on Unseen Images

In [None]:
from tensorflow.keras.preprocessing import image
import numpy as np

def load_and_preprocess_image(img_path):
    """Load one image file and return it as a rescaled batch of one.

    The image at ``img_path`` is loaded at the notebook-level ``img_size``,
    converted to an array, given a leading batch axis, and divided by 255.
    """
    loaded = image.load_img(img_path, target_size=img_size)
    pixels = image.img_to_array(loaded)
    single_batch = pixels[np.newaxis, ...]  # Add batch dimension
    return single_batch / 255.0  # Rescale the image

# Predict a single image
new_image_path = '/Users/maggie/Desktop/project_3/Plant_ID_and_Diagnosis/Resources/plant_identifcation_resources/crop_pest_and_disease/tomato/healthy/healthy16_.jpg'  # Replace with your image path
img_array = load_and_preprocess_image(new_image_path)
prediction = model.predict(img_array)

# Map the prediction to class names.
# class_indices maps label -> index; invert it explicitly instead of relying
# on the dict's key order happening to match the index order.
index_to_class = {index: name for name, index in train_generator.class_indices.items()}
predicted_class_index = np.argmax(prediction[0])  # prediction holds one row for the single image
predicted_class_name = index_to_class[predicted_class_index]

print(f"Predicted class: {predicted_class_name}")
