In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split


In [None]:
# Supervised Learning

# Paths
merged_dataset_path = '/Users/maggie/Desktop/project_3/Plant_ID_and_Diagnosis/Resources/plant_dr_master_dataset/master_dataset'
combined_csv_path = '/Users/maggie/Desktop/project_3/Plant_ID_and_Diagnosis/Resources/plant_dr_master_dataset/master_dataset/combined_labels.csv'

# Load the CSV file
df = pd.read_csv(combined_csv_path)

# Split the dataset into train, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

# Print the distribution to verify the split
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(valid_df)}")
print(f"Test set size: {len(test_df)}")

# ImageDataGenerator setup
img_size = (128, 128)
datagen = ImageDataGenerator(rescale=1./255)

# Load training data using flow_from_dataframe
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=merged_dataset_path,
    x_col='filename',  # Column name in the CSV file that contains the image file names
    y_col='label',  # Column name in the CSV file that contains the labels
    target_size=img_size,
    batch_size=32,
    class_mode='categorical'
)

# Load validation data using flow_from_dataframe
valid_generator = datagen.flow_from_dataframe(
    dataframe=valid_df,
    directory=merged_dataset_path,
    x_col='filename',
    y_col='label',
    target_size=img_size,
    batch_size=32,
    class_mode='categorical'
)

# Load test data using flow_from_dataframe
test_generator = datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=merged_dataset_path,
    x_col='filename',
    y_col='label',
    target_size=img_size,
    batch_size=32,
    class_mode='categorical',
    shuffle=False  # No need to shuffle for testing
)

# Load the base model (EfficientNetB0)
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# Freeze base model layers
for layer in base_model.layers:
    layer.trainable = False

# Add custom layers
x1 = GlobalAveragePooling2D()(base_model.output)
x2 = Dense(1024, activation='relu')(x1)

# Determine the number of classes
num_classes = len(train_generator.class_indices)  # Number of unique classes

# Output layer
x3 = Dense(num_classes, activation='softmax')(x2)

model = Model(inputs=base_model.input, outputs=x3)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_generator, validation_data=valid_generator, epochs=5)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(test_generator)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Training set size: 53709
Validation set size: 11509
Test set size: 11510
Found 53709 validated image filenames belonging to 167 classes.
Found 11509 validated image filenames belonging to 167 classes.
Found 11510 validated image filenames belonging to 167 classes.
Epoch 1/5


  self._warn_if_super_not_called()


[1m1679/1679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m812s[0m 478ms/step - accuracy: 0.0680 - loss: 4.2192 - val_accuracy: 0.0698 - val_loss: 4.1718
Epoch 2/5
[1m1679/1679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 312ms/step - accuracy: 0.0687 - loss: 4.1712 - val_accuracy: 0.0698 - val_loss: 4.1569
Epoch 3/5
[1m1679/1679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 192ms/step - accuracy: 0.0695 - loss: 4.1549 - val_accuracy: 0.0698 - val_loss: 4.1482
Epoch 4/5
[1m1679/1679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - accuracy: 0.0668 - loss: 4.1578

In [None]:
# Evaluate the model on the validation set

val_loss, val_accuracy = model.evaluate(valid_generator)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")


In [None]:
# Unsupervised Learning - Apply only after training is complete

# Extract features
feature_model = Model(inputs=model.input, outputs=model.layers[-2].output)  # Using the layer before the output
features = feature_model.predict(train_generator)  # Using the training set for feature extraction

# Apply PCA
pca = PCA(n_components=50)
reduced_features = pca.fit_transform(features)

# Apply K-Means Clustering
kmeans = KMeans(n_clusters=10)
clusters = kmeans.fit_predict(reduced_features)

# Visualize clusters
plt.figure(figsize=(10, 7))
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=clusters, cmap='viridis')
plt.colorbar()
plt.title('K-Means Clustering of Plant and Ailment Features')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()
