# In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.
DATA_PATH = '/Users/rishabhtiwari/Documents/Uni Heidelberg/Academic Year Erasmus/Semester 1/Artificial Neural Network and Deep Learning/Practical/ANN-DL-Competition1/public_data.npz'

# NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code. Only load archives from a trusted source.
# The context manager closes the underlying .npz archive once the arrays have
# been read into memory; `data` must not be indexed after this block.
with np.load(DATA_PATH, allow_pickle=True) as data:
    X = data['data']    # feature array, stored under the key 'data'
    y = data['labels']  # label array, stored under the key 'labels'

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

# Load pre-trained VGG16 model without the top classification layer.
# pooling='avg' makes the model emit one pooled feature vector per image.
model = VGG16(include_top=False, input_shape=(96, 96, 3), pooling='avg')

# Preprocess the images and predict to get feature vectors.
# preprocess_input writes its result over the input array when the dtype is
# compatible (i.e. already floating point), which would silently corrupt X for
# every later use (indexing, display, saving). astype() returns a fresh copy by
# default, so the original pixel data stays untouched.
X_preprocessed = preprocess_input(X.astype('float32'))
features = model.predict(X_preprocessed)

from sklearn.decomposition import PCA

# Reduce the features to a manageable number before clustering.
# PCA cannot extract more components than min(n_samples, n_features), so the
# target of 50 is capped to keep small inputs from raising.
n_components = min(50, *features.shape)
# random_state pins the randomized SVD solver (which scikit-learn may select
# automatically for large inputs) so the projection is reproducible, matching
# the seed used for the train/validation split.
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(features)

from sklearn.cluster import DBSCAN

# Density-based clustering of the PCA-reduced features.
# NOTE(review): eps=0.5 is applied to unscaled PCA coordinates; confirm that it
# yields a sensible number of clusters for this feature scale.
dbscan = DBSCAN(min_samples=5, eps=0.5)
dbscan.fit(X_pca)

# One label per sample; DBSCAN marks noise points with -1.
clusters = dbscan.labels_

# Identify cluster numbers; for DBSCAN, -1 indicates outliers (noise points).
is_clustered = clusters != -1
plant_clusters = set(clusters[is_clustered].tolist())

# Assuming non-plant images are outliers and don't belong to any cluster,
# keep every sample that was assigned to a real cluster. The boolean mask
# replaces a per-element Python loop over the label array.
plant_indices = np.flatnonzero(is_clustered)
if plant_indices.size == 0:
    # Fail here rather than silently saving and training on an empty dataset.
    raise ValueError(
        "DBSCAN labelled every sample as an outlier; "
        "adjust eps/min_samples before filtering the dataset."
    )
X_plants = X[plant_indices]
y_plants = y[plant_indices]

import matplotlib.pyplot as plt

# Display some images considered as outliers (DBSCAN label -1).
outlier_indices = np.flatnonzero(clusters == -1)

# Sampling without replacement raises if more items are requested than exist,
# so show at most 5 and skip the figure entirely when there are no outliers.
n_shown = min(5, outlier_indices.size)
if n_shown:
    sample_outliers = np.random.choice(outlier_indices, size=n_shown, replace=False)

    # squeeze=False keeps `axes` two-dimensional even for a single subplot,
    # so the loop below works for any n_shown.
    fig, axes = plt.subplots(1, n_shown, figsize=(3 * n_shown, 3), squeeze=False)
    for ax, sample_idx in zip(axes[0], sample_outliers):
        # Cast to uint8 for display; assumes pixel values in 0-255 -- TODO confirm.
        ax.imshow(X[sample_idx].astype('uint8'))
        ax.axis('off')
    plt.show()
else:
    print("No outliers found; nothing to display.")

# Persist the filtered images and labels as an uncompressed .npz archive,
# stored under the keys 'X' and 'y'.
clean_arrays = {'X': X_plants, 'y': y_plants}
np.savez('clean_plant_dataset.npz', **clean_arrays)

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Create an image data generator for augmentation
datagen = ImageDataGenerator(
    rotation_range=20,       # Random rotations
    width_shift_range=0.2,   # Random horizontal shifts
    height_shift_range=0.2,  # Random vertical shifts
    shear_range=0.2,         # Random shears
    zoom_range=0.2,          # Random zoom
    horizontal_flip=True,    # Random horizontal flips
    fill_mode='nearest'      # Strategy for filling in newly created pixels
)

# No datagen.fit(...) call is needed here: fit() only computes statistics for
# featurewise_center, featurewise_std_normalization and zca_whitening, and none
# of those options is enabled above. If one is enabled later, fit the generator
# on the training split only, so validation data does not leak into the
# normalization statistics.

from sklearn.model_selection import train_test_split

# Hold out 20% of the filtered samples for validation; the fixed seed keeps
# the partition reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_plants,
    y_plants,
    random_state=42,
    test_size=0.2,
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Small CNN for 96x96 RGB inputs, assembled layer by layer.
# Note: this rebinds `model`, replacing any earlier object of that name.
model = Sequential()

# Three convolution + max-pooling stages with 32, 64 and 128 filters.
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(96, 96, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))

# Classifier head: flatten, one hidden dense layer with dropout, then a single
# sigmoid unit producing a binary probability.
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output.
# NOTE(review): this presumes the labels are numeric 0/1 -- confirm the
# encoding of `y` before fitting.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

