# Exploratory Data Analysis: coffee leaf

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
from PIL import Image, UnidentifiedImageError

# Directory that holds the raw photographs analysed in this notebook.
dataset_path = './dataset/Photos'

# os.listdir() returns entries in arbitrary order; sorting keeps the file list
# (and every per-image result derived from it) reproducible between runs.
# The extension test is case-insensitive.
image_files = sorted(
    f for f in os.listdir(dataset_path) if f.lower().endswith(('.jpg', '.png'))
)

print(f"Total images in dataset: {len(image_files)}")


#### Check image integrity
The integrity check looks for corrupt files, i.e. images that cannot be opened.
Every corrupt file that is found is reported and then removed from the dataset.

In [None]:

# File names (relative to dataset_path) that Pillow could not validate.
corrupted_images = []

for img_file in image_files:
    img_path = os.path.join(dataset_path, img_file)

    try:
        with Image.open(img_path) as img:
            # verify() checks the file for damage without decoding pixel data.
            img.verify()
    except (OSError, SyntaxError):
        # UnidentifiedImageError and IOError are both OSError in Python 3;
        # some Pillow format plugins report a broken file with SyntaxError.
        corrupted_images.append(img_file)

if corrupted_images:
    print(f"Found {len(corrupted_images)} corrupted images:")
    for img in corrupted_images:
        print(img)
else:
    print("No corrupted images found!")

# NOTE: this permanently deletes the offending files from the dataset folder.
for img_file in corrupted_images:
    os.remove(os.path.join(dataset_path, img_file))
    print(f"Deleted corrupted file: {img_file}")

# Keep the in-memory file list in sync with what is left on disk so later
# cells do not iterate over files that no longer exist.
if corrupted_images:
    removed = set(corrupted_images)
    image_files = [f for f in image_files if f not in removed]


#### Check for image saturation (brightness distribution)


In [None]:
def compute_brightness(image):
    """Return the mean grayscale intensity of a BGR image.

    The image is converted with ``cv2.COLOR_BGR2GRAY`` and the average of the
    resulting pixel values is returned.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return np.mean(grayscale)

# Decode each image exactly once. cv2.imread() returns None for files it
# cannot read, so those are skipped instead of being passed on.
brightness_values = []
for img_file in image_files:
    image = cv2.imread(os.path.join(dataset_path, img_file))
    if image is None:
        continue
    brightness_values.append(compute_brightness(image))

# Histogram of the per-image mean brightness.
plt.hist(brightness_values, bins=30, color='gray')
plt.title("Brightness Distribution")
plt.xlabel("Brightness")
plt.ylabel("Frequency")
plt.show()


#### Check for blurry images

In [None]:
def variance_of_laplacian(image):
    """Return the variance of the Laplacian of ``image``.

    The Laplacian is computed with ``cv2.CV_64F`` output depth; the variance of
    that response is used in this notebook as a per-image blur score.
    """
    laplacian = cv2.Laplacian(image, cv2.CV_64F)
    return laplacian.var()

# Read every image once, directly in grayscale, and skip files OpenCV cannot
# decode (cv2.imread() returns None for those).
blur_scores = []
for img_file in image_files:
    gray = cv2.imread(os.path.join(dataset_path, img_file), cv2.IMREAD_GRAYSCALE)
    if gray is None:
        continue
    blur_scores.append(variance_of_laplacian(gray))

# Histogram of the per-image Laplacian variance.
plt.hist(blur_scores, bins=30, color='blue')
plt.title("Blurriness Distribution")
plt.xlabel("Variance of Laplacian")
plt.ylabel("Frequency")
plt.show()


In [None]:
# Per-image mean intensity of each colour channel, keyed by channel letter.
means = {'R': [], 'G': [], 'B': []}

for img_file in image_files:
    img = cv2.imread(os.path.join(dataset_path, img_file))
    if img is None:
        continue  # unreadable file, nothing to measure
    # The first three values returned by cv2.mean() are the B, G, R means.
    channel_means = cv2.mean(img)[:3]
    for channel, value in zip('BGR', channel_means):
        means[channel].append(value)

# Overlay one semi-transparent histogram per channel, drawn in R, G, B order.
plt.figure(figsize=(12, 6))
for channel, values in means.items():
    plt.hist(values, bins=30, alpha=0.6, label=channel, color=channel.lower())
plt.title("Color Channel Intensity Distribution")
plt.xlabel("Intensity")
plt.ylabel("Frequency")
plt.legend()
plt.show()


#### Clustering similar images using K-MEANS

In [None]:
from sklearn.cluster import KMeans

def extract_color_histogram(image_path, bins=(8, 8, 8)):
    """Return a flattened, normalised 3-D HSV colour histogram of an image.

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.
        bins: Number of histogram bins for the H, S and V channels.

    Returns:
        A 1-D array with ``bins[0] * bins[1] * bins[2]`` entries.

    Raises:
        cv2.error: If the file cannot be read (``cv2.imread`` then returns
            ``None``, which ``cv2.cvtColor`` rejects).
    """
    img = cv2.imread(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # For 8-bit images OpenCV stores hue in [0, 180), not [0, 256); using the
    # real range spreads the hue bins over values that can actually occur.
    hist = cv2.calcHist([img], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])
    cv2.normalize(hist, hist)  # normalises in place
    return hist.flatten()

# Build the feature matrix together with the list of files that actually
# produced a feature row. Unreadable files are skipped, so pairing the labels
# with the unfiltered image_files would shift every label after a skipped file.
# (The readability probe decodes the image a second time because
# extract_color_histogram() takes a path rather than a decoded image.)
clustered_files = []
histograms = []
for img_file in image_files:
    img_path = os.path.join(dataset_path, img_file)
    if cv2.imread(img_path) is None:
        continue
    clustered_files.append(img_file)
    histograms.append(extract_color_histogram(img_path))

features = np.array(histograms)
num_clusters = 2  # Change this based on expected groups
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
labels = kmeans.fit_predict(features)

# Group file names by the cluster they were assigned to.
clustered_images = {i: [] for i in range(num_clusters)}
for img_file, label in zip(clustered_files, labels):
    clustered_images[label].append(img_file)

# Print results
for cluster, images in clustered_images.items():
    print(f"Cluster {cluster}: {len(images)} images")


#### Model training with SVC

In [None]:
# Excel workbook loaded into a DataFrame
# (presumably the per-image class labels - confirm against the dataset docs).
file_path = "./dataset/RoCoLe-classes.xlsx"
df = pd.read_excel(io=file_path)

# Preview the first five rows.
df.head(n=5)


