<a href="https://colab.research.google.com/github/Irfan-Riyad/-Customer-Segmentation-Using-RFM-Analysis/blob/main/Copy_of_cse475_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip -q install imagehash

Collecting imagehash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m276.5/296.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imagehash
Successfully installed imagehash-4.3.2


In [3]:
import glob
import os
import shutil
from collections import Counter

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imagehash import phash
from PIL import Image
from scipy.stats import skew, kurtosis
from skimage import img_as_float
from tqdm import tqdm

In [None]:
# Mount Google Drive under /content/drive so the dataset paths used below
# resolve. force_remount=True is passed so the mount is redone on every run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Dataset root on the mounted Drive; the next cell checks that it exists and
# lists its sub-folders.
data_dir = '/content/drive/MyDrive/Original_Data'

In [None]:
# Sanity-check the dataset location before any heavy processing.
print("Dataset path:", data_dir)
print("Exists?", os.path.exists(data_dir))
# os.listdir raises FileNotFoundError for a missing path, so a wrong data_dir
# stops the notebook at this line.
print("Subfolders:", os.listdir(data_dir))

In [None]:
# ==========================
# 0️⃣ Mirror the dataset onto the Colab VM disk (optional, faster reads)
# ==========================
LOCAL_PATH = "/content/Original_Data"
DRIVE_PATH = "/content/drive/MyDrive/Original_Data"

# The copy is performed only once: an existing LOCAL_PATH is taken as-is and
# is not compared against the Drive contents.
if os.path.exists(LOCAL_PATH):
    print(f"✅ Local dataset already exists at {LOCAL_PATH}")
else:
    shutil.copytree(DRIVE_PATH, LOCAL_PATH)
    print(f"✅ Dataset copied locally to {LOCAL_PATH}")

# ==========================
# 1️⃣ Derive the class list from the sub-folder names
# ==========================
BASE_PATH = LOCAL_PATH

# Every directory directly under BASE_PATH counts as one class; plain files
# at that level are ignored.
available_classes = list(
    filter(
        lambda name: os.path.isdir(os.path.join(BASE_PATH, name)),
        os.listdir(BASE_PATH),
    )
)
print(f"Available class folders: {available_classes}")

if len(available_classes) == 0:
    raise ValueError("❌ No class folders found! Check dataset path.")

CLASSES = available_classes
print(f"Using class folders: {CLASSES}")

# ==========================
# 2️⃣ Per-image colour / geometry statistics
# ==========================
def compute_image_stats(image):
    """Summarise one image as a flat dict of colour and size statistics.

    The image is converted to RGB when its mode is anything else. Pixel
    values of both the RGB and the HSV representation are scaled by 1/255
    before any statistic is taken.

    Returned keys, in insertion order:
      * ``{R,G,B}_mean`` / ``{R,G,B}_std`` - per-channel RGB mean and std
      * ``{H,S,V}_mean`` / ``{H,S,V}_std`` - per-channel HSV mean and std
      * ``brightness`` - mean of the V channel
      * ``contrast`` - average of the three per-channel RGB stds
      * ``sat_clip_low`` / ``sat_clip_high`` - fraction of pixels whose
        saturation is below 0.05 / above 0.95
      * ``width``, ``height``, ``aspect_ratio`` (width / height)
    """
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")

    rgb = np.asarray(rgb_image, dtype=np.float32) / 255.0
    hsv = np.asarray(rgb_image.convert("HSV"), dtype=np.float32) / 255.0

    stats = {}
    # RGB channels first, then HSV, so the key order matches R, G, B, H, S, V.
    for channel_names, pixels in (("RGB", rgb), ("HSV", hsv)):
        for idx, name in enumerate(channel_names):
            channel = pixels[:, :, idx]
            stats[f'{name}_mean'] = channel.mean()
            stats[f'{name}_std'] = channel.std()

    stats['brightness'] = hsv[:, :, 2].mean()
    stats['contrast'] = rgb.std(axis=(0, 1)).mean()

    saturation = hsv[:, :, 1]
    stats['sat_clip_low'] = np.mean(saturation < 0.05)
    stats['sat_clip_high'] = np.mean(saturation > 0.95)

    width, height = rgb_image.size
    stats['width'] = width
    stats['height'] = height
    stats['aspect_ratio'] = rgb_image.width / rgb_image.height

    return stats

# ==========================
# 3️⃣ Process all images
# ==========================
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp")

rows = []
skipped_files = []  # (path, error) pairs for files that could not be processed
MAX_IMAGES = None  # Optional per-class cap for a quick test run (e.g., 200)

for cls in CLASSES:
    folder = os.path.join(BASE_PATH, cls)

    # Filter by extension *before* applying MAX_IMAGES so that the cap and the
    # printed count refer to actual images, and sort so that a capped run
    # always sees the same files regardless of filesystem listing order.
    img_paths = sorted(
        p for p in glob.glob(os.path.join(folder, "*.*"))
        if p.lower().endswith(IMAGE_EXTENSIONS)
    )
    if MAX_IMAGES:
        img_paths = img_paths[:MAX_IMAGES]

    if not img_paths:
        print(f"⚠️ No images found in folder: {folder}")
        continue

    print(f"Processing {len(img_paths)} images in class '{cls}'...")
    for path in tqdm(img_paths, desc=f"{cls} images"):
        try:
            # The context manager releases the file handle as soon as the
            # statistics have been computed.
            with Image.open(path) as img:
                stats = compute_image_stats(img)
        except Exception as exc:
            # Best effort: an unreadable file must not abort the whole scan,
            # but it is recorded instead of being dropped silently.
            skipped_files.append((path, repr(exc)))
            continue
        stats["class"] = cls
        stats["filename"] = os.path.basename(path)
        rows.append(stats)

if skipped_files:
    print(f"⚠️ Skipped {len(skipped_files)} file(s) that could not be processed; first few:")
    for skipped_path, reason in skipped_files[:5]:
        print(f"   {skipped_path}: {reason}")

if not rows:
    raise ValueError("❌ No valid images found! Check dataset path or folder structure.")

df = pd.DataFrame(rows)
print(f"✅ Total images processed: {len(df)}")
display(df.head())

# ==========================
# 4️⃣ Sample histogram visualization
# ==========================
sample_row = df.iloc[0]
sample_img_path = os.path.join(BASE_PATH, sample_row["class"], sample_row["filename"])

# Convert to RGB up front: a grayscale or palette image yields a 2-D array,
# for which the per-channel indexing below would raise IndexError.
with Image.open(sample_img_path) as opened:
    sample_img = opened.convert("RGB")
arr = np.asarray(sample_img)

plt.figure(figsize=(12,4))
for i, color in enumerate(['r', 'g', 'b']):
    plt.hist(arr[:, :, i].ravel(), bins=256, color=color, alpha=0.5, label=color.upper())
plt.title("RGB Histogram (Sample)")
plt.legend(); plt.show()

hsv = np.asarray(sample_img.convert("HSV"))
plt.figure(figsize=(12,4))
# H, S and V have no natural display colour, so matplotlib's default colour
# cycle is used and only the legend label identifies the channel.
for i, channel in enumerate(['h', 's', 'v']):
    plt.hist(hsv[:, :, i].ravel(), bins=256, alpha=0.5, label=channel.upper())
plt.title("HSV Histogram (Sample)")
plt.legend(); plt.show()

# ==========================
# 5️⃣ Per-class summary
# ==========================
# Only numeric columns are aggregated; each one produces a "mean" and a "std"
# sub-column, grouped by the "class" column.
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
summary = df.groupby("class")[numeric_cols].agg(["mean","std"])
print("=== Per-Class Mean & STD ===")
display(summary)

# ==========================
# 6️⃣ Brightness vs Contrast
# ==========================
# One point per image, coloured by class.
plt.figure(figsize=(7, 6))
scatter_ax = sns.scatterplot(data=df, x="brightness", y="contrast", hue="class", alpha=0.7)
scatter_ax.set_title("Brightness vs Contrast Spread")
plt.show()

# ==========================
# 7️⃣ Saturation clipping
# ==========================
# Per-class distribution of the fraction of pixels with saturation above 0.95.
plt.figure(figsize=(6, 5))
box_ax = sns.boxplot(data=df, x="class", y="sat_clip_high")
box_ax.set_title("Saturation Clipping (High values)")
plt.show()

# ==========================
# 8️⃣ Resolution & Aspect Ratio
# ==========================
# The three histograms differ only in the column plotted and the title.
for column, title in (
    ("width", "Width Distribution per Class"),
    ("height", "Height Distribution per Class"),
    ("aspect_ratio", "Aspect Ratio Distribution per Class"),
):
    plt.figure(figsize=(7, 5))
    sns.histplot(data=df, x=column, bins=30, hue="class", element="step")
    plt.title(title)
    plt.show()

# ==========================
# 9️⃣ Suggested resize/padding
# ==========================
# Mean width/height over every processed image; int() below truncates the
# fractional part when the suggestion is printed.
mean_w, mean_h = df['width'].mean(), df['height'].mean()
print(f"📏 Suggested resize target: ({int(mean_w)}x{int(mean_h)})")
# Static advice text: it is printed verbatim and is not derived from the
# statistics computed above.
print("""
💡 Suggested Strategy:
- Resize all images to fixed size (e.g., 256×256)
- Maintain aspect ratio by padding (black or mean color)
- For CNNs, random crop + resize augmentation recommended
""")

In [None]:
# Functions
def laplacian_variance(image_path):
    """Return the variance of the Laplacian of the image at ``image_path``.

    The file is loaded as grayscale with OpenCV and the Laplacian is computed
    at 64-bit float depth. Returns ``None`` when ``cv2.imread`` cannot load
    the file.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    return None if gray is None else cv2.Laplacian(gray, cv2.CV_64F).var()

def noise_proxy(image_path):
    """Return the variance of the residual left after a 3x3 Gaussian blur.

    The file is loaded as grayscale with OpenCV; the blurred copy is
    subtracted from the original (both cast to float first, so the
    difference can be negative) and the variance of that residual is
    returned. Returns ``None`` when ``cv2.imread`` cannot load the file.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        return None
    smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
    residual = gray.astype("float") - smoothed.astype("float")
    return residual.var()

# Dataset root on Google Drive
# Example: '/content/drive/MyDrive/ArsenicSkinImageBD/Original'
base_path = '/content/drive/MyDrive/Original_Data'
categories = ['infacted', 'not_infacted']

data = []

# One record per directory entry of each category folder. Entries OpenCV
# cannot read still get a record, with None for both measurements.
for category in categories:
    folder_path = os.path.join(base_path, category)
    for filename in tqdm(os.listdir(folder_path)):
        img_path = os.path.join(folder_path, filename)
        data.append(dict(
            filename=filename,
            category=category,
            laplacian_variance=laplacian_variance(img_path),
            noise_proxy=noise_proxy(img_path),
        ))

# Collect the records into a table
df = pd.DataFrame(data)

# Persist the table on the Colab VM
output_csv = '/content/laplacian_noise_results.csv'
df.to_csv(output_csv, index=False)
print(f"CSV saved to {output_csv}")

# Average sharpness / noise per category
metric_columns = ['laplacian_variance', 'noise_proxy']
print(df.groupby('category')[metric_columns].mean())

In [None]:
# ==============================
# 📘 Image Analysis & Duplicate Detection (GitHub-Safe Version)
# ==============================

from PIL import Image
import numpy as np
import os, glob, itertools, shutil
import pandas as pd
from tqdm import tqdm  # ✅ safer than tqdm.notebook for GitHub/nbconvert

# -------------------------------
# 0️⃣ Image Folder Path
# -------------------------------
# Root directory that this cell scans recursively for image files.
IMAGE_FOLDER = "/content/drive/MyDrive/Original_Data"

# -------------------------------
# 1️⃣ Gray-World White Balance Function
# -------------------------------
def gray_world_correction(pil_image):
    """Apply Gray-World white balance correction to an image.

    Each RGB channel is multiplied by ``mean_gray / channel_mean``, where
    ``mean_gray`` is the average of the three channel means, so that all
    channels end up with the same mean. Results are clipped to [0, 255] and
    truncated to ``uint8``.

    A channel whose mean is zero contains only zeros and cannot be rescaled;
    it is left unchanged instead of being divided by zero (which would yield
    inf/NaN values and an undefined ``uint8`` cast).

    Args:
        pil_image: PIL image. Converted to RGB first when its mode is not
            ``'RGB'``.

    Returns:
        A new image created with ``Image.fromarray`` from the corrected array.
    """
    if pil_image.mode != 'RGB':
        pil_image = pil_image.convert('RGB')
    # astype() copies, so the caller's pixel data is never modified.
    arr = np.asarray(pil_image).astype(np.float32)

    channel_means = [arr[:, :, ch].mean() for ch in range(3)]
    mean_gray = (channel_means[0] + channel_means[1] + channel_means[2]) / 3.0

    for ch, channel_mean in enumerate(channel_means):
        if channel_mean <= 0:
            # All-zero channel: nothing to scale.
            continue
        arr[:, :, ch] = np.clip(arr[:, :, ch] * (mean_gray / channel_mean), 0, 255)

    return Image.fromarray(arr.astype(np.uint8))

# -------------------------------
# 2️⃣ Perceptual Hashing (aHash + dHash)
# -------------------------------
def ahash(image, hash_size=8):
    """Average hash (aHash) of an image, as a string of '0'/'1' characters.

    The image is converted to grayscale and resized to
    ``hash_size x hash_size`` with Lanczos resampling. Each pixel contributes
    '1' when it is strictly greater than the mean of the resized image and
    '0' otherwise, in row-major order.
    """
    thumbnail = image.convert('L').resize((hash_size, hash_size), Image.Resampling.LANCZOS)
    pixels = np.array(thumbnail)
    above_average = pixels.flatten() > pixels.mean()
    return ''.join('1' if bit else '0' for bit in above_average)

def dhash(image, hash_size=8):
    """Difference hash (dHash) of an image, as a string of '0'/'1' characters.

    The image is converted to grayscale and resized to ``hash_size + 1``
    columns by ``hash_size`` rows with Lanczos resampling. Each bit is '1'
    when a pixel is strictly brighter than its left-hand neighbour, read in
    row-major order.
    """
    resized = image.convert('L').resize((hash_size + 1, hash_size), Image.Resampling.LANCZOS)
    pixels = np.array(resized)
    left, right = pixels[:, :-1], pixels[:, 1:]
    return ''.join('1' if brighter else '0' for brighter in (right > left).flatten())

def hamming_distance(hash1, hash2):
    """Calculate the Hamming distance between two binary hash strings.

    Args:
        hash1: First hash, a sequence of characters (e.g. '0'/'1').
        hash2: Second hash; must have the same length as ``hash1``.

    Returns:
        The number of positions at which the two hashes differ.

    Raises:
        ValueError: If the hashes differ in length. ``zip`` would otherwise
            silently ignore the surplus characters and under-report the
            distance.
    """
    if len(hash1) != len(hash2):
        raise ValueError(
            f"Hash lengths differ ({len(hash1)} vs {len(hash2)}); "
            "Hamming distance is undefined."
        )
    return sum(c1 != c2 for c1, c2 in zip(hash1, hash2))

# -------------------------------
# 3️⃣ Gather every image path under IMAGE_FOLDER
# -------------------------------
# Recursive glob, keeping only names that end (case-insensitively) in one of
# the accepted image extensions.
image_paths = [
    candidate
    for candidate in glob.glob(os.path.join(IMAGE_FOLDER, '**', '*.*'), recursive=True)
    if candidate.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp'))
]

print(f"🔍 Total image paths found: {len(image_paths)}")
if image_paths:
    print("First few image paths:", image_paths[:3])

# -------------------------------
# 4️⃣ Process Each Image
# -------------------------------
# NOTE(review): this loop used to run gray_world_correction() on every image
# and discard the result. The hashes have always been computed from the
# uncorrected image, so the unused call was dropped. If the hashes were meant
# to be taken after white balancing, pass the corrected image to ahash/dhash.
rows = []
for img_path in tqdm(image_paths, desc="Processing images"):
    try:
        # The context manager closes the file handle once the hashes exist,
        # so a large dataset does not accumulate open files.
        with Image.open(img_path) as img:
            row = {
                "filename": os.path.basename(img_path),
                "path": img_path,
                "width": img.width,
                "height": img.height,
                "ahash": ahash(img),
                "dhash": dhash(img)
            }
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"⚠️ Error processing {img_path}: {e}")
    else:
        rows.append(row)

print("✅ Number of rows collected:", len(rows))
if len(rows) > 0:
    print("Sample keys:", list(rows[0].keys()))

# -------------------------------
# 5️⃣ Create DataFrame (Safe)
# -------------------------------
if not rows:
    print("❌ No valid image data found — check your IMAGE_FOLDER path or image formats.")
else:
    # Every row is built with a "filename" key, so sorting on it is always valid.
    df = pd.DataFrame(rows).sort_values("filename").reset_index(drop=True)
    print("\n✅ === Image Analysis Results ===")
    display(df.head())

    # -------------------------------
    # 6️⃣ Duplicate Detection
    # -------------------------------
    print("\n🔁 Detecting duplicates (this may take some time)...")

    threshold = 5  # max hamming distance (per hash) to consider duplicate

    # Pack each bit string into an int once. The pairwise loop is still
    # O(n^2), but every comparison becomes an XOR plus a bit count instead of
    # a character-by-character walk over Series rows produced by iterrows().
    fingerprints = [
        (name, path, int(a_bits, 2), int(d_bits, 2))
        for name, path, a_bits, d_bits in zip(
            df["filename"], df["path"], df["ahash"], df["dhash"]
        )
    ]

    duplicates = []
    for (name1, path1, a1, d1), (name2, path2, a2, d2) in itertools.combinations(fingerprints, 2):
        hd_ahash = bin(a1 ^ a2).count("1")
        hd_dhash = bin(d1 ^ d2).count("1")
        # Both hashes must agree before a pair is reported.
        if hd_ahash <= threshold and hd_dhash <= threshold:
            duplicates.append({
                "file1": name1,
                "file2": name2,
                # Full paths disambiguate identical basenames that live in
                # different class folders.
                "path1": path1,
                "path2": path2,
                "ahash_dist": hd_ahash,
                "dhash_dist": hd_dhash
            })

    dup_df = pd.DataFrame(duplicates)
    print("\n✅ === Duplicate Image Report ===")
    if not dup_df.empty:
        display(dup_df)
    else:
        print("No duplicates detected ✅")

    # Only claim success when there was something to analyse.
    print("\n🎯 Analysis completed successfully!")


In [None]:
# ===============================
# STEP 1: Set dataset path & verify
# ===============================
import os

# Re-declares the same Drive location that is assigned to data_dir near the
# top of the notebook.
data_dir = '/content/drive/MyDrive/Original_Data'  #  Adjust if needed

print("Dataset path:", data_dir)
print("Exists?", os.path.exists(data_dir))
# os.listdir raises FileNotFoundError when data_dir does not exist.
print("Subfolders:", os.listdir(data_dir))

# ===============================
# STEP 2: Recursively scan for image files & build DataFrame
# ===============================
import pandas as pd
import glob

# Lower-case extensions; paths are lower-cased before matching so that
# '.JPG', '.Jpg', '.PNG', ... are all accepted. '.bmp' is included to match
# the extension set used by the other scanning cells of this notebook.
valid_ext = ('.jpg', '.jpeg', '.png', '.bmp')

# Recursively collect file paths. Sorting makes the row order independent of
# the filesystem's listing order, which the seeded GroupShuffleSplit below
# needs in order to produce the same split on every run.
file_paths = sorted(glob.glob(os.path.join(data_dir, '**', '*.*'), recursive=True))
print("Total files found recursively:", len(file_paths))

rows = []
for fpath in file_paths:
    if not fpath.lower().endswith(valid_ext):
        continue
    # Label = parent folder name (e.g., 'infacted', 'not_infacted')
    label = os.path.basename(os.path.dirname(fpath))
    # Group ID = text before the first underscore, or before the first dot
    # when the file name contains no underscore
    fname = os.path.basename(fpath)
    group_id = fname.split("_")[0] if "_" in fname else fname.split(".")[0]
    rows.append([fpath, label, group_id])

df = pd.DataFrame(rows, columns=["filepath", "label", "group"])
print("\n Total samples loaded:", len(df))
print(df.head())

if len(df) == 0:
    raise ValueError(" No image files found. Check dataset structure or path.")

# ===============================
# STEP 3: Class distribution check
# ===============================
from collections import Counter

# Tally how many rows carry each value of the 'label' column; labels are
# printed in the order in which they first appear in df.
class_counts = Counter(df['label'])
print("\n Class distribution:")
for cls, count in class_counts.items():
    print(f"{cls}: {count}")

# ===============================
# STEP 4: Grouped train-test split (to avoid leakage)
# ===============================
from sklearn.model_selection import GroupShuffleSplit

# A single seeded split (n_splits=1, random_state=42) driven by the 'group'
# column, so rows sharing a group id are not spread across train and test.
# NOTE(review): the split is by group only; class balance of the two parts is
# not checked in this cell.
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
# split() returns a generator; next() takes its only (train, test) index pair.
train_idx, test_idx = next(gss.split(df['filepath'], df['label'], groups=df['group']))

# Positional indices -> fresh 0..n-1 indexed frames.
train_df = df.iloc[train_idx].reset_index(drop=True)
test_df = df.iloc[test_idx].reset_index(drop=True)

print("\n Split summary:")
print("Train size:", len(train_df))
print("Test size:", len(test_df))

# ===============================
# STEP 5: Augmentation Probe (Safe vs Harmful)
# ===============================
import torch
import torchvision.transforms as T
from PIL import Image
import matplotlib.pyplot as plt

# The crop, colour-jitter and rotation transforms below draw their parameters
# from torch's global RNG; seeding it makes the probe figure reproducible.
torch.manual_seed(42)

# Pick a random training image (seeded, so the same one every run)
sample_path = train_df['filepath'].sample(1, random_state=42).iloc[0]
# Load eagerly as RGB and release the file handle; RGB also gives every
# transform and imshow the same three-channel input regardless of file mode.
with Image.open(sample_path) as opened:
    img = opened.convert("RGB")

# NOTE(review): size=244 looks like it may be a typo for the common 224 input
# size; left unchanged because it only affects this visual probe. Confirm.
augmentations = {
    "RandomCrop": T.RandomResizedCrop(size=244, scale=(0.8, 1.0)),
    "HorizontalFlip": T.RandomHorizontalFlip(p=1.0),
    "ColorJitter": T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    "GaussianBlur": T.GaussianBlur(kernel_size=5),
    "HeavyRotation": T.RandomRotation(degrees=90),  # potential harmful
}

# One panel for the original plus one per augmentation.
fig, axes = plt.subplots(1, len(augmentations)+1, figsize=(18,5))
axes[0].imshow(img)
axes[0].set_title("Original")
axes[0].axis("off")

for i, (name, aug) in enumerate(augmentations.items(), 1):
    transformed = aug(img)
    axes[i].imshow(transformed)
    axes[i].set_title(name)
    axes[i].axis("off")

plt.tight_layout()
plt.show()

print("\n Augmentation probe complete — visually inspect which transforms help or hurt.")
