In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# ---------------------------
# Data Loading & Preprocessing
# ---------------------------
# Read the table of extracted per-image features. Columns expected in the file:
# avg_r, avg_g, avg_b, file_size_kb, contrast, sobel_edge_count, center_edge_count, surround_edge_count
df = pd.read_csv("features.csv")

# Derived feature: overall brightness, i.e. the plain average of the three
# colour-channel means.
df['brightness'] = df['avg_r'].add(df['avg_g']).add(df['avg_b']).div(3)

# Hand-assigned ground truth for the first 40 images. The ranges are index
# labels and .loc slices include both endpoints:
# rows 0-19 are 'clean', rows 20-39 are 'dirty'.
for first_row, last_row, label in ((None, 19, 'clean'), (20, 39, 'dirty')):
    df.loc[first_row:last_row, 'true_label'] = label

# ---------------------------
# Derive Thresholds & Normalization Stats from Labeled Data
# ---------------------------
# Features that take part in the classification vote.
feature_cols = [
    'brightness',
    'file_size_kb',
    'contrast',
    'sobel_edge_count',
    'center_edge_count',
    'surround_edge_count'
]

# Per-feature statistics derived from the 40 labeled rows:
#   - threshold: midpoint between the clean-class mean and the dirty-class mean.
#   - std: population standard deviation over all 40 labeled rows, used to
#     normalize distances from the threshold.
#   - direction: which class a HIGH value is evidence for. 'clean' when the clean
#     mean is strictly greater than the dirty mean, otherwise 'dirty'.
#   - mean_clean / mean_dirty: the two class means themselves.
stats = {}  # feature name -> dict of the statistics listed above
labeled = df.loc[:39].copy()
# The class subsets do not depend on the feature, so build them once.
clean_rows = labeled[labeled['true_label'] == 'clean']
dirty_rows = labeled[labeled['true_label'] == 'dirty']
print("Computed thresholds and stats from labeled examples:")
for feature in feature_cols:
    mean_clean = clean_rows[feature].mean()
    mean_dirty = dirty_rows[feature].mean()
    threshold = (mean_clean + mean_dirty) / 2.0
    std_f = labeled[feature].std(ddof=0)  # ddof=0 -> population standard deviation
    # A tie between the class means falls through to 'dirty'.
    direction = 'clean' if mean_clean > mean_dirty else 'dirty'
    stats[feature] = dict(
        threshold=threshold,
        std=std_f,
        direction=direction,
        mean_clean=mean_clean,
        mean_dirty=mean_dirty,
    )
    print(f"- {feature}: threshold = {threshold:.2f}, std = {std_f:.2f}, direction: {direction}")

# ---------------------------
# Enhanced Rule-Based Classification Using Weighted Z-Scores
# ---------------------------
def classify_image_z(row, stats, delta=0.0):
    """
    Classify one image as 'clean', 'dirty' or 'uncertain' from summed z-scores.

    For every feature described in ``stats`` the distance of the row's value from
    the feature's threshold is divided by the feature's standard deviation. The
    sign is flipped for features whose direction is 'dirty', so that a positive
    contribution always counts as evidence for 'clean' and a negative one as
    evidence for 'dirty'. The contributions are summed into a single score.

    The features are taken from ``stats`` itself rather than from a module-level
    list, so the function only depends on its arguments.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by feature name.
        stats: Dict mapping feature name to a dict with at least the keys
            'threshold', 'std' and 'direction' ('clean' or 'dirty').
        delta: Margin around zero inside which the result is 'uncertain'.

    Returns:
        'clean' if the total score is greater than ``delta``, 'dirty' if it is
        less than ``-delta``, otherwise 'uncertain'.
    """
    total_score = 0.0
    for feature, feature_stats in stats.items():
        std_f = feature_stats['std']
        # A zero spread carries no information and would divide by zero, so the
        # feature contributes nothing.
        if std_f == 0:
            continue
        z = (row[feature] - feature_stats['threshold']) / std_f
        # If higher values favor dirty, reverse the sign.
        if feature_stats['direction'] == 'dirty':
            total_score -= z
        else:
            total_score += z
    # Decision based on the summed score.
    if total_score > delta:
        return 'clean'
    if total_score < -delta:
        return 'dirty'
    return 'uncertain'

# ---------------------------
# Apply the Enhanced Classifier & Evaluate
# ---------------------------
# Label every row with the z-score classifier.
# You can adjust delta (a minimal margin) if you wish to favor 'uncertain' in borderline cases.
df['auto_label'] = df.apply(lambda row: classify_image_z(row, stats, delta=0.0), axis=1)

# Evaluate performance on the hand-labeled rows (index labels 0-39).
df_eval = df.loc[:39]
# Divide by the number of rows actually evaluated rather than a fixed 40, so the
# percentage stays correct when fewer rows were loaded.
n_eval = len(df_eval)
correct_predictions = (df_eval['auto_label'] == df_eval['true_label']).sum()
accuracy = correct_predictions / n_eval * 100.0 if n_eval else float('nan')
print("\nEnhanced rule-based classifier accuracy on first 40 labeled images: "
      f"{accuracy:.2f}%\n")

# 'uncertain' is included so that abstentions show up as their own column.
print("Confusion Matrix:")
print(confusion_matrix(df_eval['true_label'], df_eval['auto_label'], 
                         labels=['clean', 'dirty', 'uncertain']))

# The report is restricted to the two real classes.
print("\nClassification Report:")
print(classification_report(df_eval['true_label'], df_eval['auto_label'], 
                            labels=['clean', 'dirty']))

# Optionally, save the updated data with the new labels.
df.to_csv("features_with_enhanced_pattern_labels.csv", index=False)
print("\nEnhanced classification complete. Results saved in 'features_with_enhanced_pattern_labels.csv'.")


Computed thresholds and stats from labeled examples:
- brightness: threshold = 116.02, std = 12.45, direction: clean
- file_size_kb: threshold = 441.76, std = 1122.96, direction: dirty
- contrast: threshold = 245.73, std = 8.42, direction: clean
- sobel_edge_count: threshold = 8520.83, std = 1877.68, direction: dirty
- center_edge_count: threshold = 907.80, std = 239.98, direction: clean
- surround_edge_count: threshold = 2193.93, std = 699.16, direction: dirty

Enhanced rule-based classifier accuracy on first 40 labeled images: 75.00%

Confusion Matrix:
[[16  4  0]
 [ 6 14  0]
 [ 0  0  0]]

Classification Report:
              precision    recall  f1-score   support

       clean       0.73      0.80      0.76        20
       dirty       0.78      0.70      0.74        20

    accuracy                           0.75        40
   macro avg       0.75      0.75      0.75        40
weighted avg       0.75      0.75      0.75        40


Enhanced classification complete. Results saved in