In [1]:
"""
CORRECTED VERSION: Power Profile and Thresholding Assisted Multi-Label NILM Classification
Paper Replication with Dynamic Dataset Adaptation

Author: Research Replication Team
Date: 2024

This code correctly replicates the methodology from the paper:
"Power Profile and Thresholding Assisted Multi-Label NILM Classification"

Dataset path: /kaggle/input/redd-dataset/redd

CORRECTIONS APPLIED:
1. Dynamic dataset discovery instead of hardcoded channel mapping
2. House 2 support with actual appliance data
3. Proper power windowing based on actual data
4. Correct OPM thresholding implementation
"""

import os
import sys
import numpy as np
import pandas as pd
import glob
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
import joblib

# Set paths
DATA_PATH = "/kaggle/input/redd-dataset/redd"
OUTPUT_PATH = "/kaggle/working/"

# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)

# ===================== CORRECTED CONFIGURATION =====================

class Config:
    """Dynamic configuration based on actual dataset"""
    
    HOUSE_NUMBER = 2  # Using House 2 as it has actual appliance data
    
    # Paper's power windows for reference (from Table 3)
    PAPER_POWER_WINDOWS = {
        'dish_washer': (30, 1200),
        'electric_stove': (1000, 1500),
        'fridge': (175, 500),
        'microwave': (20, 1650),
        'washer_dryer': (250, 700),
        'refrigerator': (175, 500),  # Same as fridge
        'stove': (1000, 1500),  # Same as electric_stove
        'kitchen_outlet': (10, 150),
        'lighting': (20, 400),
        'bathroom_gfi': (1500, 1700),
        'electric_heater': (1, 21),
    }
    
    # Threshold values (OPM) as in paper
    THRESHOLDS = [5, 10, 20, 30, 40, 50]
    
    # Experimental scenarios
    WINDOWING_SCENARIOS = [True, False]
    
    # Training parameters
    RANDOM_STATE = 42
    TEST_SIZE = 0.2
    
    # Use all samples for OPM calculation (critical for paper methodology)
    SAMPLE_SIZE = None  # None means use all data

# ===================== CORRECTED DATA LOADER =====================

class REDDDataLoader:
    """Loads and preprocesses REDD dataset with dynamic appliance discovery"""
    
    def __init__(self, data_path, house_number=2):
        self.data_path = data_path
        self.house_number = house_number
        
        # Discover actual appliances in dataset
        self.appliance_names = self.discover_appliances()
        self.power_windows = self.set_power_windows()
        
        print(f"Discovered {len(self.appliance_names)} appliances: {self.appliance_names}")
    
    def discover_appliances(self):
        """Discover actual appliances in the dataset files"""
        pattern = os.path.join(self.data_path, f"*house{self.house_number}_*.csv")
        csv_files = glob.glob(pattern)
        
        if not csv_files:
            raise FileNotFoundError(f"No CSV files found for house {self.house_number}")
        
        # Load first file to discover structure
        df_sample = pd.read_csv(csv_files[0])
        
        # Find appliance columns (not 'main' and not unnamed)
        appliance_cols = []
        for col in df_sample.columns:
            col_lower = str(col).lower().strip()
            if col_lower != 'main' and not col_lower.startswith('unnamed'):
                # Clean column name
                clean_name = col_lower.replace('#', '').replace(' ', '_').strip()
                if clean_name and clean_name != 'main':
                    appliance_cols.append(clean_name)
        
        # Remove duplicates and sort
        appliance_cols = sorted(list(set(appliance_cols)))
        
        return appliance_cols
    
    def set_power_windows(self):
        """Set power windows based on paper's values or data statistics"""
        windows = {}
        
        # Load sample data to get power statistics
        pattern = os.path.join(self.data_path, f"*house{self.house_number}_0.csv")
        csv_files = glob.glob(pattern)
        
        if csv_files:
            df_sample = pd.read_csv(csv_files[0])
            
            for appliance in self.appliance_names:
                # Try to find the column in the dataframe
                matching_cols = [col for col in df_sample.columns 
                               if str(col).lower().replace('#', '').replace(' ', '_').strip() == appliance]
                
                if matching_cols:
                    col_name = matching_cols[0]
                    power_data = df_sample[col_name]
                    
                    # Use paper's window if available, otherwise use data statistics
                    if appliance in Config.PAPER_POWER_WINDOWS:
                        windows[appliance] = Config.PAPER_POWER_WINDOWS[appliance]
                    else:
                        # Calculate from data: use 5th and 95th percentiles to remove outliers
                        min_power = max(0, power_data.quantile(0.05))
                        max_power = power_data.quantile(0.95)
                        
                        # Ensure minimum range
                        if max_power - min_power < 10:
                            max_power = min_power + 100  # Default range
                        
                        windows[appliance] = (float(min_power), float(max_power))
        
        return windows
    
    def _standardize_column_names(self, df):
        """Standardize column names across CSV files - CORRECTED for actual REDD format"""
        new_columns = {}
        
        for col in df.columns:
            col_str = str(col)
            col_lower = col_str.lower().strip()
            
            # Clean the column name
            clean_name = col_lower.replace('#', '').replace(' ', '_').replace('__', '_').strip()
            
            # Map to standard names
            if 'main' in clean_name or col_str == 'main':
                new_columns[col] = 'main'
            elif any(appliance in clean_name for appliance in self.appliance_names):
                # Find the matching appliance
                for appliance in self.appliance_names:
                    if appliance in clean_name:
                        new_columns[col] = appliance
                        break
                else:
                    new_columns[col] = clean_name
            else:
                new_columns[col] = clean_name
        
        df = df.rename(columns=new_columns)
        
        # Ensure all appliance columns exist
        for appliance in self.appliance_names:
            if appliance not in df.columns:
                df[appliance] = 0.0
        
        return df
    
    def load_house_data(self, sample_size=None):
        """Load all CSV files for a house and merge them correctly"""
        pattern = os.path.join(self.data_path, f"*house{self.house_number}_*.csv")
        csv_files = sorted(glob.glob(pattern))
        
        if not csv_files:
            raise FileNotFoundError(f"No CSV files found for house {self.house_number}")
        
        print(f"Found {len(csv_files)} files for house {self.house_number}")
        
        dfs = []
        
        for file in tqdm(csv_files, desc="Loading CSV files"):
            try:
                df = pd.read_csv(file)
                
                # Standardize column names
                df = self._standardize_column_names(df)
                
                # Ensure we have main column
                if 'main' not in df.columns:
                    print(f"Warning: 'main' not found in {file}, skipping")
                    continue
                
                # Handle missing values - fill with 0 for appliances
                for col in df.columns:
                    if col != 'main':
                        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
                
                dfs.append(df)
                
            except Exception as e:
                print(f"Error reading {file}: {str(e)[:100]}")
                continue
        
        if not dfs:
            raise ValueError("No valid data loaded")
        
        # Combine all dataframes
        combined_df = pd.concat(dfs, ignore_index=True)
        
        # Ensure numeric types
        for col in combined_df.columns:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
        
        # Fill any remaining NaN values
        combined_df = combined_df.fillna(0)
        
        # Sample if requested
        if sample_size and len(combined_df) > sample_size:
            print(f"Note: Using {sample_size:,} samples out of {len(combined_df):,}")
            combined_df = combined_df.iloc[:sample_size]
        
        print(f"Loaded {len(combined_df):,} samples")
        print(f"Available columns: {list(combined_df.columns)}")
        
        return combined_df
    
    def apply_power_windowing(self, df, verbose=True):
        """Apply power windowing as per paper methodology - CORRECTED"""
        df_processed = df.copy()
        
        if verbose:
            print("\n" + "="*80)
            print("Applying Power Windowing (Corrected Implementation)")
            print("="*80)
        
        stats = []
        
        for appliance in self.appliance_names:
            if appliance not in df_processed.columns:
                continue
            
            power_data = df_processed[appliance]
            
            # Get power window for this appliance
            if appliance in self.power_windows:
                lower, upper = self.power_windows[appliance]
            else:
                # Default: use 5th and 95th percentiles
                lower = float(power_data.quantile(0.05))
                upper = float(power_data.quantile(0.95))
                self.power_windows[appliance] = (lower, upper)
            
            # Calculate states before windowing
            before_on = (power_data > 0).sum()
            
            # Create state column: 1 if within window, 0 otherwise
            state_series = ((power_data >= lower) & (power_data <= upper)).astype(int)
            df_processed[f"{appliance}_state"] = state_series
            
            # Store windowed power for analysis
            df_processed[f"{appliance}_windowed"] = power_data.copy()
            outside_window = ~((power_data >= lower) & (power_data <= upper))
            df_processed.loc[outside_window, f"{appliance}_windowed"] = 0
            
            after_on = state_series.sum()
            
            stats.append({
                'appliance': appliance,
                'window': (lower, upper),
                'before_on': before_on,
                'after_on': after_on,
                'changed': before_on - after_on,
                'pct_on_before': (before_on / len(df)) * 100,
                'pct_on_after': (after_on / len(df)) * 100
            })
        
        if verbose:
            print(f"\n{'Appliance':<20} {'Window (W)':<15} {'Before ON':<10} {'After ON':<10} {'Changed':<10} {'% ON Before':<12} {'% ON After':<12}")
            print("-"*90)
            
            for stat in stats:
                window_str = f"{stat['window'][0]:.0f}-{stat['window'][1]:.0f}"
                print(f"{stat['appliance']:<20} {window_str:<15} {stat['before_on']:<10,} {stat['after_on']:<10,} "
                      f"{stat['changed']:<10,} {stat['pct_on_before']:<12.1f} {stat['pct_on_after']:<12.1f}")
        
        return df_processed
    
    def create_binary_labels_no_windowing(self, df):
        """Create binary labels without power windowing (paper baseline) - CORRECTED"""
        df_labels = df.copy()
        
        for appliance in self.appliance_names:
            if appliance in df_labels.columns:
                # PAPER BASELINE: Simple ON/OFF (>0)
                df_labels[f"{appliance}_state"] = (df_labels[appliance] > 0).astype(int)
        
        return df_labels
    
    def create_multiclass_labels(self, df, threshold=5, verbose=True):
        """Create multiclass labels and apply thresholding (OPM) - CORRECTED"""
        
        # Get state columns
        state_columns = [col for col in df.columns if col.endswith('_state')]
        
        if not state_columns:
            raise ValueError("No state columns found. Run power windowing or binary labeling first.")
        
        # Sort to ensure consistent ordering
        state_columns = sorted(state_columns)
        
        # Create binary combination string (PAPER METHOD)
        df['binary_combination'] = df[state_columns].astype(str).agg(''.join, axis=1)
        
        # Apply thresholding (OPM)
        combination_counts = df['binary_combination'].value_counts()
        
        if verbose:
            print(f"\nThresholding Analysis (Threshold = {threshold}):")
            print(f"Total samples: {len(df):,}")
            print(f"Unique combinations before thresholding: {len(combination_counts)}")
        
        # Remove combinations below threshold
        valid_combinations = combination_counts[combination_counts >= threshold].index
        mask = df['binary_combination'].isin(valid_combinations)
        df_filtered = df[mask].copy()
        
        if verbose:
            print(f"Unique combinations after thresholding: {len(valid_combinations)}")
            print(f"Samples after thresholding: {len(df_filtered):,} ({len(df_filtered)/len(df)*100:.1f}%)")
            print(f"Removed samples: {len(df) - len(df_filtered):,}")
        
        # Show top combinations
        print("\nTop 10 appliance combinations:")
        top_combos = combination_counts.head(10)
        
        for combo, count in top_combos.items():
            # Map binary string to appliance names
            appliance_states = []
            for i, (app, state) in enumerate(zip(state_columns, combo)):
                app_name = app.replace('_state', '')
                appliance_states.append(f"{app_name[:3]}:{state}")
            
            state_str = "|".join(appliance_states)
            print(f"  {combo} : {state_str} : {count:7,d} occurrences")
        
        # Encode labels
        label_encoder = LabelEncoder()
        df_filtered['encoded_label'] = label_encoder.fit_transform(df_filtered['binary_combination'])
        
        # PAPER METHOD: Use only aggregate power as feature
        X = df_filtered[['main']].values.astype(np.float32)
        y = df_filtered['encoded_label'].values
        
        # Handle any NaN values
        X = np.nan_to_num(X, nan=0.0)
        
        # Get class distribution
        unique_classes, class_counts = np.unique(y, return_counts=True)
        
        if verbose:
            print(f"\nFinal dataset for ML:")
            print(f"  Features shape: {X.shape}")
            print(f"  Labels shape: {y.shape}")
            print(f"  Number of classes: {len(unique_classes)}")
            
            # Show class distribution
            print("\n  Class distribution (top 10):")
            sorted_indices = np.argsort(-class_counts)
            for i, idx in enumerate(sorted_indices[:10]):
                class_id = unique_classes[idx]
                count = class_counts[idx]
                combo = label_encoder.inverse_transform([class_id])[0]
                print(f"    Class {class_id:3d}: {count:7,d} samples - {combo}")
        
        return X, y, label_encoder, df_filtered

# ===================== MACHINE LEARNING CLASSIFIERS (UNCHANGED) =====================

class NILMClassifier:
    """Machine learning classifiers as per paper"""
    
    def __init__(self, random_state=42):
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.classifiers = {}
        self.results = {}
        self.training_times = {}
        self._initialize_classifiers()
    
    def _initialize_classifiers(self):
        """Initialize classifiers with paper parameters (Table 2)"""
        self.classifiers = {
            'CART': DecisionTreeClassifier(
                criterion='gini',
                splitter='best',
                min_samples_split=2,
                min_samples_leaf=1,
                random_state=self.random_state
            ),
            'ET': ExtraTreesClassifier(
                n_estimators=100,
                criterion='gini',
                min_samples_split=2,
                min_samples_leaf=1,
                random_state=self.random_state,
                n_jobs=-1
            ),
            'KNN': KNeighborsClassifier(
                n_neighbors=5,
                weights='uniform',
                metric='minkowski',
                n_jobs=-1
            ),
            'KNN-CB': KNeighborsClassifier(
                n_neighbors=10,
                weights='distance',
                metric='manhattan',
                n_jobs=-1
            ),
            'LDA': LinearDiscriminantAnalysis(
                solver='svd',
                shrinkage=None,
                tol=1e-4
            ),
            'NB': GaussianNB(var_smoothing=1e-9),
            'RF': RandomForestClassifier(
                n_estimators=100,
                criterion='gini',
                min_samples_split=2,
                min_samples_leaf=1,
                random_state=self.random_state,
                n_jobs=-1
            )
        }
        print(f"Initialized {len(self.classifiers)} classifiers")
    
    def train_and_evaluate(self, X_train, X_test, y_train, y_test):
        """Train and evaluate all classifiers"""
        print("\n" + "="*60)
        print("Training and Evaluating Classifiers")
        print("="*60)
        
        # Scale features (aggregate power)
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        # Ensure no NaN
        X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0)
        X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0)
        
        self.results = {}
        self.training_times = {}
        
        for name, clf in tqdm(self.classifiers.items(), desc="Training classifiers"):
            try:
                # Training
                start_time = time.time()
                clf.fit(X_train_scaled, y_train)
                train_time = time.time() - start_time
                self.training_times[name] = train_time
                
                # Predictions
                y_pred = clf.predict(X_test_scaled)
                
                # Calculate metrics (PAPER METRICS)
                results = {
                    'accuracy': accuracy_score(y_test, y_pred),
                    'precision_macro': precision_score(y_test, y_pred, average='macro', zero_division=0),
                    'recall_macro': recall_score(y_test, y_pred, average='macro', zero_division=0),
                    'f1_macro': f1_score(y_test, y_pred, average='macro', zero_division=0),
                    'precision_weighted': precision_score(y_test, y_pred, average='weighted', zero_division=0),
                    'recall_weighted': recall_score(y_test, y_pred, average='weighted', zero_division=0),
                    'f1_weighted': f1_score(y_test, y_pred, average='weighted', zero_division=0),
                    'training_time': train_time,
                    'n_classes': len(np.unique(y_train))
                }
                
                self.results[name] = results
                
                print(f"\n✓ {name}: Trained in {train_time:.2f}s")
                print(f"  Macro F1: {results['f1_macro']:.4f}")
                print(f"  Weighted F1: {results['f1_weighted']:.4f}")
                print(f"  Accuracy: {results['accuracy']:.4f}")
                
            except Exception as e:
                print(f"\n✗ Error with {name}: {str(e)[:100]}")
                self.results[name] = None
        
        return self.results

# ===================== CORRECTED EXPERIMENT PIPELINE =====================

def run_experiment_pipeline(data_path, house_number=2, threshold=10,
                           apply_windowing=True, sample_size=None):
    """
    Complete experiment pipeline for one configuration - CORRECTED
    
    Args:
        data_path: Path to REDD dataset
        house_number: House number to process
        threshold: OPM threshold value
        apply_windowing: Whether to apply power windowing
        sample_size: Number of samples to use (None for all)
    
    Returns:
        Dictionary with experiment results
    """
    print(f"\n{'#'*80}")
    print(f"EXPERIMENT: House {house_number}, Threshold={threshold}, "
          f"Windowing={'ON' if apply_windowing else 'OFF'}")
    print(f"{'#'*80}")
    
    # Step 1: Initialize loader and discover actual data
    loader = REDDDataLoader(data_path, house_number)
    
    # Step 2: Load data
    df = loader.load_house_data(sample_size=sample_size)
    
    # Step 3: Apply power windowing or binary labeling
    if apply_windowing:
        df_processed = loader.apply_power_windowing(df, verbose=True)
    else:
        df_processed = loader.create_binary_labels_no_windowing(df)
        print("\nUsing binary labeling without power windowing (paper baseline)")
    
    # Step 4: Create multiclass labels with thresholding
    try:
        X, y, label_encoder, df_filtered = loader.create_multiclass_labels(
            df_processed, threshold=threshold, verbose=True
        )
    except ValueError as e:
        print(f"\nError creating multiclass labels: {e}")
        print("Skipping this configuration...")
        return None
    
    # Step 5: Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=Config.TEST_SIZE,
        random_state=Config.RANDOM_STATE,
        stratify=y
    )
    
    print(f"\nData split:")
    print(f"  Training samples: {len(X_train):,}")
    print(f"  Testing samples: {len(X_test):,}")
    print(f"  Feature dimension: {X.shape[1]}")
    print(f"  Number of classes: {len(np.unique(y))}")
    
    # Step 6: Train and evaluate classifiers
    classifier = NILMClassifier(random_state=Config.RANDOM_STATE)
    results = classifier.train_and_evaluate(X_train, X_test, y_train, y_test)
    
    # Step 7: Plot results
    exp_name = f"House{house_number}_Th{threshold}_Windowing{apply_windowing}"
    
    # Step 8: Save results
    results_summary = {
        'config': {
            'house_number': house_number,
            'threshold': threshold,
            'windowing': apply_windowing,
            'n_samples': len(X),
            'n_classes': len(np.unique(y)),
            'n_train': len(X_train),
            'n_test': len(X_test),
            'appliances': loader.appliance_names,
            'power_windows': loader.power_windows
        },
        'results': results,
        'label_encoder': label_encoder
    }
    
    # Save results
    results_file = os.path.join(OUTPUT_PATH, f'results_{exp_name}.pkl')
    joblib.dump(results_summary, results_file)
    print(f"\nResults saved to: {results_file}")
    
    return results_summary

# ===================== DATASET ANALYSIS FUNCTIONS =====================

def analyze_dataset_structure(data_path):
    """Analyze the structure of the REDD dataset"""
    print("\n" + "="*80)
    print("REDD DATASET STRUCTURE ANALYSIS")
    print("="*80)
    
    # List all houses
    pattern = os.path.join(data_path, "*.csv")
    all_files = glob.glob(pattern)
    
    houses = set()
    for file in all_files:
        filename = os.path.basename(file)
        if 'house' in filename:
            # Extract house number
            parts = filename.split('_')
            for part in parts:
                if part.startswith('house'):
                    house_num = part.replace('house', '')
                    if house_num.isdigit():
                        houses.add(int(house_num))
    
    print(f"Houses found: {sorted(houses)}")
    
    # Analyze each house
    for house in sorted(houses)[:3]:  # Analyze first 3 houses
        print(f"\n{'='*40}")
        print(f"ANALYZING HOUSE {house}")
        print(f"{'='*40}")
        
        pattern = os.path.join(data_path, f"*house{house}_*.csv")
        house_files = sorted(glob.glob(pattern))
        
        if not house_files:
            print(f"  No files found for house {house}")
            continue
        
        print(f"  Found {len(house_files)} files")
        
        # Check first file
        first_file = house_files[0]
        df = pd.read_csv(first_file)
        
        print(f"\n  First file: {os.path.basename(first_file)}")
        print(f"  Shape: {df.shape}")
        print(f"  Columns: {list(df.columns)}")
        
        # Check for appliance data
        appliance_cols = []
        for col in df.columns:
            col_lower = str(col).lower()
            if 'main' not in col_lower and not col_lower.startswith('unnamed'):
                appliance_cols.append(col)
        
        print(f"  Appliance columns ({len(appliance_cols)}): {appliance_cols}")
        
        # Show data sample
        print(f"\n  Data sample (first 5 rows):")
        print(df.head().T)

def run_house_experiment(house_number=2, sample_size=100000):
    """Run experiment for a specific house"""
    print("\n" + "="*80)
    print(f"RUNNING EXPERIMENT FOR HOUSE {house_number}")
    print("="*80)
    
    all_results = {}
    
    # Test different configurations
    for apply_windowing in Config.WINDOWING_SCENARIOS:
        scenario_name = "With_Windowing" if apply_windowing else "Without_Windowing"
        all_results[scenario_name] = {}
        
        for threshold in Config.THRESHOLDS:
            print(f"\n{'='*60}")
            print(f"Configuration: {scenario_name}, Threshold={threshold}")
            print(f"{'='*60}")
            
            # Run experiment
            results = run_experiment_pipeline(
                data_path=DATA_PATH,
                house_number=house_number,
                threshold=threshold,
                apply_windowing=apply_windowing,
                sample_size=sample_size
            )
            
            if results:
                all_results[scenario_name][threshold] = results
    
    # Generate summary
    print("\n" + "#"*80)
    print("EXPERIMENT SUMMARY")
    print("#"*80)
    
    for scenario in all_results:
        print(f"\n{scenario}:")
        print("-"*40)
        
        for threshold, result in all_results[scenario].items():
            if result and 'results' in result:
                # Find best classifier by macro F1
                best_clf = None
                best_f1 = 0
                
                if result['results']:
                    for clf_name, clf_results in result['results'].items():
                        if clf_results and clf_results['f1_macro'] > best_f1:
                            best_f1 = clf_results['f1_macro']
                            best_clf = clf_name
                
                if best_clf:
                    print(f"  Threshold {threshold:2d}: "
                          f"Best = {best_clf:8s}, "
                          f"Macro F1 = {best_f1:.4f}, "
                          f"Classes = {result['config']['n_classes']:3d}, "
                          f"Samples = {result['config']['n_samples']:,}")
    
    # Save all results
    all_results_file = os.path.join(OUTPUT_PATH, f'all_results_house{house_number}.pkl')
    joblib.dump(all_results, all_results_file)
    print(f"\nAll results saved to: {all_results_file}")
    
    return all_results

# ===================== MAIN EXECUTION =====================

if __name__ == "__main__":
    print("="*80)
    print("NILM CLASSIFICATION - CORRECTED REPLICATION")
    print("="*80)
    
    # Step 1: Analyze dataset structure
    analyze_dataset_structure(DATA_PATH)
    
    # Step 2: Run experiment for House 2
    results = run_house_experiment(house_number=2, sample_size=100000)
    
    print(f"\nOutput files saved to: {OUTPUT_PATH}")
    print("\nGenerated files:")
    
    for file in os.listdir(OUTPUT_PATH):
        if file.endswith(('.png', '.pkl', '.csv')):
            print(f"  {file}")
    
    print("\n" + "="*80)
    print("EXPERIMENT COMPLETED SUCCESSFULLY!")
    print("="*80)

NILM CLASSIFICATION - CORRECTED REPLICATION

REDD DATASET STRUCTURE ANALYSIS
Houses found: [1, 2, 3, 4, 5, 6]

ANALYZING HOUSE 1
  Found 11 files

  First file: redd_house1_0.csv
  Shape: (23302, 8)
  Columns: ['Unnamed: 0', 'dish washer', 'electric space heater', 'electric stove', 'fridge', 'microwave', 'washer dryer', 'main']
  Appliance columns (6): ['dish washer', 'electric space heater', 'electric stove', 'fridge', 'microwave', 'washer dryer']

  Data sample (first 5 rows):
                                0          1          2          3          4
Unnamed: 0               0.000000   1.000000   2.000000   3.000000   4.000000
dish washer              0.000000   0.000000   0.000000   0.000000   0.000000
electric space heater    0.000000   0.000000   0.000000   0.000000   0.000000
electric stove           0.000000   0.000000   0.000000   0.000000   0.000000
fridge                   6.000000   6.000000   6.000000   6.000000   6.000000
microwave                4.000000   4.000000   4

Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 23.68it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Applying Power Windowing (Corrected Implementation)

Appliance            Window (W)      Before ON  After ON   Changed    % ON Before  % ON After  
------------------------------------------------------------------------------------------
dish_washer          30-1200         9,898      1,353      8,545      9.9          1.4         
electric_stove       1000-1500       48,019     0          48,019     48.0         0.0         
fridge               175-500         99,997     3,864      96,133     100.0        3.9         
microwave            20-1650         100,000    15,980     84,020     100.0        16.0        
washer_dryer         250-700         98,684     0          98,684     98.7         0.0         
waste_disposal_unit  0-100           1,735      99,994     -98,259    1.7   

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  5.78it/s]


✓ CART: Trained in 0.11s
  Macro F1: 0.5539
  Weighted F1: 0.8807
  Accuracy: 0.8851


Training classifiers:  29%|██▊       | 2/7 [00:01<00:04,  1.04it/s]


✓ ET: Trained in 1.27s
  Macro F1: 0.5563
  Weighted F1: 0.8816
  Accuracy: 0.8857


Training classifiers:  57%|█████▋    | 4/7 [00:02<00:01,  1.60it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.5585
  Weighted F1: 0.8861
  Accuracy: 0.8936

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.5504
  Weighted F1: 0.8869
  Accuracy: 0.8919


Training classifiers:  71%|███████▏  | 5/7 [00:02<00:00,  2.27it/s]


✓ LDA: Trained in 0.04s
  Macro F1: 0.1592
  Weighted F1: 0.7024
  Accuracy: 0.7815

✓ NB: Trained in 0.01s
  Macro F1: 0.1603
  Weighted F1: 0.7027
  Accuracy: 0.7815


Training classifiers: 100%|██████████| 7/7 [00:06<00:00,  1.06it/s]



✓ RF: Trained in 3.47s
  Macro F1: 0.5576
  Weighted F1: 0.8817
  Accuracy: 0.8842

Results saved to: /kaggle/working/results_House2_Th5_WindowingTrue.pkl

Configuration: With_Windowing, Threshold=10

################################################################################
EXPERIMENT: House 2, Threshold=10, Windowing=ON
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 39.62it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Applying Power Windowing (Corrected Implementation)

Appliance            Window (W)      Before ON  After ON   Changed    % ON Before  % ON After  
------------------------------------------------------------------------------------------
dish_washer          30-1200         9,898      1,353      8,545      9.9          1.4         
electric_stove       1000-1500       48,019     0          48,019     48.0         0.0         
fridge               175-500         99,997     3,864      96,133     100.0        3.9         
microwave            20-1650         100,000    15,980     84,020     100.0        16.0        
washer_dryer         250-700         98,684     0          98,684     98.7         0.0         
waste_disposal_unit  0-100           1,735      99,994     -98,259    1.7   

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  5.48it/s]


✓ CART: Trained in 0.12s
  Macro F1: 0.6456
  Weighted F1: 0.8804
  Accuracy: 0.8847


Training classifiers:  29%|██▊       | 2/7 [00:01<00:04,  1.06it/s]


✓ ET: Trained in 1.26s
  Macro F1: 0.6477
  Weighted F1: 0.8813
  Accuracy: 0.8855


Training classifiers:  57%|█████▋    | 4/7 [00:02<00:01,  1.61it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.6525
  Weighted F1: 0.8867
  Accuracy: 0.8941

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.6416
  Weighted F1: 0.8865
  Accuracy: 0.8913


Training classifiers:  86%|████████▌ | 6/7 [00:02<00:00,  3.02it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.1854
  Weighted F1: 0.7023
  Accuracy: 0.7813

✓ NB: Trained in 0.01s
  Macro F1: 0.1868
  Weighted F1: 0.7026
  Accuracy: 0.7813


Training classifiers: 100%|██████████| 7/7 [00:06<00:00,  1.07it/s]



✓ RF: Trained in 3.51s
  Macro F1: 0.6474
  Weighted F1: 0.8808
  Accuracy: 0.8834

Results saved to: /kaggle/working/results_House2_Th10_WindowingTrue.pkl

Configuration: With_Windowing, Threshold=20

################################################################################
EXPERIMENT: House 2, Threshold=20, Windowing=ON
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 40.00it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Applying Power Windowing (Corrected Implementation)

Appliance            Window (W)      Before ON  After ON   Changed    % ON Before  % ON After  
------------------------------------------------------------------------------------------
dish_washer          30-1200         9,898      1,353      8,545      9.9          1.4         
electric_stove       1000-1500       48,019     0          48,019     48.0         0.0         
fridge               175-500         99,997     3,864      96,133     100.0        3.9         
microwave            20-1650         100,000    15,980     84,020     100.0        16.0        
washer_dryer         250-700         98,684     0          98,684     98.7         0.0         
waste_disposal_unit  0-100           1,735      99,994     -98,259    1.7   

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  5.82it/s]


✓ CART: Trained in 0.11s
  Macro F1: 0.6456
  Weighted F1: 0.8804
  Accuracy: 0.8847


Training classifiers:  29%|██▊       | 2/7 [00:01<00:04,  1.06it/s]


✓ ET: Trained in 1.26s
  Macro F1: 0.6477
  Weighted F1: 0.8813
  Accuracy: 0.8855


Training classifiers:  57%|█████▋    | 4/7 [00:02<00:01,  1.62it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.6525
  Weighted F1: 0.8867
  Accuracy: 0.8941

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.6416
  Weighted F1: 0.8865
  Accuracy: 0.8913


Training classifiers:  86%|████████▌ | 6/7 [00:02<00:00,  3.03it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.1854
  Weighted F1: 0.7023
  Accuracy: 0.7813

✓ NB: Trained in 0.01s
  Macro F1: 0.1868
  Weighted F1: 0.7026
  Accuracy: 0.7813


Training classifiers: 100%|██████████| 7/7 [00:06<00:00,  1.08it/s]



✓ RF: Trained in 3.46s
  Macro F1: 0.6474
  Weighted F1: 0.8808
  Accuracy: 0.8834

Results saved to: /kaggle/working/results_House2_Th20_WindowingTrue.pkl

Configuration: With_Windowing, Threshold=30

################################################################################
EXPERIMENT: House 2, Threshold=30, Windowing=ON
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 38.10it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Applying Power Windowing (Corrected Implementation)

Appliance            Window (W)      Before ON  After ON   Changed    % ON Before  % ON After  
------------------------------------------------------------------------------------------
dish_washer          30-1200         9,898      1,353      8,545      9.9          1.4         
electric_stove       1000-1500       48,019     0          48,019     48.0         0.0         
fridge               175-500         99,997     3,864      96,133     100.0        3.9         
microwave            20-1650         100,000    15,980     84,020     100.0        16.0        
washer_dryer         250-700         98,684     0          98,684     98.7         0.0         
waste_disposal_unit  0-100           1,735      99,994     -98,259    1.7   

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  5.93it/s]


✓ CART: Trained in 0.11s
  Macro F1: 0.6427
  Weighted F1: 0.8806
  Accuracy: 0.8849


Training classifiers:  29%|██▊       | 2/7 [00:01<00:04,  1.07it/s]


✓ ET: Trained in 1.26s
  Macro F1: 0.6461
  Weighted F1: 0.8812
  Accuracy: 0.8853


Training classifiers:  57%|█████▋    | 4/7 [00:02<00:01,  1.62it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.6504
  Weighted F1: 0.8865
  Accuracy: 0.8938

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.6641
  Weighted F1: 0.8870
  Accuracy: 0.8918


Training classifiers:  86%|████████▌ | 6/7 [00:02<00:00,  3.05it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.2200
  Weighted F1: 0.7023
  Accuracy: 0.7816

✓ NB: Trained in 0.01s
  Macro F1: 0.2244
  Weighted F1: 0.7028
  Accuracy: 0.7815


Training classifiers: 100%|██████████| 7/7 [00:06<00:00,  1.08it/s]



✓ RF: Trained in 3.46s
  Macro F1: 0.6456
  Weighted F1: 0.8810
  Accuracy: 0.8836

Results saved to: /kaggle/working/results_House2_Th30_WindowingTrue.pkl

Configuration: With_Windowing, Threshold=40

################################################################################
EXPERIMENT: House 2, Threshold=40, Windowing=ON
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 40.78it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Applying Power Windowing (Corrected Implementation)

Appliance            Window (W)      Before ON  After ON   Changed    % ON Before  % ON After  
------------------------------------------------------------------------------------------
dish_washer          30-1200         9,898      1,353      8,545      9.9          1.4         
electric_stove       1000-1500       48,019     0          48,019     48.0         0.0         
fridge               175-500         99,997     3,864      96,133     100.0        3.9         
microwave            20-1650         100,000    15,980     84,020     100.0        16.0        
washer_dryer         250-700         98,684     0          98,684     98.7         0.0         
waste_disposal_unit  0-100           1,735      99,994     -98,259    1.7   

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  5.81it/s]


✓ CART: Trained in 0.11s
  Macro F1: 0.6427
  Weighted F1: 0.8806
  Accuracy: 0.8849


Training classifiers:  29%|██▊       | 2/7 [00:01<00:04,  1.06it/s]


✓ ET: Trained in 1.26s
  Macro F1: 0.6461
  Weighted F1: 0.8812
  Accuracy: 0.8853


Training classifiers:  57%|█████▋    | 4/7 [00:02<00:01,  1.60it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.6504
  Weighted F1: 0.8865
  Accuracy: 0.8938

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.6641
  Weighted F1: 0.8870
  Accuracy: 0.8918


Training classifiers:  86%|████████▌ | 6/7 [00:02<00:00,  3.01it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.2200
  Weighted F1: 0.7023
  Accuracy: 0.7816

✓ NB: Trained in 0.01s
  Macro F1: 0.2244
  Weighted F1: 0.7028
  Accuracy: 0.7815


Training classifiers: 100%|██████████| 7/7 [00:06<00:00,  1.07it/s]



✓ RF: Trained in 3.45s
  Macro F1: 0.6456
  Weighted F1: 0.8810
  Accuracy: 0.8836

Results saved to: /kaggle/working/results_House2_Th40_WindowingTrue.pkl

Configuration: With_Windowing, Threshold=50

################################################################################
EXPERIMENT: House 2, Threshold=50, Windowing=ON
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 38.82it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Applying Power Windowing (Corrected Implementation)

Appliance            Window (W)      Before ON  After ON   Changed    % ON Before  % ON After  
------------------------------------------------------------------------------------------
dish_washer          30-1200         9,898      1,353      8,545      9.9          1.4         
electric_stove       1000-1500       48,019     0          48,019     48.0         0.0         
fridge               175-500         99,997     3,864      96,133     100.0        3.9         
microwave            20-1650         100,000    15,980     84,020     100.0        16.0        
washer_dryer         250-700         98,684     0          98,684     98.7         0.0         
waste_disposal_unit  0-100           1,735      99,994     -98,259    1.7   

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  5.93it/s]


✓ CART: Trained in 0.11s
  Macro F1: 0.6427
  Weighted F1: 0.8806
  Accuracy: 0.8849


Training classifiers:  29%|██▊       | 2/7 [00:01<00:04,  1.09it/s]


✓ ET: Trained in 1.22s
  Macro F1: 0.6461
  Weighted F1: 0.8812
  Accuracy: 0.8853


Training classifiers:  57%|█████▋    | 4/7 [00:02<00:01,  1.69it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.6504
  Weighted F1: 0.8865
  Accuracy: 0.8938

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.6641
  Weighted F1: 0.8870
  Accuracy: 0.8918


Training classifiers:  86%|████████▌ | 6/7 [00:02<00:00,  3.16it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.2200
  Weighted F1: 0.7023
  Accuracy: 0.7816

✓ NB: Trained in 0.01s
  Macro F1: 0.2244
  Weighted F1: 0.7028
  Accuracy: 0.7815


Training classifiers: 100%|██████████| 7/7 [00:06<00:00,  1.09it/s]



✓ RF: Trained in 3.51s
  Macro F1: 0.6456
  Weighted F1: 0.8810
  Accuracy: 0.8836

Results saved to: /kaggle/working/results_House2_Th50_WindowingTrue.pkl

Configuration: Without_Windowing, Threshold=5

################################################################################
EXPERIMENT: House 2, Threshold=5, Windowing=OFF
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 39.22it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Using binary labeling without power windowing (paper baseline)

Thresholding Analysis (Threshold = 5):
Total samples: 100,000
Unique combinations before thresholding: 14
Unique combinations after thresholding: 10
Samples after thresholding: 99,992 (100.0%)
Removed samples: 8

Top 10 appliance combinations:
  001110 : dis:0|ele:0|fri:1|mic:1|was:1|was:0 :  45,680 occurrences
  011110 : dis:0|ele:1|fri:1|mic:1|was:1|was:0 :  43,169 occurrences
  111110 : dis:1|ele:1|fri:1|mic:1|was:1|was:0 :   4,080 occurrences
  101110 : dis:1|ele:0|fri:1|mic:1|was:1|was:0 :   4,018 occurrences
  101111 : dis:1|ele:0|fri:1|mic:1|was:1|was:1 :   1,200 occurrences
  001100 : dis:0|ele:0|fri:1|mic:1|was:0|was:0 :   1,029 occurrences
  111111 : dis:1|ele:1|fri:1|mic:1|was:1|was:1 :     530 occurrences
  011

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  4.88it/s]


✓ CART: Trained in 0.15s
  Macro F1: 0.3359
  Weighted F1: 0.7336
  Accuracy: 0.7503


Training classifiers:  29%|██▊       | 2/7 [00:02<00:06,  1.24s/it]


✓ ET: Trained in 1.68s
  Macro F1: 0.3582
  Weighted F1: 0.7350
  Accuracy: 0.7517


Training classifiers:  57%|█████▋    | 4/7 [00:03<00:02,  1.40it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.3124
  Weighted F1: 0.7257
  Accuracy: 0.7504

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.3763
  Weighted F1: 0.7424
  Accuracy: 0.7598


Training classifiers:  86%|████████▌ | 6/7 [00:03<00:00,  2.70it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.1902
  Weighted F1: 0.7125
  Accuracy: 0.7477

✓ NB: Trained in 0.01s
  Macro F1: 0.1888
  Weighted F1: 0.7119
  Accuracy: 0.7465


Training classifiers: 100%|██████████| 7/7 [00:08<00:00,  1.16s/it]



✓ RF: Trained in 4.49s
  Macro F1: 0.3373
  Weighted F1: 0.7347
  Accuracy: 0.7507

Results saved to: /kaggle/working/results_House2_Th5_WindowingFalse.pkl

Configuration: Without_Windowing, Threshold=10

################################################################################
EXPERIMENT: House 2, Threshold=10, Windowing=OFF
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 39.53it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Using binary labeling without power windowing (paper baseline)

Thresholding Analysis (Threshold = 10):
Total samples: 100,000
Unique combinations before thresholding: 14
Unique combinations after thresholding: 10
Samples after thresholding: 99,992 (100.0%)
Removed samples: 8

Top 10 appliance combinations:
  001110 : dis:0|ele:0|fri:1|mic:1|was:1|was:0 :  45,680 occurrences
  011110 : dis:0|ele:1|fri:1|mic:1|was:1|was:0 :  43,169 occurrences
  111110 : dis:1|ele:1|fri:1|mic:1|was:1|was:0 :   4,080 occurrences
  101110 : dis:1|ele:0|fri:1|mic:1|was:1|was:0 :   4,018 occurrences
  101111 : dis:1|ele:0|fri:1|mic:1|was:1|was:1 :   1,200 occurrences
  001100 : dis:0|ele:0|fri:1|mic:1|was:0|was:0 :   1,029 occurrences
  111111 : dis:1|ele:1|fri:1|mic:1|was:1|was:1 :     530 occurrences
  01

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  4.87it/s]


✓ CART: Trained in 0.15s
  Macro F1: 0.3359
  Weighted F1: 0.7336
  Accuracy: 0.7503


Training classifiers:  29%|██▊       | 2/7 [00:02<00:06,  1.25s/it]


✓ ET: Trained in 1.71s
  Macro F1: 0.3582
  Weighted F1: 0.7350
  Accuracy: 0.7517


Training classifiers:  57%|█████▋    | 4/7 [00:03<00:02,  1.39it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.3124
  Weighted F1: 0.7257
  Accuracy: 0.7504

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.3763
  Weighted F1: 0.7424
  Accuracy: 0.7598


Training classifiers:  86%|████████▌ | 6/7 [00:03<00:00,  2.69it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.1902
  Weighted F1: 0.7125
  Accuracy: 0.7477

✓ NB: Trained in 0.01s
  Macro F1: 0.1888
  Weighted F1: 0.7119
  Accuracy: 0.7465


Training classifiers: 100%|██████████| 7/7 [00:08<00:00,  1.15s/it]



✓ RF: Trained in 4.45s
  Macro F1: 0.3373
  Weighted F1: 0.7347
  Accuracy: 0.7507

Results saved to: /kaggle/working/results_House2_Th10_WindowingFalse.pkl

Configuration: Without_Windowing, Threshold=20

################################################################################
EXPERIMENT: House 2, Threshold=20, Windowing=OFF
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 39.34it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Using binary labeling without power windowing (paper baseline)

Thresholding Analysis (Threshold = 20):
Total samples: 100,000
Unique combinations before thresholding: 14
Unique combinations after thresholding: 9
Samples after thresholding: 99,974 (100.0%)
Removed samples: 26

Top 10 appliance combinations:
  001110 : dis:0|ele:0|fri:1|mic:1|was:1|was:0 :  45,680 occurrences
  011110 : dis:0|ele:1|fri:1|mic:1|was:1|was:0 :  43,169 occurrences
  111110 : dis:1|ele:1|fri:1|mic:1|was:1|was:0 :   4,080 occurrences
  101110 : dis:1|ele:0|fri:1|mic:1|was:1|was:0 :   4,018 occurrences
  101111 : dis:1|ele:0|fri:1|mic:1|was:1|was:1 :   1,200 occurrences
  001100 : dis:0|ele:0|fri:1|mic:1|was:0|was:0 :   1,029 occurrences
  111111 : dis:1|ele:1|fri:1|mic:1|was:1|was:1 :     530 occurrences
  01

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  4.52it/s]


✓ CART: Trained in 0.16s
  Macro F1: 0.3735
  Weighted F1: 0.7338
  Accuracy: 0.7503


Training classifiers:  29%|██▊       | 2/7 [00:02<00:06,  1.24s/it]


✓ ET: Trained in 1.67s
  Macro F1: 0.3789
  Weighted F1: 0.7355
  Accuracy: 0.7521


Training classifiers:  57%|█████▋    | 4/7 [00:03<00:02,  1.40it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.3434
  Weighted F1: 0.7279
  Accuracy: 0.7512

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.4222
  Weighted F1: 0.7431
  Accuracy: 0.7604


Training classifiers:  86%|████████▌ | 6/7 [00:03<00:00,  2.71it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.2113
  Weighted F1: 0.7127
  Accuracy: 0.7478

✓ NB: Trained in 0.01s
  Macro F1: 0.2098
  Weighted F1: 0.7122
  Accuracy: 0.7467


Training classifiers: 100%|██████████| 7/7 [00:07<00:00,  1.14s/it]



✓ RF: Trained in 4.41s
  Macro F1: 0.3689
  Weighted F1: 0.7339
  Accuracy: 0.7497

Results saved to: /kaggle/working/results_House2_Th20_WindowingFalse.pkl

Configuration: Without_Windowing, Threshold=30

################################################################################
EXPERIMENT: House 2, Threshold=30, Windowing=OFF
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 40.22it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Using binary labeling without power windowing (paper baseline)

Thresholding Analysis (Threshold = 30):
Total samples: 100,000
Unique combinations before thresholding: 14
Unique combinations after thresholding: 9
Samples after thresholding: 99,974 (100.0%)
Removed samples: 26

Top 10 appliance combinations:
  001110 : dis:0|ele:0|fri:1|mic:1|was:1|was:0 :  45,680 occurrences
  011110 : dis:0|ele:1|fri:1|mic:1|was:1|was:0 :  43,169 occurrences
  111110 : dis:1|ele:1|fri:1|mic:1|was:1|was:0 :   4,080 occurrences
  101110 : dis:1|ele:0|fri:1|mic:1|was:1|was:0 :   4,018 occurrences
  101111 : dis:1|ele:0|fri:1|mic:1|was:1|was:1 :   1,200 occurrences
  001100 : dis:0|ele:0|fri:1|mic:1|was:0|was:0 :   1,029 occurrences
  111111 : dis:1|ele:1|fri:1|mic:1|was:1|was:1 :     530 occurrences
  01

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  4.97it/s]


✓ CART: Trained in 0.14s
  Macro F1: 0.3735
  Weighted F1: 0.7338
  Accuracy: 0.7503


Training classifiers:  29%|██▊       | 2/7 [00:02<00:06,  1.25s/it]


✓ ET: Trained in 1.69s
  Macro F1: 0.3789
  Weighted F1: 0.7355
  Accuracy: 0.7521


Training classifiers:  57%|█████▋    | 4/7 [00:03<00:02,  1.39it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.3434
  Weighted F1: 0.7279
  Accuracy: 0.7512

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.4222
  Weighted F1: 0.7431
  Accuracy: 0.7604


Training classifiers:  86%|████████▌ | 6/7 [00:03<00:00,  2.69it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.2113
  Weighted F1: 0.7127
  Accuracy: 0.7478

✓ NB: Trained in 0.01s
  Macro F1: 0.2098
  Weighted F1: 0.7122
  Accuracy: 0.7467


Training classifiers: 100%|██████████| 7/7 [00:08<00:00,  1.15s/it]



✓ RF: Trained in 4.41s
  Macro F1: 0.3689
  Weighted F1: 0.7339
  Accuracy: 0.7497

Results saved to: /kaggle/working/results_House2_Th30_WindowingFalse.pkl

Configuration: Without_Windowing, Threshold=40

################################################################################
EXPERIMENT: House 2, Threshold=40, Windowing=OFF
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 37.88it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Using binary labeling without power windowing (paper baseline)

Thresholding Analysis (Threshold = 40):
Total samples: 100,000
Unique combinations before thresholding: 14
Unique combinations after thresholding: 9
Samples after thresholding: 99,974 (100.0%)
Removed samples: 26

Top 10 appliance combinations:
  001110 : dis:0|ele:0|fri:1|mic:1|was:1|was:0 :  45,680 occurrences
  011110 : dis:0|ele:1|fri:1|mic:1|was:1|was:0 :  43,169 occurrences
  111110 : dis:1|ele:1|fri:1|mic:1|was:1|was:0 :   4,080 occurrences
  101110 : dis:1|ele:0|fri:1|mic:1|was:1|was:0 :   4,018 occurrences
  101111 : dis:1|ele:0|fri:1|mic:1|was:1|was:1 :   1,200 occurrences
  001100 : dis:0|ele:0|fri:1|mic:1|was:0|was:0 :   1,029 occurrences
  111111 : dis:1|ele:1|fri:1|mic:1|was:1|was:1 :     530 occurrences
  01

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  4.99it/s]


✓ CART: Trained in 0.14s
  Macro F1: 0.3735
  Weighted F1: 0.7338
  Accuracy: 0.7503


Training classifiers:  29%|██▊       | 2/7 [00:02<00:06,  1.23s/it]


✓ ET: Trained in 1.64s
  Macro F1: 0.3789
  Weighted F1: 0.7355
  Accuracy: 0.7521


Training classifiers:  57%|█████▋    | 4/7 [00:03<00:02,  1.38it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.3434
  Weighted F1: 0.7279
  Accuracy: 0.7512

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.4222
  Weighted F1: 0.7431
  Accuracy: 0.7604


Training classifiers:  86%|████████▌ | 6/7 [00:03<00:00,  2.68it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.2113
  Weighted F1: 0.7127
  Accuracy: 0.7478

✓ NB: Trained in 0.01s
  Macro F1: 0.2098
  Weighted F1: 0.7122
  Accuracy: 0.7467


Training classifiers: 100%|██████████| 7/7 [00:08<00:00,  1.16s/it]



✓ RF: Trained in 4.48s
  Macro F1: 0.3689
  Weighted F1: 0.7339
  Accuracy: 0.7497

Results saved to: /kaggle/working/results_House2_Th40_WindowingFalse.pkl

Configuration: Without_Windowing, Threshold=50

################################################################################
EXPERIMENT: House 2, Threshold=50, Windowing=OFF
################################################################################
Discovered 6 appliances: ['dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit']
Found 7 files for house 2


Loading CSV files: 100%|██████████| 7/7 [00:00<00:00, 40.01it/s]


Note: Using 100,000 samples out of 292,063
Loaded 100,000 samples
Available columns: ['unnamed:_0', 'dish_washer', 'electric_stove', 'fridge', 'microwave', 'washer_dryer', 'waste_disposal_unit', 'main']

Using binary labeling without power windowing (paper baseline)

Thresholding Analysis (Threshold = 50):
Total samples: 100,000
Unique combinations before thresholding: 14
Unique combinations after thresholding: 9
Samples after thresholding: 99,974 (100.0%)
Removed samples: 26

Top 10 appliance combinations:
  001110 : dis:0|ele:0|fri:1|mic:1|was:1|was:0 :  45,680 occurrences
  011110 : dis:0|ele:1|fri:1|mic:1|was:1|was:0 :  43,169 occurrences
  111110 : dis:1|ele:1|fri:1|mic:1|was:1|was:0 :   4,080 occurrences
  101110 : dis:1|ele:0|fri:1|mic:1|was:1|was:0 :   4,018 occurrences
  101111 : dis:1|ele:0|fri:1|mic:1|was:1|was:1 :   1,200 occurrences
  001100 : dis:0|ele:0|fri:1|mic:1|was:0|was:0 :   1,029 occurrences
  111111 : dis:1|ele:1|fri:1|mic:1|was:1|was:1 :     530 occurrences
  01

Training classifiers:  14%|█▍        | 1/7 [00:00<00:01,  4.94it/s]


✓ CART: Trained in 0.14s
  Macro F1: 0.3735
  Weighted F1: 0.7338
  Accuracy: 0.7503


Training classifiers:  29%|██▊       | 2/7 [00:02<00:06,  1.20s/it]


✓ ET: Trained in 1.63s
  Macro F1: 0.3789
  Weighted F1: 0.7355
  Accuracy: 0.7521


Training classifiers:  57%|█████▋    | 4/7 [00:03<00:02,  1.42it/s]


✓ KNN: Trained in 0.03s
  Macro F1: 0.3434
  Weighted F1: 0.7279
  Accuracy: 0.7512

✓ KNN-CB: Trained in 0.03s
  Macro F1: 0.4222
  Weighted F1: 0.7431
  Accuracy: 0.7604


Training classifiers:  86%|████████▌ | 6/7 [00:03<00:00,  2.73it/s]


✓ LDA: Trained in 0.03s
  Macro F1: 0.2113
  Weighted F1: 0.7127
  Accuracy: 0.7478

✓ NB: Trained in 0.01s
  Macro F1: 0.2098
  Weighted F1: 0.7122
  Accuracy: 0.7467


Training classifiers: 100%|██████████| 7/7 [00:07<00:00,  1.13s/it]


✓ RF: Trained in 4.38s
  Macro F1: 0.3689
  Weighted F1: 0.7339
  Accuracy: 0.7497

Results saved to: /kaggle/working/results_House2_Th50_WindowingFalse.pkl

################################################################################
EXPERIMENT SUMMARY
################################################################################

With_Windowing:
----------------------------------------
  Threshold  5: Best = KNN     , Macro F1 = 0.5585, Classes =   7, Samples = 99,999
  Threshold 10: Best = KNN     , Macro F1 = 0.6525, Classes =   6, Samples = 99,994
  Threshold 20: Best = KNN     , Macro F1 = 0.6525, Classes =   6, Samples = 99,994
  Threshold 30: Best = KNN-CB  , Macro F1 = 0.6641, Classes =   5, Samples = 99,969
  Threshold 40: Best = KNN-CB  , Macro F1 = 0.6641, Classes =   5, Samples = 99,969
  Threshold 50: Best = KNN-CB  , Macro F1 = 0.6641, Classes =   5, Samples = 99,969

Without_Windowing:
----------------------------------------
  Threshold  5: Best = KNN-CB  , Macr


