# Anomaly Detection Using PyOD and ADTK

In [None]:
!pip install pyod adtk

Connect your Google Drive account and copy the dataset to a folder of your choice. Make sure this folder is not already being used for anything else.
After connecting, unzip the archive and check that the dataset was extracted as expected. The final directory layout should be as follows:

> your_path/dataset/


          ->../helios
            ->../user_helios_sorted
          ->../queensland
            ->../user_sorted_pulse
            ->../user_sorted_pulsetot
          ->../datamill
              ->../user_datamill_sorted

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
## go to your path
%cd /content/drive/MyDrive/kentkart/water_meter_dataset/ ##change this part with your dataset location
%ls

In [None]:
!unzip dataset.zip

## Anomaly Detection with ADTK without Training



In [None]:
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
from adtk.data import validate_series
from adtk.detector import ThresholdAD, InterQuartileRangeAD, PersistAD, LevelShiftAD, VolatilityShiftAD

def process_datamill(file_path):
    """Load a datamill CSV and return its gross-consumption series.

    The 'READING_START_DATE' column is parsed as day/month/year hour:minute
    and becomes the index of the returned 'GROSS_CONSUMPTION' series.
    """
    frame = pd.read_csv(file_path)
    timestamps = pd.to_datetime(frame['READING_START_DATE'], format='%d/%m/%Y %H:%M')
    indexed = frame.assign(READING_START_DATE=timestamps).set_index('READING_START_DATE')
    return indexed['GROSS_CONSUMPTION']

def process_helios(file_path, option):
    """Load a semicolon-separated helios CSV and return one datetime-indexed column.

    option == 'daily' selects the 'diff' column; any other value selects
    'meter reading'.
    """
    frame = pd.read_csv(file_path, sep=';')
    frame['datetime'] = pd.to_datetime(frame['datetime'], format='%d/%m/%Y %H:%M:%S')
    indexed = frame.set_index('datetime')
    column = 'diff' if option == 'daily' else 'meter reading'
    return indexed[column]

def process_queensland(file_path, option):
    """Load a queensland CSV and return one datetime-indexed pulse column.

    option == 'pulse1' selects 'Pulse1'; any other value selects
    'Pulse1_Total'.
    """
    frame = pd.read_csv(file_path)
    frame['datetime'] = pd.to_datetime(frame['datetime'], format='%d/%m/%Y %H:%M:%S')
    indexed = frame.set_index('datetime')
    column = 'Pulse1' if option == 'pulse1' else 'Pulse1_Total'
    return indexed[column]

def detect_anomalies(s, contamination, z_score_threshold):
    """Run five ADTK detectors on a series and return their boolean flags.

    Parameters:
        s: time series; it is passed through adtk's validate_series first.
        contamination: accepted for interface parity; not used by this function.
        z_score_threshold: half-width, in standard deviations around the mean,
            of the band used by the threshold detector.

    Returns:
        dict mapping 'Threshold', 'IQR', 'Persist', 'LevelShift' and
        'VolatilityShift' to the detector output with missing flags set to False.
    """
    s = validate_series(s)

    # Threshold band: mean +/- z_score_threshold * std of the validated series.
    center = s.mean()
    half_width = z_score_threshold * s.std()
    threshold_detector = ThresholdAD(high=center + half_width, low=center - half_width)

    trainable_detectors = {
        'IQR': InterQuartileRangeAD(c=1.5),
        'Persist': PersistAD(c=3.0, side='positive'),
        'LevelShift': LevelShiftAD(c=2.0, side='both', window=5),
    }
    volatility_detector = VolatilityShiftAD(c=1.5, side='positive', window=30)

    anomalies = {'Threshold': threshold_detector.detect(s).fillna(False)}
    for label, detector in trainable_detectors.items():
        anomalies[label] = detector.fit_detect(s).fillna(False)

    # The volatility detector may raise RuntimeError; fall back to "no anomalies".
    try:
        anomalies['VolatilityShift'] = volatility_detector.fit_detect(s).fillna(False)
    except RuntimeError as e:
        print(f"VolatilityShiftAD could not be applied: {e}")
        anomalies['VolatilityShift'] = pd.Series(False, index=s.index)

    return anomalies


def plot_consensus_anomalies(s, anomalies, file_path):
    """Plot the series and mark the points every detector flagged.

    Parameters:
        s: the data series to draw.
        anomalies: dict of per-detector boolean series.
        file_path: source file; only its base name is used in the title.

    If looking up the consensus timestamps in ``s`` yields a different number
    of values than timestamps, a message is printed and nothing is plotted.
    """
    # A point is a consensus anomaly only when all detectors agree on it.
    agreed = pd.DataFrame(anomalies).all(axis=1).dropna()
    s = s.dropna()

    flagged_index = agreed[agreed].index
    flagged_values = s.loc[flagged_index]

    n_index, n_values = len(flagged_index), len(flagged_values)
    if n_index != n_values:
        print(f"Index mismatch detected. Length of indices: {n_index}, Length of values: {n_values}")
        return

    plt.figure(figsize=(12, 6))
    plt.plot(s, label='Data', color='blue')
    plt.scatter(flagged_index, flagged_values,
                label='Consensus Anomaly', marker='o', color='red', s=20)
    plt.title(f"Consensus Anomalies detected in {os.path.basename(file_path)}")
    plt.legend()
    plt.show()


def main(folder_path, dataset_type, option, contamination, z_score_threshold):
    """Run ADTK consensus anomaly detection on every CSV in a folder.

    Parameters:
        folder_path: directory searched (non-recursively) for '*.csv' files.
        dataset_type: 'datamill', 'helios' or 'queensland'; selects the loader.
        option: column selector forwarded to the helios/queensland loaders.
        contamination: forwarded to detect_anomalies.
        z_score_threshold: forwarded to detect_anomalies.

    Raises:
        ValueError: if dataset_type is not one of the supported names.
    """
    loaders = {
        'datamill': lambda path: process_datamill(path),
        'helios': lambda path: process_helios(path, option),
        'queensland': lambda path: process_queensland(path, option),
    }
    # Reject unknown types up front; otherwise the series would be unbound
    # inside the loop and fail with an unrelated NameError.
    if dataset_type not in loaders:
        raise ValueError(
            f"Unknown dataset type {dataset_type!r}; expected one of {sorted(loaders)}"
        )
    load_series = loaders[dataset_type]

    # glob returns files in arbitrary order; sort for reproducible output.
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.csv'))):
        print(f"Processing file: {file_path}")

        s = load_series(file_path)
        anomalies = detect_anomalies(s, contamination, z_score_threshold)

        total_consensus_anomalies = pd.DataFrame(anomalies).all(axis=1).sum()
        print(f"Total consensus anomalies across all models: {total_consensus_anomalies}")

        plot_consensus_anomalies(s, anomalies, file_path)

if __name__ == "__main__":
    # Interactive driver: ask for the dataset, its option and the detection
    # parameters, pick the matching folder, then run main().
    dataset_type = input("Enter dataset type (helios/queensland/datamill): ").lower()
    while dataset_type not in ('helios', 'queensland', 'datamill'):
        dataset_type = input("Invalid input. Please enter 'helios', 'queensland', or 'datamill': ").lower()

    # Only helios and queensland have a sub-option; datamill gets a placeholder.
    if dataset_type == 'datamill':
        option = 'default'
    elif dataset_type == 'helios':
        option = input("Enter option (daily/total): ").lower()
        while option not in ('daily', 'total'):
            option = input("Invalid input. Please enter 'daily' or 'total': ").lower()
    else:  # queensland
        option = input("Enter option (pulse1/pulse1_total): ").lower()
        while option not in ('pulse1', 'pulse1_total'):
            option = input("Invalid input. Please enter 'pulse1' or 'pulse1_total': ").lower()

    # NOTE(review): the prompt advertises per-dataset defaults (1 or 3), but the
    # fallback on invalid input is always 3 - confirm which is intended.
    try:
        z_score_threshold = float(input("Enter Z-score threshold (default 1 for datamill and helios, 3 for queensland): "))
    except ValueError:
        print("Invalid Z-score threshold. Using default value of 3.")
        z_score_threshold = 3

    try:
        contamination = float(input("Enter contamination factor (default 0.01): "))
    except ValueError:
        print("Invalid contamination factor. Using default value of 0.01.")
        contamination = 0.01

    # Paths are relative to the current working directory.
    if dataset_type == 'datamill':
        folder_path = './dataset/datamill/user_datamill_sorted/'
    elif dataset_type == 'helios':
        folder_path = './dataset/helios/user_helios_sorted/'
    elif option == 'pulse1':
        folder_path = './dataset/queensland/user_sorted_pulse/'
    else:
        folder_path = './dataset/queensland/user_sorted_pulsetot/'

    main(folder_path, dataset_type, option, contamination, z_score_threshold)


## Anomaly Detection with PyOD with Single-File Training

In [None]:
import os
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def process_file(file_path, dataset_type, option, contamination=0.01, models=None, z_score_threshold=3):
    """Load one CSV, build features, fit each model on it and flag validated anomalies.

    Parameters:
        file_path: CSV file to read (helios files are semicolon-separated).
        dataset_type: 'helios', 'queensland' or 'datamill'.
        option: helios: 'total' differences 'meter reading', anything else uses
            the file's own 'diff' column; queensland: 'pulse1' or 'pulse1_total';
            ignored for datamill.
        contamination: accepted for interface parity; not used here (the models
            are passed in already configured).
        models: dict of name -> detector exposing fit / decision_function /
            predict. None is treated as "no models".
        z_score_threshold: a model's anomaly only counts as validated when the
            absolute rolling z-score of 'diff' exceeds this value.

    Returns:
        (df, results): the enriched DataFrame and a dict mapping each model name
        to its number of validated anomalies.

    Raises:
        ValueError: for an unknown dataset type or queensland option, or when
            the datetime column cannot be parsed.
    """
    if models is None:
        models = {}

    # Resolve the per-dataset layout before touching the file so bad arguments
    # fail fast with a clear message.
    read_kwargs = {}
    if dataset_type == 'helios':
        time_column, time_format = 'datetime', '%d/%m/%Y %H:%M:%S'
        diff_source = 'meter reading' if option == 'total' else None
        read_kwargs['sep'] = ';'
    elif dataset_type == 'queensland':
        time_column, time_format = 'datetime', '%d/%m/%Y %H:%M:%S'
        queensland_columns = {'pulse1': 'Pulse1', 'pulse1_total': 'Pulse1_Total'}
        if option not in queensland_columns:
            raise ValueError(f"Unknown queensland option: {option!r}")
        diff_source = queensland_columns[option]
    elif dataset_type == 'datamill':
        time_column, time_format = 'READING_START_DATE', '%d/%m/%Y %H:%M'
        diff_source = 'GROSS_CONSUMPTION'
    else:
        raise ValueError(f"Unknown dataset type: {dataset_type!r}")

    df = pd.read_csv(file_path, **read_kwargs)

    try:
        df['datetime'] = pd.to_datetime(df[time_column], format=time_format)
    except ValueError as e:
        print(f"Error converting datetime: {e}")
        print(f"First few datetime values: {df[time_column].head()}")
        raise

    # Sort BEFORE differencing: a difference between consecutive rows is only
    # meaningful once the rows are in chronological order.
    df = df.sort_values('datetime')
    if diff_source is not None:
        df['diff'] = df[diff_source].diff()

    df['hour'] = df['datetime'].dt.hour
    df['day_of_week'] = df['datetime'].dt.dayofweek

    # Rolling statistics of the differenced signal.
    window_size = 24 if dataset_type == 'helios' else 7
    rolling = df['diff'].rolling(window=window_size)
    df['rolling_mean'] = rolling.mean()
    df['rolling_std'] = rolling.std()

    # z-score is computed before NaN filling, so warm-up rows stay NaN and can
    # never be counted as validated anomalies.
    df['z_score'] = (df['diff'] - df['rolling_mean']) / df['rolling_std']

    features = ['diff', 'hour', 'day_of_week', 'rolling_mean', 'rolling_std']

    # Fill gaps with the column mean; a column that is entirely NaN (file
    # shorter than the window) has no mean, so fall back to 0.
    df[features] = df[features].fillna(df[features].mean())
    df[features] = df[features].fillna(0)

    X_scaled = StandardScaler().fit_transform(df[features].values)

    results = {}
    exceeds_z = abs(df['z_score']) > z_score_threshold
    for model_name, model in models.items():
        model.fit(X_scaled)
        df[f'{model_name}_anomaly_score'] = model.decision_function(X_scaled)
        df[f'{model_name}_is_anomaly'] = model.predict(X_scaled)
        df[f'{model_name}_is_validated_anomaly'] = (df[f'{model_name}_is_anomaly'] == 1) & exceeds_z
        results[model_name] = df[f'{model_name}_is_validated_anomaly'].sum()

    return df, results

def plot_results(df, user_key, dataset_type, results):
    """Plot the 'diff' signal over time with each model's validated anomalies.

    Parameters:
        df: frame with 'datetime', 'diff' and one
            '<model>_is_validated_anomaly' column per key of ``results``.
        user_key: label shown in the title.
        dataset_type: shown in parentheses in the title.
        results: its keys are the model names to overlay.
    """
    plt.figure(figsize=(12, 6))
    plt.plot(df['datetime'], df['diff'], label='Consumption', alpha=0.5)

    # One scatter layer per model, restricted to its validated rows.
    for model_name in results:
        flagged = df[df[f'{model_name}_is_validated_anomaly']]
        plt.scatter(flagged['datetime'], flagged['diff'],
                    label=f'{model_name} Validated Anomalies')

    plt.title(f'Validated Anomalies for {user_key} ({dataset_type})')
    plt.xlabel('DateTime')
    plt.ylabel('Consumption Difference')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def main(folder_path, dataset_type, option, contamination, z_score_threshold):
    """Fit the PyOD models on each CSV in a folder and plot the validated anomalies.

    The same four model instances are reused for every file; process_file
    refits them on each file's features.
    """
    detectors = {
        'IForest': IForest(contamination=contamination, random_state=42),
        'KNN': KNN(contamination=contamination, n_neighbors=5),
        'LOF': LOF(contamination=contamination),
        'AutoEncoder': AutoEncoder(contamination=contamination, epoch_num=10)
    }

    csv_names = (name for name in os.listdir(folder_path) if name.endswith('.csv'))
    for filename in csv_names:
        df, results = process_file(
            os.path.join(folder_path, filename),
            dataset_type, option, contamination, detectors, z_score_threshold,
        )

        # Per-file summary followed by the plot; the file name doubles as the
        # user key in the plot title.
        print(f"Processing data for file: {filename}")
        print(f"Total data points: {len(df)}")
        for model_name, count in results.items():
            print(f"{model_name} validated anomalies: {count}")

        plot_results(df, filename, dataset_type, results)

# Interactive driver for the single-file PyOD cell: collect the dataset, its
# option and the detection parameters, pick the folder, then run main().
dataset_type = input("Enter dataset type (helios/queensland/datamill): ").lower()
while dataset_type not in ('helios', 'queensland', 'datamill'):
    dataset_type = input("Invalid input. Please enter 'helios', 'queensland', or 'datamill': ").lower()

# Only helios and queensland have a sub-option; datamill gets a placeholder.
if dataset_type == 'datamill':
    option = 'default'
elif dataset_type == 'helios':
    option = input("Enter option (daily/total): ").lower()
    while option not in ('daily', 'total'):
        option = input("Invalid input. Please enter 'daily' or 'total': ").lower()
else:  # queensland
    option = input("Enter option (pulse1/pulse1_total): ").lower()
    while option not in ('pulse1', 'pulse1_total'):
        option = input("Invalid input. Please enter 'pulse1' or 'pulse1_total': ").lower()

# NOTE(review): the prompt advertises per-dataset defaults (1 or 3), but the
# fallback on invalid input is always 3 - confirm which is intended.
try:
    z_score_threshold = float(input("Enter Z-score threshold (default 1 for datamill and helios 3 for queensland): "))
except ValueError:
    print("Invalid Z-score threshold. Using default value of 3.")
    z_score_threshold = 3

try:
    contamination = float(input("Enter contamination factor (default 0.01): "))
except ValueError:
    print("Invalid contamination factor. Using default value of 0.01.")
    contamination = 0.01

# Change these to your own location if needed, for example:
#   folder_path = '/content/drive/MyDrive/your_path/dataset/helios/user_helios_sorted/'
if dataset_type == 'datamill':
    folder_path = './dataset/datamill/user_datamill_sorted/'
elif dataset_type == 'helios':
    folder_path = './dataset/helios/user_helios_sorted/'
elif option == 'pulse1':
    folder_path = './dataset/queensland/user_sorted_pulse/'
else:
    folder_path = './dataset/queensland/user_sorted_pulsetot/'

main(folder_path, dataset_type, option, contamination, z_score_threshold)


## Training on the Whole Dataset for Anomaly Detection

The LOF model for the Helios dataset may take a long time to train or may run out of RAM. You can cancel long-running models from the command line.

In [None]:
import os
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler
import joblib

def process_file(file_path, value_column, contamination=0.01):
    """Read one CSV and return its standardized feature matrix for training.

    Parameters:
        file_path: CSV file to read.
        value_column: column to difference. It also identifies the dataset
            layout: 'diff' / 'meter reading' are helios columns
            (semicolon-separated file), 'GROSS_CONSUMPTION' /
            'DAILY_AVERAGE_CONSUMPTION' are datamill columns (timestamp in
            'READING_START_DATE'); anything else uses a comma-separated file
            with a 'datetime' column.
        contamination: accepted for interface parity; not used here.

    Returns:
        2-D array with columns [diff, day_of_week, rolling_mean, rolling_std],
        standardized per file, or None if the file could not be processed.
    """
    print(f"Processing file: {file_path}")
    try:
        # Helios files are ';'-separated (as in the other cells of this
        # notebook); reading them with the default ',' yields a single column
        # and the file would be skipped.
        if value_column in ('diff', 'meter reading'):
            df = pd.read_csv(file_path, sep=';')
        else:
            df = pd.read_csv(file_path)

        if value_column in ('GROSS_CONSUMPTION', 'DAILY_AVERAGE_CONSUMPTION'):
            df['datetime'] = pd.to_datetime(df['READING_START_DATE'], format='%d/%m/%Y %H:%M')
        else:
            df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%m/%Y %H:%M:%S')
        df = df.sort_values('datetime')

        print(f"DataFrame shape: {df.shape}")
        print(f"Columns: {df.columns}")

        df['day_of_week'] = df['datetime'].dt.dayofweek

        # NOTE(review): when value_column is already 'diff' this takes a second
        # difference - confirm that is intended.
        window_size = 7
        df['diff'] = df[value_column].diff()
        rolling = df['diff'].rolling(window=window_size)
        df['rolling_mean'] = rolling.mean()
        df['rolling_std'] = rolling.std()

        features = ['diff', 'day_of_week', 'rolling_mean', 'rolling_std']

        # Fill gaps with the column mean; a column that is entirely NaN has no
        # mean, so report it and fall back to 0.
        df[features] = df[features].fillna(df[features].mean())
        if df[features].isnull().values.any():
            print("NaN values found after filling with mean:")
            print(df[features].isnull().sum())
            df[features] = df[features].fillna(0)

        return StandardScaler().fit_transform(df[features].values)
    except Exception as e:
        # Best effort: a bad file is reported and skipped by the caller.
        print(f"Error processing file {file_path}: {str(e)}")
        return None

def train_and_save_models(folder_path, value_column, model_save_path, contamination=0.01):
    """Train the PyOD models on all CSVs in a folder and save each to disk.

    Parameters:
        folder_path: directory whose '*.csv' files are turned into features
            via process_file and stacked into one training matrix.
        value_column: forwarded to process_file.
        model_save_path: directory (created if missing) receiving one
            '<name>_model.pkl' file per successfully trained model.
        contamination: contamination setting passed to every model.

    Errors are reported with print rather than raised. A failure in one model
    (fit or save) does not prevent the remaining models from being trained.
    """
    print(f"Starting training with folder_path: {folder_path}")

    models = {
        'IForest': IForest(contamination=contamination, random_state=42),
        'KNN': KNN(contamination=contamination, n_neighbors=5),
        'LOF': LOF(contamination=contamination),
        'AutoEncoder': AutoEncoder(contamination=contamination)
    }

    try:
        file_list = os.listdir(folder_path)
        print(f"Files in directory: {file_list}")

        feature_blocks = []
        for filename in file_list:
            if not filename.endswith('.csv'):
                continue
            X_scaled = process_file(os.path.join(folder_path, filename), value_column, contamination)
            if X_scaled is not None:
                feature_blocks.append(X_scaled)

        if not feature_blocks:
            # Make the "nothing happened" case visible instead of returning silently.
            print("No usable CSV data found; no models were trained.")
            return

        X_combined = np.vstack(feature_blocks)
        print(f"Combined feature matrix shape: {X_combined.shape}")

        os.makedirs(model_save_path, exist_ok=True)

        for model_name, model in models.items():
            model_filename = os.path.join(model_save_path, f'{model_name}_model.pkl')
            # Isolate each model so one failure (e.g. out of memory) does not
            # abort the models that come after it.
            try:
                model.fit(X_combined)
                joblib.dump(model, model_filename)
            except Exception as e:
                print(f"Error training or saving {model_name} model: {str(e)}")
                continue
            print(f"Saved {model_name} model to {model_filename}")

    except Exception as e:
        print(f"Error in training and saving models: {str(e)}")

# Interactive driver for the training cell: choose dataset and value type,
# derive the data folder, model folder and value column, then train.
dataset_type = input("Enter dataset type (helios/queensland/datamill): ").lower()
while dataset_type not in ('helios', 'queensland', 'datamill'):
    dataset_type = input("Invalid input. Please enter 'helios', 'queensland', or 'datamill': ").lower()

# Every dataset asks the same daily/total question, so ask it once.
value_type = input("Enter data type (daily/total): ").lower()
while value_type not in ('daily', 'total'):
    value_type = input("Invalid input. Please enter 'daily' or 'total': ").lower()

if dataset_type == 'helios':
    folder_path = './dataset/helios/user_helios_sorted'
    if value_type == 'daily':
        model_save_path, value_column = './models/helios/daily', 'diff'
    else:
        model_save_path, value_column = './models/helios/total', 'meter reading'
elif dataset_type == 'queensland':
    # Queensland keeps daily and total readings in separate folders.
    if value_type == 'daily':
        folder_path = './dataset/queensland/user_sorted_pulse'
        model_save_path, value_column = './models/queensland/daily', 'Pulse1'
    else:
        folder_path = './dataset/queensland/user_sorted_pulsetot'
        model_save_path, value_column = './models/queensland/total', 'Pulse1_Total'
else:  # datamill
    folder_path = './dataset/datamill/user_datamill_sorted'  # Replace with the actual path
    if value_type == 'daily':
        model_save_path, value_column = './models/datamill/daily/', 'DAILY_AVERAGE_CONSUMPTION'
    else:
        model_save_path, value_column = './models/datamill/total/', 'GROSS_CONSUMPTION'

for summary_line in (
    f"Selected dataset type: {dataset_type}",
    f"Selected pulse type: {value_column}",
    f"Folder path: {folder_path}",
    f"Model save path: {model_save_path}",
):
    print(summary_line)

try:
    train_and_save_models(folder_path, value_column, model_save_path, contamination=0.01)
except Exception as e:
    print(f"An error occurred: {str(e)}")


## Prediction with Trained Models

In [None]:
import os
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import joblib

def load_models(models_path):
    """Load the saved detectors found in a directory.

    Looks for 'IForest_model.pkl', 'KNN_model.pkl', 'LOF_model.pkl' and
    'AutoEncoder_model.pkl' under ``models_path``. Missing files and load
    failures are reported with print and skipped.

    Returns:
        dict mapping '<Name>_model' to the loaded object; empty when nothing
        could be loaded.
    """
    loaded = {}
    for model_name in ('IForest_model', 'KNN_model', 'LOF_model', 'AutoEncoder_model'):
        model_file = os.path.join(models_path, f'{model_name}.pkl')
        if not os.path.exists(model_file):
            print(f"Model file {model_file} does not exist.")
            continue
        try:
            loaded[model_name] = joblib.load(model_file)
            print(f"Loaded {model_name} model.")
        except Exception as e:
            print(f"Error loading {model_name} model: {str(e)}")
    return loaded

def process_file(file_path, contamination, models, dataset_type, value_type, value_column):
    """Score one CSV with pre-trained models and flag validated anomalies.

    Parameters:
        file_path: CSV file to read (helios files are semicolon-separated;
            datamill timestamps live in 'READING_START_DATE').
        contamination: accepted for interface parity; not used here.
        models: dict of name -> fitted detector exposing decision_function
            and predict.
        dataset_type: 'helios', 'queensland' or 'datamill'.
        value_type: accepted for interface parity; not used here.
        value_column: column holding the measurement to analyse.

    Returns:
        (df, results) on success, where results maps each successfully applied
        model name to its validated-anomaly count plus 'all_methods' for rows
        every applied model agreed on; (None, None) if the file could not be
        processed.
    """
    print(f"Processing file: {file_path}")
    try:
        # Read with the same per-dataset layout the other cells use.
        if dataset_type == 'helios':
            df = pd.read_csv(file_path, sep=';')
        else:
            df = pd.read_csv(file_path)
        if dataset_type == 'datamill':
            df['datetime'] = pd.to_datetime(df['READING_START_DATE'], format='%d/%m/%Y %H:%M')
        else:
            df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%m/%Y %H:%M:%S')
        df = df.sort_values('datetime')

        print(f"DataFrame shape: {df.shape}")
        print(f"Columns: {df.columns}")

        df['hour'] = df['datetime'].dt.hour
        df['day_of_week'] = df['datetime'].dt.dayofweek

        values = df[value_column]
        step = values.diff()
        # Do not overwrite the measurement itself when it is already named
        # 'diff' (helios daily); it is still needed for the z-score and plot.
        if value_column != 'diff':
            df['diff'] = step

        # Rolling z-score of the measurement, used to validate model output.
        window_size = 24 if dataset_type == 'helios' else 7
        value_rolling = values.rolling(window=window_size)
        df['rolling_mean'] = value_rolling.mean()
        df['rolling_std'] = value_rolling.std()
        df['z_score'] = (values - df['rolling_mean']) / df['rolling_std']

        # Model input must match what the models were fitted on. The training
        # cell's process_file uses [diff, day_of_week, rolling_mean,
        # rolling_std] with 7-row rolling statistics of the differenced value,
        # mean-filled (then 0-filled) and standardized per file.
        step_rolling = step.rolling(window=7)
        feature_frame = pd.DataFrame({
            'diff': step,
            'day_of_week': df['day_of_week'],
            'rolling_mean': step_rolling.mean(),
            'rolling_std': step_rolling.std(),
        }, index=df.index)
        feature_frame = feature_frame.fillna(feature_frame.mean()).fillna(0)
        X_scaled = StandardScaler().fit_transform(feature_frame.values)

        results = {}
        z_score_threshold = 3 if dataset_type == 'helios' else 1
        exceeds_z = abs(df['z_score']) > z_score_threshold

        for model_name, model in models.items():
            # Score first, write columns only on success, so a failing model
            # leaves no partial columns behind.
            try:
                outlier_scores = model.decision_function(X_scaled)
                labels = model.predict(X_scaled)
            except Exception as e:
                print(f"Error applying {model_name} model: {str(e)}")
                continue
            df[f'{model_name}_anomaly_score'] = outlier_scores
            df[f'{model_name}_is_anomaly'] = labels
            df[f'{model_name}_is_validated_anomaly'] = (df[f'{model_name}_is_anomaly'] == 1) & exceeds_z
            results[model_name] = df[f'{model_name}_is_validated_anomaly'].sum()

        # Consensus over the models that actually ran. With none, flag nothing
        # (all() over zero columns would mark every row as an anomaly).
        validated_columns = [f'{name}_is_validated_anomaly' for name in results]
        if validated_columns:
            df['all_methods_anomaly'] = df[validated_columns].all(axis=1)
        else:
            df['all_methods_anomaly'] = False
        results['all_methods'] = df['all_methods_anomaly'].sum()

        return df, results
    except Exception as e:
        # Best effort: report the file and let the caller skip it.
        print(f"Error processing file {file_path}: {str(e)}")
        return None, None


def plot_results(df, user_key, dataset_type, value_type, value_column, results):
    """Plot the measurement over time and mark the rows all methods flagged.

    Parameters:
        df: frame with 'datetime', ``value_column`` and a boolean
            'all_methods_anomaly' column.
        user_key: label shown in the title.
        dataset_type, value_type, results: accepted for interface parity;
            not used by the plot.
        value_column: column drawn on the y axis.

    Any plotting error is reported with print instead of being raised.
    """
    try:
        plt.figure(figsize=(12, 6))
        plt.plot(df['datetime'], df[value_column], label='Consumption', alpha=0.5)

        consensus_rows = df[df['all_methods_anomaly']]
        plt.scatter(consensus_rows['datetime'], consensus_rows[value_column],
                    label='Anomalies (All Methods)', color='red')

        plt.title(f'Anomalies Detected by All Methods for {user_key}')
        plt.xlabel('DateTime')
        plt.ylabel('Consumption')
        plt.legend()
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Error plotting results: {str(e)}")
def main(folder_path, model_save_path, dataset_type, value_type, value_column, contamination=0.01):
    """Apply the saved models to every CSV in a folder and plot the consensus anomalies.

    Returns early (after a message) when no model could be loaded. Files that
    fail to process are reported and skipped; any other error in the loop is
    reported with print.
    """
    print(f"Starting main function with folder_path: {folder_path} and model_save_path: {model_save_path}")

    models = load_models(model_save_path)
    if not models:
        print("No models loaded. Exiting.")
        return

    try:
        file_list = os.listdir(folder_path)
        print(f"Files in directory: {file_list}")

        for filename in file_list:
            if not filename.endswith('.csv'):
                continue

            df, results = process_file(
                os.path.join(folder_path, filename),
                contamination, models, dataset_type, value_type, value_column,
            )
            if df is None or results is None:
                print(f"Skipping file {filename} due to processing error")
                continue

            # Helios files carry the user key in a column; elsewhere the file
            # name (up to its first dot) is used.
            if dataset_type == 'helios':
                user_key = df['user key'].iloc[0]
            else:
                user_key = filename.split('.')[0]

            print(f"Processing data for file: {filename}")
            print(f"Total data points: {len(df)}")
            for model_name, count in results.items():
                print(f"{model_name} validated anomalies: {count}")

            plot_results(df, user_key, dataset_type, value_type, value_column, results)
    except Exception as e:
        print(f"Error in main function: {str(e)}")

# Interactive driver for the prediction cell: choose dataset and value type,
# derive the data folder, model folder and value column, then run main().
dataset_type = input("Enter dataset type (helios/queensland/datamill): ").lower()
while dataset_type not in ('helios', 'queensland', 'datamill'):
    dataset_type = input("Invalid input. Please enter 'helios', 'queensland', or 'datamill': ").lower()

value_type = input("Enter data type (daily/total): ").lower()
while value_type not in ('daily', 'total'):
    value_type = input("Invalid input. Please enter 'daily' or 'total': ").lower()

# Models are stored per dataset and value type.
model_save_path = f'./models/{dataset_type}/{value_type}'

if dataset_type == 'datamill':
    folder_path = './dataset/datamill/user_datamill_sorted'
    if value_type == 'daily':
        value_column = 'DAILY_AVERAGE_CONSUMPTION'
    else:
        value_column = 'GROSS_CONSUMPTION'
elif dataset_type == 'helios':
    folder_path = './dataset/helios/user_helios_sorted'
    if value_type == 'daily':
        value_column = 'diff'
    else:
        value_column = 'meter reading'
else:  # queensland keeps daily and total readings in separate folders
    if value_type == 'total':
        folder_path = './dataset/queensland/user_sorted_pulsetot'
    else:
        folder_path = './dataset/queensland/user_sorted_pulse'
    if value_type == 'daily':
        value_column = 'Pulse1'
    else:
        value_column = 'Pulse1_Total'

for summary_line in (
    f"Selected dataset type: {dataset_type}",
    f"Selected value type: {value_type}",
    f"Value column: {value_column}",
    f"Folder path: {folder_path}",
    f"Model save path: {model_save_path}",
):
    print(summary_line)

try:
    main(folder_path, model_save_path, dataset_type, value_type, value_column, contamination=0.01)
except Exception as e:
    print(f"An error occurred: {str(e)}")