In [2]:
!pip install adtk
!pip install pyod

Collecting adtk
  Downloading adtk-0.6.2-py3-none-any.whl.metadata (4.7 kB)
Downloading adtk-0.6.2-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: adtk
Successfully installed adtk-0.6.2
Collecting pyod
  Downloading pyod-2.0.1.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-2.0.1-py3-none-any.whl size=193269 sha256=f861ca01b27314aec1e8bd999f589d218a48e4a9bb3a74517063d8ece2ae3428
  Stored in directory: /root/.cache/pip/wheels/94/75/88/b853cf33b0053b0a001dca55b74d515048b7656e736364eb57
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod

In [3]:
# Mount Google Drive at /content/drive; the later cells read the datasets
# from paths below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
%cd /content/drive/MyDrive/kentkart/water meter dataset/dataset/queensland/
%ls
!unzip sorted_queensland.zip

/content/drive/MyDrive/kentkart/water meter dataset/dataset/queensland
sorted_queensland.zip
Archive:  sorted_queensland.zip
   creating: sorted_pulsetot/
  inflating: sorted_pulsetot/60193575_Pulse1_Total.csv  
  inflating: sorted_pulsetot/60255908_Pulse1_Total.csv  
  inflating: sorted_pulsetot/64629928_Pulse1_Total.csv  
  inflating: sorted_pulsetot/314254_Pulse1_Total.csv  
  inflating: sorted_pulsetot/47604269_Pulse1_Total.csv  
  inflating: sorted_pulsetot/731394_Pulse1_Total.csv  
  inflating: sorted_pulsetot/81968205_Pulse1_Total.csv  
  inflating: sorted_pulsetot/61319259_Pulse1_Total.csv  
  inflating: sorted_pulsetot/60218899_Pulse1_Total.csv  
  inflating: sorted_pulsetot/2266_Pulse1_Total.csv  
  inflating: sorted_pulsetot/60584922_Pulse1_Total.csv  
  inflating: sorted_pulsetot/61148634_Pulse1_Total.csv  
  inflating: sorted_pulsetot/60193546_Pulse1_Total.csv  
  inflating: sorted_pulsetot/409_Pulse1_Total.csv  
  inflating: sorted_pulsetot/65087788_Pulse1_Total.csv  
  i

In [None]:
%cd /content/drive/MyDrive/kentkart/water meter dataset/dataset/helios/
%ls
!unzip helios_dataset.zip

ADTK CODE

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from adtk.data import validate_series
from adtk.detector import (
    ThresholdAD, QuantileAD, PersistAD, SeasonalAD,
    InterQuartileRangeAD, AutoregressionAD
)
import glob
import os

def plot_water_usage_from_files(csv_folder):
    """Plot a linear trend and ADTK anomaly detections for every CSV in a folder.

    For each ``*.csv`` file in ``csv_folder`` the rows whose ``typeM`` equals
    ``'Pulse1_Total'`` are kept, ``Value`` is summed per ``time`` stamp, a
    linear regression over the sample position is fitted and plotted, and six
    ADTK detectors are run. Every detector result and their union is plotted
    and printed.

    Args:
        csv_folder: Directory searched (non-recursively) for ``*.csv`` files.
            Each file must provide ``time``, ``typeM`` and ``Value`` columns.

    Returns:
        None. The function only produces figures and printed output.
    """

    def as_mask(flags):
        """Turn a detector result into a plain boolean mask (NaN -> False)."""
        # Detectors leave NaN where they give no verdict; count those points
        # as "not anomalous".
        return flags.fillna(False).astype(bool)

    def plot_anomalies(series, anomalies, title):
        """Plot ``series`` and mark the points flagged in ``anomalies`` in red."""
        mask = as_mask(anomalies)
        plt.figure(figsize=(15, 5))
        plt.plot(series.index, series.values, label='Data')
        plt.scatter(series[mask].index, series[mask].values, color='red', label='Anomalies')
        plt.title(title)
        plt.legend()
        plt.xlabel('Time')
        plt.ylabel('Value')
        plt.tight_layout()
        plt.show()

    # Find all CSV files in the specified folder
    csv_files = glob.glob(os.path.join(csv_folder, '*.csv'))
    if not csv_files:
        # Without this message a wrong folder silently produces no output.
        print(f"No CSV files found in the folder: {csv_folder}")
        return

    for csv_file in csv_files:
        print(f"Processing file: {csv_file}")

        df = pd.read_csv(csv_file)
        df['time'] = pd.to_datetime(df['time'])

        # .copy() makes the filtered frame independent of ``df`` so that the
        # column assignment below does not write to a slice of another frame.
        filtered_data = df[df['typeM'] == 'Pulse1_Total'].copy()
        # Non-numeric readings become NaN and are ignored by the sum below.
        filtered_data['Value'] = pd.to_numeric(filtered_data['Value'], errors='coerce')

        # Group by 'time' and sum 'Value' for water usage over time
        usage_by_time = filtered_data.groupby('time')['Value'].sum()
        if usage_by_time.empty:
            print(f"No 'Pulse1_Total' rows found in {csv_file}; skipping.")
            continue

        series = validate_series(usage_by_time)

        # Linear regression on the sample position (as an example trend fit)
        X = np.arange(len(series)).reshape(-1, 1)
        y = series.values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred = model.predict(X)
        plt.figure(figsize=(15, 5))
        plt.plot(series.index, series.values, label='Data')
        plt.plot(series.index, y_pred, color='green', label='Fitted Line')
        plt.title('Linear Regression Fit')
        plt.legend()
        plt.xlabel('Time')
        plt.ylabel('Value')
        plt.tight_layout()
        plt.show()

        # Define anomaly detection methods
        threshold_ad = ThresholdAD(high=series.quantile(0.99), low=series.quantile(0.01))
        quantile_ad = QuantileAD(high=0.99, low=0.01)
        # PersistAD is configured through window / c / side; it has no
        # ``min_duration`` or ``threshold`` keyword.
        persist_ad = PersistAD(c=3.0, side='both')
        seasonal_ad = SeasonalAD(c=3.0, side='both')
        iqr_ad = InterQuartileRangeAD(c=1.5)
        autoregression_ad = AutoregressionAD(n_steps=10, step_size=1)

        # ThresholdAD has fixed bounds and needs no training; every other
        # detector has to be fitted before it can detect, hence fit_detect.
        anomalies_threshold = threshold_ad.detect(series)
        anomalies_quantile = quantile_ad.fit_detect(series)
        anomalies_persist = persist_ad.fit_detect(series)
        anomalies_seasonal = seasonal_ad.fit_detect(series)
        anomalies_iqr = iqr_ad.fit_detect(series)
        anomalies_autoregression = autoregression_ad.fit_detect(series)

        detections = [
            ('Threshold', anomalies_threshold),
            ('Quantile', anomalies_quantile),
            ('Persist', anomalies_persist),
            ('Seasonal', anomalies_seasonal),
            ('IQR', anomalies_iqr),
            ('Autoregression', anomalies_autoregression),
        ]

        # A point is a combined anomaly when at least one detector flags it.
        anomalies_combined = as_mask(anomalies_threshold)
        for _, flags in detections[1:]:
            anomalies_combined = anomalies_combined | as_mask(flags)

        # Plot each anomaly detection method, then the combination
        for label, flags in detections:
            plot_anomalies(series, flags, f'{label} Anomalies')
        plot_anomalies(series, anomalies_combined, 'Combined Anomalies')

        # Print results
        for label, flags in detections:
            print(f"{label} Anomalies:\n", flags)
        print("Combined Anomalies:\n", anomalies_combined)

def process_helios_data(csv_file):
    """Run ADTK detectors on the ``diff`` column of one Helios CSV and plot the hits.

    The file is read with ``;`` as delimiter, ``datetime`` is parsed as
    ``%d/%m/%Y %H:%M:%S`` and used as the index of the analysed series.
    Threshold, quantile, persist, inter-quartile-range and autoregression
    detectors are applied; each result and their union is plotted and printed.

    Args:
        csv_file: Path to a semicolon-delimited CSV with ``datetime`` and
            ``diff`` columns.

    Returns:
        None. The function only produces figures and printed output.
    """

    def as_mask(flags):
        """Turn a detector result into a plain boolean mask (NaN -> False)."""
        return flags.fillna(False).astype(bool)

    def plot_anomalies(series, anomalies, title):
        """Plot ``series`` and mark the points flagged in ``anomalies`` in red."""
        mask = as_mask(anomalies)
        plt.figure(figsize=(15, 5))
        plt.plot(series.index, series.values, label='Data')
        plt.scatter(series[mask].index, series[mask].values, color='red', label='Anomalies')
        plt.title(title)
        plt.legend()
        plt.xlabel('Time')
        plt.ylabel('Meter Reading')
        plt.tight_layout()
        plt.show()

    # Read the whole CSV file (no row limit is applied)
    df = pd.read_csv(csv_file, delimiter=';')

    # Convert 'datetime' column to datetime format and order chronologically
    df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%m/%Y %H:%M:%S')
    df = df.sort_values('datetime')

    # Create and validate the time series
    series = validate_series(df.set_index('datetime')['diff'])

    # Define anomaly detection methods. No seasonal detector is used here:
    # the one that used to be constructed was never run.
    threshold_ad = ThresholdAD(high=series.quantile(0.9999), low=series.quantile(0.0001))
    quantile_ad = QuantileAD(high=0.99999, low=0.00001)
    persist_ad = PersistAD(c=3.0, side='positive')
    iqr_ad = InterQuartileRangeAD(c=1.5)
    autoregression_ad = AutoregressionAD(n_steps=7*2, step_size=24, c=3.0)

    # Detect anomalies (ThresholdAD needs no fitting, the others do)
    anomalies_threshold = threshold_ad.detect(series)
    anomalies_quantile = quantile_ad.fit_detect(series)
    anomalies_persist = persist_ad.fit_detect(series)
    anomalies_iqr = iqr_ad.fit_detect(series)
    anomalies_autoregression = autoregression_ad.fit_detect(series)

    detections = [
        ('Threshold', anomalies_threshold),
        ('Quantile', anomalies_quantile),
        ('Persist', anomalies_persist),
        ('IQR', anomalies_iqr),
        ('Autoregression', anomalies_autoregression),
    ]

    # A point is a combined anomaly when at least one detector flags it; NaN
    # verdicts are treated as "not anomalous" before OR-ing.
    anomalies_combined = as_mask(anomalies_threshold)
    for _, flags in detections[1:]:
        anomalies_combined = anomalies_combined | as_mask(flags)

    # Plot each anomaly detection method, then the combination
    for label, flags in detections:
        plot_anomalies(series, flags, f'{label} Anomalies')
    plot_anomalies(series, anomalies_combined, 'Combined Anomalies')

    # Print results
    for label, flags in detections:
        print(f"{label} Anomalies:\n", flags)
    print("Combined Anomalies:\n", anomalies_combined)

def main():
    """Ask which dataset to analyse and run the matching ADTK pipeline.

    Entering ``1`` runs ``plot_water_usage_from_files`` on the extracted
    Queensland folder, ``2`` runs ``process_helios_data`` on one Helios user
    file; anything else prints a hint. Surrounding whitespace in the answer is
    ignored.
    """
    dataset_choice = input("Choose dataset (1 for water meter, 2 for Helios): ").strip()

    if dataset_choice == '1':
        # Folder created by extracting sorted_queensland.zip inside the
        # queensland dataset directory (see the unzip cell above).
        csv_folder = '/content/drive/MyDrive/kentkart/water meter dataset/dataset/queensland/sorted_pulsetot'
        plot_water_usage_from_files(csv_folder)
    elif dataset_choice == '2':
        # Specify the Helios CSV file in Google Drive
        csv_file = '/content/drive/MyDrive/kentkart/water meter dataset/dataset/helios/user dataset/ffa1ece2-baae-4ea7-8e68-fe4110467a8d.csv'
        process_helios_data(csv_file)
    else:
        print("Invalid choice. Please choose 1 or 2.")

if __name__ == '__main__':
    main()


Choose dataset (1 for water meter, 2 for Helios): 1


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from adtk.data import validate_series
from adtk.detector import (
    ThresholdAD, QuantileAD, PersistAD, SeasonalAD,
    InterQuartileRangeAD, AutoregressionAD
)
import glob
import os

def train_model_and_detect_anomalies(csv_folder):
    """Train a random forest on Helios CSVs and run ADTK detectors on the hold-out targets.

    Every ``*.csv`` in ``csv_folder`` is read with ``;`` as delimiter; hour,
    day-of-week and month are derived from ``datetime`` as features and
    ``diff`` is the target. The pooled rows are split 80/20, a
    ``RandomForestRegressor`` is fitted on the scaled training features, and
    the hold-out targets (indexed by their timestamps) are passed through six
    ADTK detectors. Each detector result and their union is plotted and
    printed once.

    Args:
        csv_folder: Directory searched (non-recursively) for ``*.csv`` files
            with ``datetime`` and ``diff`` columns.

    Returns:
        None. Files that cannot be processed are reported and skipped.
    """

    def plot_anomalies(series, anomalies, title):
        """Plot ``series`` and mark the points flagged in ``anomalies`` in red."""
        plt.figure(figsize=(15, 5))
        plt.plot(series.index, series.values, label='Data')
        anomalies = anomalies.fillna(False)  # Handle NaN values
        plt.scatter(series[anomalies].index, series[anomalies].values, color='red', label='Anomalies')
        plt.title(title)
        plt.legend()
        plt.xlabel('Time')
        plt.ylabel('Difference')
        plt.tight_layout()
        plt.show()

    csv_files = glob.glob(os.path.join(csv_folder, '*.csv'))

    if not csv_files:
        print(f"No CSV files found in the folder: {csv_folder}")
        return

    all_features = []
    all_targets = []
    all_datetimes = []

    for csv_file in csv_files:
        print(f"Processing file: {csv_file}")

        try:
            df = pd.read_csv(csv_file, delimiter=';')
            df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%m/%Y %H:%M:%S')
            df = df.sort_values('datetime')

            df['hour'] = df['datetime'].dt.hour
            df['day_of_week'] = df['datetime'].dt.dayofweek
            df['month'] = df['datetime'].dt.month

            all_features.append(df[['hour', 'day_of_week', 'month']])
            all_targets.append(df['diff'])
            all_datetimes.append(df['datetime'])

        except Exception as e:
            # Best effort: report the file and continue with the remaining ones.
            print(f"Error processing file {csv_file}: {str(e)}")

    if not all_features or not all_targets:
        print("No valid data found in the CSV files.")
        return

    X = pd.concat(all_features, ignore_index=True)
    y = pd.concat(all_targets, ignore_index=True)
    datetimes = pd.concat(all_datetimes, ignore_index=True)

    X_train, X_test, y_train, y_test, datetime_train, datetime_test = train_test_split(
        X, y, datetimes, test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

    # NOTE(review): the predictions are not used by the anomaly detection
    # below, which works on the raw hold-out targets — confirm the intent.
    y_pred = model.predict(X_test_scaled)

    # Create a time series from the test set for anomaly detection.
    # NOTE(review): the split is random and pools all files, so timestamps can
    # repeat across users — verify how validate_series treats them.
    test_time_series = pd.Series(y_test.values, index=datetime_test)
    series = validate_series(test_time_series)

    # Define anomaly detection methods
    threshold_ad = ThresholdAD(high=series.quantile(0.99), low=series.quantile(0.01))
    quantile_ad = QuantileAD(high=0.99, low=0.01)
    persist_ad = PersistAD(c=3.0, side='both')
    seasonal_ad = SeasonalAD(c=3.0, side='both')
    iqr_ad = InterQuartileRangeAD(c=1.5)
    autoregression_ad = AutoregressionAD(n_steps=24, step_size=1)

    # Detect anomalies
    anomalies_threshold = threshold_ad.detect(series)
    anomalies_quantile = quantile_ad.fit_detect(series)
    anomalies_persist = persist_ad.fit_detect(series)
    anomalies_seasonal = seasonal_ad.fit_detect(series)
    anomalies_iqr = iqr_ad.fit_detect(series)
    anomalies_autoregression = autoregression_ad.fit_detect(series)

    # Combine anomaly detections
    anomalies_combined = (anomalies_threshold | anomalies_quantile |
                          anomalies_persist | anomalies_seasonal |
                          anomalies_iqr | anomalies_autoregression)

    detections = [
        ('Threshold', anomalies_threshold),
        ('Quantile', anomalies_quantile),
        ('Persist', anomalies_persist),
        ('Seasonal', anomalies_seasonal),
        ('IQR', anomalies_iqr),
        ('Autoregression', anomalies_autoregression),
    ]

    # Plot each detector, then the combination. The combined plot and the
    # printed report are emitted exactly once.
    for label, flags in detections:
        plot_anomalies(series, flags, f'{label} Anomalies')
    plot_anomalies(series, anomalies_combined, 'Combined Anomalies')

    # Print results
    for label, flags in detections:
        print(f"{label} Anomalies:\n", flags)
    print("Combined Anomalies:\n", anomalies_combined)

def main():
    """Run the random-forest / ADTK pipeline on the Helios user folder.

    Prints a message and does nothing else when the folder is missing.
    """
    # Location of the per-user Helios CSV files on Google Drive
    helios_dir = '/content/drive/MyDrive/kentkart/water meter dataset/dataset/helios/user dataset/'

    if os.path.exists(helios_dir):
        train_model_and_detect_anomalies(helios_dir)
    else:
        print(f"The specified folder does not exist: {helios_dir}")


if __name__ == '__main__':
    main()

In [None]:
import os
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def process_file(file_path, contamination=0.01):
    """Score one Helios CSV with an Isolation Forest and a rolling z-score check.

    Args:
        file_path: Semicolon-delimited CSV with ``datetime`` and ``diff`` columns.
        contamination: Passed to ``IForest`` as its ``contamination`` setting.

    Returns:
        The chronologically sorted frame with the added columns ``hour``,
        ``day_of_week``, ``rolling_mean``, ``rolling_std``, ``z_score``,
        ``anomaly_score``, ``is_anomaly`` and ``is_validated_anomaly``.
    """
    window_size = 24  # Adjust based on your data frequency
    z_score_threshold = 3  # Adjust as needed
    features = ['diff', 'hour', 'day_of_week', 'rolling_mean', 'rolling_std']

    # Load and order the readings chronologically
    df = pd.read_csv(file_path, sep=';')
    df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%m/%Y %H:%M:%S')
    df = df.sort_values('datetime')

    # Calendar features
    timestamps = df['datetime'].dt
    df['hour'] = timestamps.hour
    df['day_of_week'] = timestamps.dayofweek

    # Rolling statistics of the consumption difference and the resulting z-score.
    # The z-score is taken before the NaN fill below, so it stays NaN for the
    # rows that have no complete window.
    rolling_diff = df['diff'].rolling(window=window_size)
    df['rolling_mean'] = rolling_diff.mean()
    df['rolling_std'] = rolling_diff.std()
    df['z_score'] = (df['diff'] - df['rolling_mean']) / df['rolling_std']

    # Replace missing feature values by the column mean
    feature_frame = df[features]
    df[features] = feature_frame.fillna(feature_frame.mean())

    # Standardize the features and fit the detector on them
    X_scaled = StandardScaler().fit_transform(df[features].values)
    model = IForest(contamination=contamination, random_state=42)
    model.fit(X_scaled)

    df['anomaly_score'] = model.decision_function(X_scaled)
    df['is_anomaly'] = model.predict(X_scaled)

    # Keep only the model hits whose |z-score| also exceeds the threshold
    df['is_validated_anomaly'] = (df['is_anomaly'] == 1) & (df['z_score'].abs() > z_score_threshold)

    return df

def plot_results(df, user_key):
    """Plot the ``diff`` series of one user and mark its validated anomalies in red.

    Args:
        df: Frame with ``datetime``, ``diff`` and boolean ``is_validated_anomaly``.
        user_key: Identifier shown in the figure title.
    """
    flagged = df[df['is_validated_anomaly']]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(df['datetime'], df['diff'], label='Consumption', alpha=0.5)
    ax.scatter(flagged['datetime'], flagged['diff'], color='red', label='Validated Anomalies')
    ax.set_title(f'Validated Anomalies for User: {user_key}')
    ax.set_xlabel('DateTime')
    ax.set_ylabel('Consumption Difference')
    ax.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def main(folder_path, contamination=0.01):
    """Process and plot every ``.csv`` file found directly in ``folder_path``.

    Args:
        folder_path: Directory whose ``.csv`` entries are processed.
        contamination: Forwarded to ``process_file``.
    """
    csv_names = (name for name in os.listdir(folder_path) if name.endswith('.csv'))

    for csv_name in csv_names:
        df = process_file(os.path.join(folder_path, csv_name), contamination)

        # The user key is taken from the first row of the processed frame
        user_key = df['user key'].iloc[0]

        print(f"Processing data for user: {user_key}")
        print(f"Total data points: {len(df)}")
        print(f"Validated anomalies: {df['is_validated_anomaly'].sum()}")

        plot_results(df, user_key)

# Folder on Google Drive that holds the Helios CSV files; change this path if
# your Drive layout differs.
folder_path = '/content/drive/MyDrive/kentkart/water meter dataset/dataset/helios/user dataset/'
main(folder_path, contamination=0.01)

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pyod.models.copod import COPOD

# Path to the dataset folder
data_path = '/content/drive/MyDrive/kentkart/water meter dataset/dataset/helios/user dataset/'

# List all CSV files. Sorted, because os.listdir returns entries in arbitrary
# order and all_files[0] is used as the file to score further down.
all_files = sorted(os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith('.csv'))
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {data_path}")

# Read every file, then concatenate once (growing a DataFrame inside the loop
# re-copies all previously read rows on each iteration).
frames = []
for file in all_files:
    print(f"Processing file: {file}")
    data = pd.read_csv(file, delimiter=';')

    # Check for and drop missing values
    if data.isnull().values.any():
        print(f"Missing values detected in {file}. Dropping rows with missing values.")
        data = data.dropna()

    frames.append(data)

all_data = pd.concat(frames, ignore_index=True)

# Report the size of the combined data
print(f"Total rows in combined data: {len(all_data)}")

# Extract the feature columns
X = all_data[['meter reading', 'diff']]

# Split the data into training and testing sets.
# NOTE(review): x_test is not used below — confirm whether it should be scored.
x_train, x_test = train_test_split(X, test_size=0.2, random_state=42)

# Initialize and train the COPOD model
clf = COPOD()
clf.fit(x_train)

# Choose one specific file to predict anomalies
selected_file = all_files[0]  # Change this to select a different file

# Read the selected file
selected_data = pd.read_csv(selected_file, delimiter=';')

# Parse the datetime column
selected_data['datetime'] = pd.to_datetime(selected_data['datetime'], format='%d/%m/%Y %H:%M:%S')

# Drop rows with missing values
selected_data = selected_data.dropna()

# Extract the feature columns
X_selected = selected_data[['meter reading', 'diff']]

# Outlier scores of the selected file under the fitted model
y_selected_scores = clf.decision_function(X_selected)

# Plot the anomaly scores against the row index
plt.figure(figsize=(14, 7))
plt.plot(X_selected.index, y_selected_scores, 'ro-', label='Anomaly Scores')
plt.title(f'Anomaly Scores for {os.path.basename(selected_file)}')
plt.xlabel('Index')
plt.ylabel('Anomaly Score')
plt.legend()
plt.show()


In [None]:
from pyod.models.auto_encoder import AutoEncoder

In [None]:
import os
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def process_file(file_path, contamination=0.01, models=None, z_score_threshold=1):
    """Run several outlier models on one CSV and keep the hits confirmed by a z-score.

    The dataset kind is decided from the path: paths containing ``'helios'``
    are read with ``;`` as delimiter and use the ``diff`` column directly;
    all other paths are treated as Queensland files whose ``Pulse1`` column
    is differenced first.

    Args:
        file_path: CSV file to process.
        contamination: Used to build the default model set when ``models`` is
            not given.
        models: Mapping ``name -> detector`` offering ``fit``,
            ``decision_function`` and ``predict``. When ``None`` an IForest /
            KNN / LOF / AutoEncoder set is created with ``contamination``.
        z_score_threshold: A model hit counts as validated only when the
            absolute rolling z-score of the row exceeds this value.

    Returns:
        ``(df, results)`` where ``df`` carries per-model score / label /
        validated columns plus ``all_methods_anomaly``, and ``results`` maps
        each model name (and ``'all_methods'``) to its number of validated
        anomalies. ``(None, None)`` when anything fails; the error is printed.
    """
    print(f"Processing file: {file_path}")
    try:
        if models is None:
            # Without this the default argument could never be used: the loop
            # below would fail on ``None.items()``.
            models = {
                'IForest': IForest(contamination=contamination, random_state=42),
                'KNN': KNN(contamination=contamination, n_neighbors=5),
                'LOF': LOF(contamination=contamination),
                'AutoEncoder': AutoEncoder(contamination=contamination),
            }

        is_helios = 'helios' in file_path

        # Read the CSV file
        if is_helios:
            df = pd.read_csv(file_path, sep=';')
            value_column = 'diff'
        else:  # Queensland dataset
            # NOTE(review): other cells of this notebook read Queensland files
            # through 'time' / 'Value' columns — confirm that 'datetime' /
            # 'Pulse1' is the schema of the files in this folder.
            df = pd.read_csv(file_path)
            value_column = 'Pulse1'

        df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%m/%Y %H:%M:%S')
        df = df.sort_values('datetime')

        print(f"DataFrame shape: {df.shape}")
        print(f"Columns: {df.columns}")

        # Extract hour and day of week
        df['hour'] = df['datetime'].dt.hour
        df['day_of_week'] = df['datetime'].dt.dayofweek

        # Rolling statistics over the consumption difference
        window_size = 24 if is_helios else 7
        df['diff'] = df[value_column] if is_helios else df[value_column].diff()
        df['rolling_mean'] = df['diff'].rolling(window=window_size).mean()
        df['rolling_std'] = df['diff'].rolling(window=window_size).std()

        # z-score is computed before the NaN fill below, so it stays NaN for
        # rows without a complete window and never validates them.
        df['z_score'] = (df['diff'] - df['rolling_mean']) / df['rolling_std']

        # Prepare features for anomaly detection
        if is_helios:
            features = ['diff', 'hour', 'day_of_week', 'rolling_mean', 'rolling_std']
        else:
            features = ['diff', 'day_of_week', 'rolling_mean', 'rolling_std']

        # Handle NaN values
        df[features] = df[features].fillna(df[features].mean())

        # Standardize the features
        X_scaled = StandardScaler().fit_transform(df[features].values)

        results = {}
        for model_name, model in models.items():
            model.fit(X_scaled)
            df[f'{model_name}_anomaly_score'] = model.decision_function(X_scaled)
            df[f'{model_name}_is_anomaly'] = model.predict(X_scaled)
            df[f'{model_name}_is_validated_anomaly'] = (
                (df[f'{model_name}_is_anomaly'] == 1)
                & (df['z_score'].abs() > z_score_threshold)
            )
            results[model_name] = df[f'{model_name}_is_validated_anomaly'].sum()

        # Rows that every model flagged and the z-score confirmed
        validated_columns = [f'{model_name}_is_validated_anomaly' for model_name in models]
        df['all_methods_anomaly'] = df[validated_columns].all(axis=1)
        results['all_methods'] = df['all_methods_anomaly'].sum()

        return df, results
    except Exception as e:
        # Callers rely on (None, None) to skip a file, so report and continue.
        print(f"Error processing file {file_path}: {str(e)}")
        return None, None

def plot_results(df, user_key, dataset_type, results):
    """Plot one file's series and mark the rows flagged by all methods in red.

    Args:
        df: Frame with ``datetime``, boolean ``all_methods_anomaly`` and the
            value column (``diff`` for Helios, ``Pulse1`` otherwise).
        user_key: Identifier shown in the figure title.
        dataset_type: Dataset name; selects the value column.
        results: Accepted for call compatibility; not used here.

    Any exception raised while plotting is printed instead of propagated.
    """
    try:
        plt.figure(figsize=(12, 6))
        value_column = 'diff' if 'helios' in dataset_type else 'Pulse1'

        plt.plot(df['datetime'], df[value_column], label='Consumption', alpha=0.5)

        flagged = df[df['all_methods_anomaly']]
        plt.scatter(flagged['datetime'], flagged[value_column], label='Anomalies (All Methods)', color='red')

        plt.title(f'Anomalies Detected by All Methods for {user_key}')
        plt.xlabel('DateTime')
        plt.ylabel('Consumption')
        plt.legend()
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Error plotting results: {str(e)}")

def main(folder_path, dataset_type, contamination=0.01):
    """Run all outlier models on each ``.csv`` in ``folder_path`` and plot the result.

    Args:
        folder_path: Directory whose ``.csv`` entries are processed.
        dataset_type: Dataset name; when it contains ``'helios'`` the user key
            is read from the ``user key`` column, otherwise the file name up
            to its first dot is used.
        contamination: ``contamination`` setting for every model.

    Errors raised while listing or processing files are printed, not raised.
    """
    print(f"Starting main function with folder_path: {folder_path}")

    # The same detector instances are handed to process_file for every file
    models = {
        'IForest': IForest(contamination=contamination, random_state=42),
        'KNN': KNN(contamination=contamination, n_neighbors=5),
        'LOF': LOF(contamination=contamination),
        'AutoEncoder': AutoEncoder(contamination=contamination)
    }

    try:
        file_list = os.listdir(folder_path)
        print(f"Files in directory: {file_list}")

        for filename in file_list:
            if not filename.endswith('.csv'):
                continue

            df, results = process_file(os.path.join(folder_path, filename), contamination, models)

            if df is None or results is None:
                print(f"Skipping file {filename} due to processing error")
                continue

            if 'helios' in dataset_type:
                user_key = df['user key'].iloc[0]
            else:
                user_key = filename.split('.')[0]

            print(f"Processing data for file: {filename}")
            print(f"Total data points: {len(df)}")
            for model_name, count in results.items():
                print(f"{model_name} validated anomalies: {count}")

            plot_results(df, user_key, dataset_type, results)
    except Exception as e:
        print(f"Error in main function: {str(e)}")

# Ask for the dataset until one of the two supported names is entered
# (the answer is lower-cased before it is checked).
dataset_type = input("Enter dataset type (helios/queensland): ").lower()
while dataset_type not in ['helios', 'queensland']:
    dataset_type = input("Invalid input. Please enter 'helios' or 'queensland': ").lower()

# Set the folder path based on the dataset type
if dataset_type == 'helios':
    folder_path = '/content/drive/MyDrive/kentkart/water meter dataset/dataset/helios/user dataset/'
else:
    folder_path = '/content/drive/MyDrive/kentkart/water meter dataset/dataset/queensland/sorted_pulse'  # NOTE(review): the unzip log earlier in this notebook shows the archive extracting to 'sorted_pulsetot/' — confirm this folder name

print(f"Selected dataset type: {dataset_type}")
print(f"Folder path: {folder_path}")

# contamination=0.1 is passed explicitly here instead of main's default of 0.01.
try:
    main(folder_path, dataset_type, contamination=0.1)
except Exception as e:
    print(f"An error occurred: {str(e)}")

In [None]:
import os
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def load_and_preprocess_data(folder_path, dataset_type):
    """Build one feature table from every ``.csv`` file in ``folder_path``.

    Helios files (``dataset_type`` contains ``'helios'``) are read with ``;``
    as delimiter, ordered by their parsed ``datetime`` and use the existing
    ``diff`` column with a 24-row rolling window. All other files are read
    with the default delimiter, ordered by their ``time`` column and use the
    row-to-row difference of ``Value`` with a 7-row window.

    Args:
        folder_path: Directory whose ``.csv`` entries are loaded.
        dataset_type: Dataset name selecting the schema described above.

    Returns:
        DataFrame with the feature columns of all files stacked under a fresh
        integer index; missing values are replaced per file by the column mean.
    """
    is_helios = 'helios' in dataset_type
    if is_helios:
        window_size = 24
        features = ['diff', 'hour', 'day_of_week', 'rolling_mean', 'rolling_std']
    else:
        window_size = 7
        features = ['diff', 'day_of_week', 'rolling_mean', 'rolling_std']

    per_file_features = []
    for filename in os.listdir(folder_path):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(folder_path, filename)

        if is_helios:
            df = pd.read_csv(file_path, sep=';')
            df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%m/%Y %H:%M:%S')
            df = df.sort_values('datetime')
        else:
            df = pd.read_csv(file_path)
            df['datetime'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S')
            df = df.sort_values('time')

        # Calendar features
        stamps = df['datetime'].dt
        df['hour'] = stamps.hour
        df['day_of_week'] = stamps.dayofweek

        # Helios already ships 'diff'; otherwise derive it from 'Value'
        if not is_helios:
            df['diff'] = df['Value'].diff()

        windowed = df['diff'].rolling(window=window_size)
        df['rolling_mean'] = windowed.mean()
        df['rolling_std'] = windowed.std()
        df['z_score'] = (df['diff'] - df['rolling_mean']) / df['rolling_std']

        # Fill gaps (incomplete windows, first difference) with the column mean
        selected = df[features]
        df[features] = selected.fillna(selected.mean())

        per_file_features.append(df[features])

    return pd.concat(per_file_features, ignore_index=True)

def train_models(X, contamination=0.01):
    """Standardize ``X`` and fit four outlier detectors on the scaled data.

    Args:
        X: Feature table used both to fit the scaler and to train the models.
        contamination: ``contamination`` setting for every detector.

    Returns:
        ``(models, scaler)``: a dict mapping the names ``'IForest'``,
        ``'KNN'``, ``'LOF'`` and ``'AutoEncoder'`` to the fitted detectors, and
        the fitted ``StandardScaler``.
    """
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(X)

    # All detectors are created first, then trained in this order
    models = dict(
        IForest=IForest(contamination=contamination, random_state=42),
        KNN=KNN(contamination=contamination, n_neighbors=5),
        LOF=LOF(contamination=contamination),
        AutoEncoder=AutoEncoder(contamination=contamination),
    )

    for name in models:
        print(f"Training {name}...")
        models[name].fit(scaled_features)

    return models, scaler

def predict_anomalies(df, models, scaler, dataset_type):
    """Score ``df`` with every fitted model and validate the hits by z-score.

    For each model three columns are written into ``df`` (in place):
    ``<name>_anomaly_score``, ``<name>_is_anomaly`` and
    ``<name>_is_validated_anomaly``. A hit is validated when the model label
    is 1 and the absolute ``z_score`` of the row is greater than 3.

    Args:
        df: Frame containing the feature columns and ``z_score``.
        models: Mapping ``name -> detector`` with ``decision_function`` and
            ``predict``.
        scaler: Object whose ``transform`` is applied to the feature matrix.
        dataset_type: Dataset name; Helios uses ``hour`` as an extra feature.

    Returns:
        ``(df, results)`` where ``results`` maps each model name to its number
        of validated anomalies.
    """
    z_score_threshold = 3

    if 'helios' in dataset_type:
        features = ['diff', 'hour', 'day_of_week', 'rolling_mean', 'rolling_std']
    else:
        features = ['diff', 'day_of_week', 'rolling_mean', 'rolling_std']

    X_scaled = scaler.transform(df[features].values)

    results = {}
    for name, detector in models.items():
        score_col = f'{name}_anomaly_score'
        label_col = f'{name}_is_anomaly'
        validated_col = f'{name}_is_validated_anomaly'

        df[score_col] = detector.decision_function(X_scaled)
        df[label_col] = detector.predict(X_scaled)
        df[validated_col] = df[label_col].eq(1) & df['z_score'].abs().gt(z_score_threshold)
        results[name] = df[validated_col].sum()

    return df, results

def plot_results(df, user_key, dataset_type, results):
    """Plot the ``diff`` series and overlay each model's validated anomalies.

    Args:
        df: Frame with ``diff``, the time column (``datetime`` for Helios,
            ``time`` otherwise) and one ``<name>_is_validated_anomaly`` column
            per key of ``results``.
        user_key: Identifier shown in the figure title.
        dataset_type: Dataset name; selects the time column.
        results: Mapping whose keys are the model names to overlay.
    """
    time_column = 'datetime' if 'helios' in dataset_type else 'time'

    plt.figure(figsize=(12, 6))
    plt.plot(df[time_column], df['diff'], label='Consumption', alpha=0.5)

    # One scatter layer per model
    for model_name in results:
        flagged = df[df[f'{model_name}_is_validated_anomaly']]
        plt.scatter(flagged[time_column], flagged['diff'], label=f'{model_name} Validated Anomalies')

    plt.title(f'Validated Anomalies for {user_key}')
    plt.xlabel('DateTime')
    plt.ylabel('Consumption Difference')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def main(folder_path, dataset_type, contamination=0.01):
    """Train the detectors on the whole folder, then score and plot every CSV in it.

    Args:
        folder_path: Directory containing the ``.csv`` files to score.
        dataset_type: When it contains ``'helios'`` each file is read as a
            ';'-separated CSV with ``datetime``, ``diff`` and ``user key``
            columns and a 24-row rolling window is used. Otherwise the file is
            read with the default separator, ``time`` is parsed as the
            timestamp, ``diff`` is the first difference of ``Value`` and the
            rolling window is 7 rows.
        contamination: Forwarded to ``train_models``.

    Files whose feature columns still contain NaN after mean-filling (for
    example files with fewer rows than the rolling window) are skipped with a
    message instead of being passed to the detectors.
    """
    is_helios = 'helios' in dataset_type
    window_size = 24 if is_helios else 7
    # Columns that can hold NaN: the first difference and the rolling warm-up rows.
    nan_prone = ['diff', 'rolling_mean', 'rolling_std']

    # Load and preprocess all data
    print("Loading and preprocessing data...")
    all_data = load_and_preprocess_data(folder_path, dataset_type)

    # Train models on all data
    print("Training models on all data...")
    models, scaler = train_models(all_data, contamination)

    # Predict anomalies for each file
    for filename in os.listdir(folder_path):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(folder_path, filename)

        # Load and preprocess individual file
        if is_helios:
            df = pd.read_csv(file_path, sep=';')
            df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%m/%Y %H:%M:%S')
        else:
            df = pd.read_csv(file_path)
            df['datetime'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S')
        # Order by the parsed timestamp so diffs and rolling windows follow time.
        df = df.sort_values('datetime')

        df['hour'] = df['datetime'].dt.hour
        df['day_of_week'] = df['datetime'].dt.dayofweek

        if not is_helios:
            df['diff'] = df['Value'].diff()
        df['rolling_mean'] = df['diff'].rolling(window=window_size).mean()
        df['rolling_std'] = df['diff'].rolling(window=window_size).std()

        # Computed before filling, so warm-up rows keep a NaN z-score and can
        # never pass the ``abs(z_score) > 3`` validation.
        df['z_score'] = (df['diff'] - df['rolling_mean']) / df['rolling_std']

        # The leading rows are NaN here; NaN features are typically rejected by
        # the detectors' input validation, so fill them with the column mean.
        df[nan_prone] = df[nan_prone].fillna(df[nan_prone].mean())
        if df[nan_prone].isna().any().any():
            print(f"Skipping {filename}: features still contain NaN after filling")
            continue

        # Predict anomalies
        df, results = predict_anomalies(df, models, scaler, dataset_type)

        user_key = df['user key'].iloc[0] if is_helios else filename

        print(f"\nProcessing data for file: {filename}")
        print(f"Total data points: {len(df)}")
        for model_name, count in results.items():
            print(f"{model_name} validated anomalies: {count}")

        # Plot the results
        plot_results(df, user_key, dataset_type, results)

# Get user input for dataset type
dataset_type = input("Enter dataset type (helios/queensland): ").lower()
while dataset_type not in ['helios', 'queensland']:
    dataset_type = input("Invalid input. Please enter 'helios' or 'queensland': ").lower()

# Set the folder path based on the dataset type
if dataset_type == 'helios':
    folder_path = '/content/drive/MyDrive/kentkart/water meter dataset/dataset/helios/user dataset/'
else:
    # Queensland CSVs live in the sorted_pulsetot folder unpacked from
    # sorted_queensland.zip; the helios folder has a different file layout.
    folder_path = '/content/drive/MyDrive/kentkart/water meter dataset/dataset/queensland/sorted_pulsetot/'

main(folder_path, dataset_type, contamination=0.01)

TRAIN WITH WHOLE DATASET

In [4]:
%cd /content/
%ls

/content
[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
import os
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler
import joblib

def process_file(file_path, contamination=0.01):
    """Read one ';'-separated CSV and return its standardized feature matrix.

    The features are ``diff``, ``hour``, ``day_of_week`` and the 24-row rolling
    mean and standard deviation of ``diff``. Missing feature values are
    replaced by the column mean, then the matrix is standardized with a
    ``StandardScaler`` fitted on this file alone.

    Args:
        file_path: Path of the CSV file; it must provide ``datetime``
            (``%d/%m/%Y %H:%M:%S``) and ``diff`` columns.
        contamination: Not used by this function.

    Returns:
        The scaled feature array, or ``None`` when anything fails (the error
        is printed rather than raised).
    """
    print(f"Processing file: {file_path}")
    feature_cols = ['diff', 'hour', 'day_of_week', 'rolling_mean', 'rolling_std']
    rolling_window = 24
    try:
        # Load the file and put the rows in chronological order
        frame = pd.read_csv(file_path, sep=';')
        frame['datetime'] = pd.to_datetime(frame['datetime'], format='%d/%m/%Y %H:%M:%S')
        frame = frame.sort_values('datetime')

        print(f"DataFrame shape: {frame.shape}")
        print(f"Columns: {frame.columns}")

        # Calendar features
        timestamps = frame['datetime'].dt
        frame['hour'] = timestamps.hour
        frame['day_of_week'] = timestamps.dayofweek

        # A file without a 'diff' column raises KeyError here and is reported below
        consumption = frame['diff']
        windowed = consumption.rolling(window=rolling_window)
        frame['rolling_mean'] = windowed.mean()
        frame['rolling_std'] = windowed.std()

        # Z-score of each reading against its rolling window
        frame['z_score'] = (consumption - frame['rolling_mean']) / frame['rolling_std']

        # Replace missing feature values with the column mean
        frame[feature_cols] = frame[feature_cols].fillna(frame[feature_cols].mean())

        # A column that is entirely NaN has a NaN mean and therefore stays NaN
        if frame[feature_cols].isna().any().any():
            raise ValueError("There are still NaN values after filling")

        # Standardize with a scaler fitted on this file only
        return StandardScaler().fit_transform(frame[feature_cols].values)
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None

def train_and_save_models(folder_path, dataset_type, contamination=0.01):
    """Fit the detectors on every CSV in a folder and save each one with joblib.

    Each ``.csv`` file in ``folder_path`` is turned into a feature matrix by
    ``process_file``; the matrices are stacked row-wise, every detector is
    fitted on the stacked matrix and written to ``<name>_model.pkl`` in the
    current working directory.

    Args:
        folder_path: Directory that is scanned for ``.csv`` files.
        dataset_type: Accepted for interface compatibility; it is not used by
            this function.
        contamination: Passed to each detector and to ``process_file``.

    Returns:
        None. Errors are printed rather than raised, and a message is printed
        when no file produced any features (nothing is saved in that case).
    """
    print(f"Starting training with folder_path: {folder_path}")

    # Define models (LOF and AutoEncoder are currently disabled)
    models = {
        'IForest': IForest(contamination=contamination, random_state=42),
        'KNN': KNN(contamination=contamination, n_neighbors=5),

        #'LOF': LOF(contamination=contamination),
        #'AutoEncoder': AutoEncoder(contamination=contamination)
    }

    try:
        file_list = os.listdir(folder_path)
        print(f"Files in directory: {file_list}")

        X_combined = []

        for filename in file_list:
            if not filename.endswith('.csv'):
                continue
            file_path = os.path.join(folder_path, filename)

            # process_file returns None for files it could not turn into features
            X_scaled = process_file(file_path, contamination)
            if X_scaled is not None:
                X_combined.append(X_scaled)

        if not X_combined:
            # Make the "nothing was trained or saved" outcome visible
            print("No data to process.")
            return

        X_combined = np.vstack(X_combined)
        print(f"Combined feature matrix shape: {X_combined.shape}")

        for model_name, model in models.items():
            model.fit(X_combined)
            joblib.dump(model, f'{model_name}_model.pkl')
            print(f"Saved {model_name} model")

    except Exception as e:
        print(f"Error in training and saving models: {str(e)}")

# Ask which dataset to train on; keep prompting until a supported name is given
dataset_type = input("Enter dataset type (helios/queensland): ").lower()
while dataset_type != 'helios' and dataset_type != 'queensland':
    dataset_type = input("Invalid input. Please enter 'helios' or 'queensland': ").lower()

# Map the chosen dataset to its folder on the mounted drive
folder_path = (
    '/content/drive/MyDrive/kentkart/water meter dataset/dataset/helios/user dataset/'
    if dataset_type == 'helios'
    else '/content/drive/MyDrive/kentkart/water meter dataset/dataset/queensland/sorted_pulsetot/'
)

print(f"Selected dataset type: {dataset_type}")
print(f"Folder path: {folder_path}")

# Train and save the models; any error that escapes is reported, not raised
try:
    train_and_save_models(folder_path, dataset_type, contamination=0.1)
except Exception as exc:
    print(f"An error occurred: {str(exc)}")


In [2]:
import os
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler
import joblib

def process_file(file_path):
    """Read one ';'-separated CSV and return its standardized feature matrix.

    Features are ``diff``, ``hour``, ``day_of_week`` and the 24-row rolling
    mean / standard deviation of ``diff``. Missing feature values are replaced
    by the column mean and the result is standardized with a ``StandardScaler``
    fitted on this file alone.

    Args:
        file_path: Path of the CSV file; it must provide ``datetime``
            (``%d/%m/%Y %H:%M:%S``) and ``diff`` columns.

    Returns:
        The scaled feature array, or ``None`` when anything fails (the error
        is printed rather than raised).
    """
    print(f"Processing file: {file_path}")
    try:
        data = pd.read_csv(file_path, sep=';')
        data['datetime'] = pd.to_datetime(data['datetime'], format='%d/%m/%Y %H:%M:%S')
        data = data.sort_values('datetime')

        print(f"DataFrame shape: {data.shape}")
        print(f"Columns: {data.columns}")

        data['hour'] = data['datetime'].dt.hour
        data['day_of_week'] = data['datetime'].dt.dayofweek

        # A file without a 'diff' column raises KeyError here and is reported below
        usage = data['diff']
        trailing = usage.rolling(window=24)
        data['rolling_mean'] = trailing.mean()
        data['rolling_std'] = trailing.std()
        data['z_score'] = (usage - data['rolling_mean']) / data['rolling_std']

        model_inputs = ['diff', 'hour', 'day_of_week', 'rolling_mean', 'rolling_std']
        column_means = data[model_inputs].mean()
        data[model_inputs] = data[model_inputs].fillna(column_means)

        # An all-NaN column has a NaN mean, so it survives the fill
        remaining_nans = data[model_inputs].isna().sum().sum()
        if remaining_nans > 0:
            raise ValueError("There are still NaN values after filling")

        standardized = StandardScaler().fit_transform(data[model_inputs].values)
        print(f"X_scaled shape: {standardized.shape}")
        return standardized
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None

def load_models(models_dir):
    """Load every ``*_model.pkl`` file found directly inside ``models_dir``.

    Args:
        models_dir: Directory to scan (not recursive).

    Returns:
        dict: Model name (the file name with ``_model.pkl`` removed) mapped to
        the object returned by ``joblib.load``. Empty when nothing matches.
    """
    suffix = '_model.pkl'
    loaded = {}
    for entry in os.listdir(models_dir):
        if not entry.endswith(suffix):
            continue
        name = entry.replace(suffix, '')
        full_path = os.path.join(models_dir, entry)
        # joblib.load unpickles the file, so only point this at trusted model files
        loaded[name] = joblib.load(full_path)
        print(f"Loaded {name} model from {full_path}")
    return loaded

def use_models_on_data(folder_path, models_dir):
    """Run every saved model over the stacked features of a folder's CSV files.

    Args:
        folder_path: Directory scanned for ``.csv`` files; each one is turned
            into a feature matrix by ``process_file``.
        models_dir: Directory from which ``load_models`` loads the models.

    Returns:
        None. Predictions are printed; errors raised after the models are
        loaded are printed rather than propagated.
    """
    print(f"Starting with folder_path: {folder_path}")

    detectors = load_models(models_dir)

    try:
        entries = os.listdir(folder_path)
        print(f"Files in directory: {entries}")

        matrices = []
        for entry in entries:
            if not entry.endswith('.csv'):
                continue
            features = process_file(os.path.join(folder_path, entry))
            # process_file returns None for files it could not process
            if features is not None:
                matrices.append(features)

        if not matrices:
            print("No data to process.")
            return

        stacked = np.vstack(matrices)
        print(f"Combined feature matrix shape: {stacked.shape}")

        for name, detector in detectors.items():
            print(f"Using {name} model...")
            labels = detector.predict(stacked)
            print(f"Predictions using {name} model: {labels}")

    except Exception as err:
        print(f"Error in processing data with models: {str(err)}")

# Prompt for the dataset name until one of the two supported values is entered
dataset_type = input("Enter dataset type (helios/queensland): ").lower()
while dataset_type != 'helios' and dataset_type != 'queensland':
    dataset_type = input("Invalid input. Please enter 'helios' or 'queensland': ").lower()

# Choose the data folder that belongs to the selected dataset
folder_path = (
    '/content/drive/MyDrive/kentkart/water meter dataset/dataset/helios/user dataset/'
    if dataset_type == 'helios'
    else '/content/drive/MyDrive/kentkart/water meter dataset/dataset/queensland/sorted_pulsetot/'
)

print(f"Selected dataset type: {dataset_type}")
print(f"Folder path: {folder_path}")

# Saved *_model.pkl files are looked up in the current working directory
models_dir = './'

# Apply the saved models; any error that escapes is reported, not raised
try:
    use_models_on_data(folder_path, models_dir)
except Exception as exc:
    print(f"An error occurred: {str(exc)}")

ModuleNotFoundError: No module named 'pyod'

In [46]:
%cd /content/

/content


In [None]:
from google.colab import files
import os

# The training cell currently saves only the IForest and KNN models (LOF and
# AutoEncoder are commented out there), so download just the pickles that
# exist instead of failing on the first missing file.
for model_name in ('IForest', 'KNN', 'LOF', 'AutoEncoder'):
    model_file = f'/content/{model_name}_model.pkl'
    if os.path.exists(model_file):
        files.download(model_file)
    else:
        print(f"Skipping {model_file}: file not found")
