In [1]:
# Cell 1
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import io
import os
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import random
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import load_model
from keras.utils import custom_object_scope
from tensorflow.keras.layers import Bidirectional, SimpleRNN, Dense

In [7]:
# Cell 2
# Updated function
# Updated function without saving data inside
def upload_and_process_files(directory, num_files_to_use=None):
    svc_files = [f for f in os.listdir(directory) if f.endswith('.svc')]
    
    # If num_files_to_use is specified, only take that many files sequentially
    if num_files_to_use:
        svc_files = svc_files[:num_files_to_use]  # Take the first num_files_to_use files

    data_frames = []  # Processed data after scaling
    original_data_frames = []  # Save the original unscaled data
    scalers = []
    input_filenames = []  # List to store input filenames

    num_files = len(svc_files)
    

    for i, filename in enumerate(svc_files):
        file_path = os.path.join(directory, filename)
        input_filenames.append(filename)  # Store the filename
        df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
        df.columns = ['x', 'y', 'timestamp', 'pen_status', 'pressure', 'azimuth', 'altitude']
        
        # Modify timestamp to start from 0
        df['timestamp'] = (df['timestamp'] - df['timestamp'].min()).round().astype(int)
        
        # Keep a copy of the original data before scaling
        original_data_frames.append(df.copy())  # Save the original unmodified data
        
        # Process the data for use in the model
        df = df.iloc[:, [0, 1, 2, 3, 4, 5, 6]] 
        data_frames.append(df)
        scaler = MinMaxScaler()
        normalized_data = scaler.fit_transform(df[['x', 'y', 'timestamp']])
        scalers.append(scaler)


    processed_data = [np.column_stack((scaler.transform(df[['x', 'y', 'timestamp']]), df['pen_status'].values)) 
                      for df, scaler in zip(data_frames, scalers)]
    avg_data_points = int(np.mean([df.shape[0] for df in data_frames]))

    return data_frames, processed_data, scalers, avg_data_points, input_filenames, original_data_frames  # Return original data

# Save data outside the function
def save_original_data(data_frames, input_filenames, output_folder='original_absolute'):
    os.makedirs(output_folder, exist_ok=True)
    for df, filename in zip(data_frames, input_filenames):
        save_path = os.path.join(output_folder, filename)
        df.to_csv(save_path, sep=' ', index=False, header=False)

directory = '../../all_datasets/emothaw'  # Directory where the .svc files are stored
num_files_to_use = 903
data_frames, processed_data, scalers, avg_data_points, input_filenames, original_data_frames = upload_and_process_files(directory, num_files_to_use)

# Save the original data after processing
save_original_data(original_data_frames, input_filenames)

# Store the name of the first file for use in Cell 4
input_filename = input_filenames[0] if input_filenames else 'processed_data'
print(f"Number of processed files: {len(processed_data)}")
print(f"Average number of data points: {avg_data_points}")


  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whitespace=True)
  df = pd.read_csv(file_path, skiprows=1, header=None, delim_whi

Number of processed files: 903
Average number of data points: 2939


In [None]:
from glob import glob
import os
import re
import numpy as np
import pandas as pd

def read_svc_file(file_path):
    return pd.read_csv(file_path, sep=' ', header=None, 
                       names=['x', 'y', 'timestamp', 'pen_status', 'pressure', 'azimuth', 'altitude'])

def calculate_nrmse(original, predicted):
    if original.shape != predicted.shape:
        raise ValueError("The shapes of the original and predicted datasets must match.")
    mse = np.mean((original - predicted) ** 2)
    rmse = np.sqrt(mse)
    nrmse = rmse / (np.max(original) - np.min(original))
    return nrmse

def get_matching_augmented_files(original_file, augmented_folder):
    base_name = os.path.basename(original_file)
    base_name_without_ext = os.path.splitext(base_name)[0]
    pattern = os.path.join(augmented_folder, f"synthetic_{base_name_without_ext}*.svc")
    matching_files = glob(pattern)
    
    def sort_key(filename):
        match = re.search(r'\((\d+)\)', filename)
        return int(match.group(1)) if match else -1

    return sorted(matching_files, key=sort_key)

def process_files(original_folder, augmented_folder, input_filenames):
    nrmse_results = {}

    input_filenames_set = {os.path.splitext(os.path.basename(filename))[0] for filename in input_filenames}

    for original_file in glob(os.path.join(original_folder, "*.svc")):
        base_name = os.path.splitext(os.path.basename(original_file))[0]
        
        if base_name not in input_filenames_set:
            continue

        original_data = read_svc_file(original_file)
        matching_augmented_files = get_matching_augmented_files(original_file, augmented_folder)
        
        file_nrmse = []
        for augmented_file in matching_augmented_files:
            augmented_data = read_svc_file(augmented_file)
            
            min_length = min(len(original_data), len(augmented_data))
            original_array = original_data.iloc[:min_length].values
            augmented_array = augmented_data.iloc[:min_length].values

            nrmse = calculate_nrmse(original_array, augmented_array)
            file_nrmse.append(nrmse)

        nrmse_results[os.path.basename(original_file)] = file_nrmse

    return nrmse_results

def save_results_to_log(results, output_path):
    with open(output_path, 'w') as log_file:
        for original_file, nrmse_values in results.items():
            avg_nrmse = None
            if len(nrmse_values) > 1:
                avg_nrmse = np.mean(nrmse_values)

            nrmse_values_str = ", ".join([f"{nrmse:.4f}" for nrmse in nrmse_values])
            log_line = f"{original_file}: NRMSE = {nrmse_values_str}"
            if avg_nrmse is not None:
                log_line += f", Avg NRMSE: {avg_nrmse:.4f}"

            log_file.write(log_line + '\n')
            print(log_line)

original_folder = "original_absolute"
augmented_folder = "../../all_datasets/vaegan_augmented #Directory ng augmented data
output_log_file = "nrmse_results_log.txt"

# Process files and save results
results = process_files(original_folder, augmented_folder, input_filenames)
save_results_to_log(results, output_log_file)

# Calculate overall average NRMSE and standard deviation
all_nrmse_values = [nrmse for nrmse_list in results.values() for nrmse in nrmse_list]
overall_avg_nrmse = np.mean(all_nrmse_values) if all_nrmse_values else None
overall_std_nrmse = np.std(all_nrmse_values) if all_nrmse_values else None

# Print and save the overall average NRMSE and standard deviation
with open(output_log_file, 'a') as log_file:
    if overall_avg_nrmse is not None and overall_std_nrmse is not None:
        log_line = f"\nOverall Average NRMSE: {overall_avg_nrmse:.4f}, Standard Deviation: {overall_std_nrmse:.4f}"
    else:
        log_line = "No NRMSE values calculated."

    log_file.write(log_line + '\n')
    print(log_line)


collection1u00001s00001_hw00001.svc: NRMSE = 0.0082
collection1u00001s00001_hw00002.svc: NRMSE = 0.0114
collection1u00001s00001_hw00003.svc: NRMSE = 0.0203
collection1u00001s00001_hw00004.svc: NRMSE = 0.0256
collection1u00001s00001_hw00005.svc: NRMSE = 0.0048
collection1u00001s00001_hw00006.svc: NRMSE = 0.0232
collection1u00001s00001_hw00007.svc: NRMSE = 0.0142
collection1u00002s00001_hw00001.svc: NRMSE = 0.0112
collection1u00002s00001_hw00002.svc: NRMSE = 0.0201
collection1u00002s00001_hw00003.svc: NRMSE = 0.0286
collection1u00002s00001_hw00004.svc: NRMSE = 0.0028
collection1u00002s00001_hw00005.svc: NRMSE = 0.0030
collection1u00002s00001_hw00006.svc: NRMSE = 0.0243
collection1u00002s00001_hw00007.svc: NRMSE = 0.0141
collection1u00003s00001_hw00001.svc: NRMSE = 0.0160
collection1u00003s00001_hw00002.svc: NRMSE = 0.0231
collection1u00003s00001_hw00003.svc: NRMSE = 0.0297
collection1u00003s00001_hw00004.svc: NRMSE = 0.0016
collection1u00003s00001_hw00005.svc: NRMSE = 0.0018
collection1u

In [None]:
import os
from glob import glob
import numpy as np
import logging
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Configure logging
log_filename = "posthoc_discriminative_score.txt"
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[
        logging.FileHandler(log_filename, mode="w", encoding="utf-8"),
        logging.StreamHandler()
    ]
)

def process_files(original_folder, augmented_folder, input_filenames):
    all_real_data = []
    all_synthetic_data = []
    input_filenames_set = {os.path.splitext(os.path.basename(filename))[0] for filename in input_filenames}
    
    for original_file in glob(os.path.join(original_folder, "*.svc")):
        base_name = os.path.splitext(os.path.basename(original_file))[0]
        
        if base_name not in input_filenames_set:
            continue
        
        original_data = read_svc_file(original_file)
        all_real_data.append(original_data.values)
        
        matching_augmented_files = get_matching_augmented_files(original_file, augmented_folder)
        
        for augmented_file in matching_augmented_files:
            augmented_data = read_svc_file(augmented_file)
            all_synthetic_data.append(augmented_data.values)
    
    return np.concatenate(all_real_data), np.concatenate(all_synthetic_data)

def create_lstm_classifier(input_shape):
    model = Sequential([
        LSTM(64, input_shape=input_shape, return_sequences=True),
        LSTM(32),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def prepare_data_for_lstm(real_data, synthetic_data):
    n_features = min(real_data.shape[1], synthetic_data.shape[1])
    real_data_trimmed = real_data[:, :n_features]
    synthetic_data_trimmed = synthetic_data[:, :n_features]
    
    X = np.vstack((real_data_trimmed, synthetic_data_trimmed))
    y = np.concatenate((np.ones(len(real_data)), np.zeros(len(synthetic_data))))
    return X, y

def post_hoc_discriminative_score(real_data, synthetic_data, n_splits=10):
    X, y = prepare_data_for_lstm(real_data, synthetic_data)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    
    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        logging.info(f"\nFold {fold}/{n_splits}:")
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
        X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
        
        model = create_lstm_classifier((1, X_train.shape[2]))
        
        # Train model and log epoch progress
        history = model.fit(X_train, y_train, epochs=5, batch_size=512, verbose=0)
        for epoch, metrics in enumerate(history.history["accuracy"], start=1):
            logging.info(f"  Epoch {epoch}: Accuracy = {metrics:.4f}")
        
        y_pred = (model.predict(X_test) > 0.5).astype(int)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
    
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    logging.info(f"\nMean accuracy: {mean_accuracy:.4f}, Standard Deviation: {std_accuracy:.4f}")
    return mean_accuracy, std_accuracy

# Specify your folders
original_folder = "original_absolute"
augmented_folder = "../../all_datasets/vaegan_augmented" #Directory ng augmented data


# Process files
real_data, synthetic_data = process_files(original_folder, augmented_folder, input_filenames)

# Compute post-hoc discriminative score
mean_accuracy, std_accuracy = post_hoc_discriminative_score(real_data, synthetic_data)



Fold 1/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.5350
  Epoch 2: Accuracy = 0.5425
  Epoch 3: Accuracy = 0.5364
  Epoch 4: Accuracy = 0.5436
  Epoch 5: Accuracy = 0.5249


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step



Fold 2/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.5191
  Epoch 2: Accuracy = 0.5397
  Epoch 3: Accuracy = 0.5586
  Epoch 4: Accuracy = 0.5496
  Epoch 5: Accuracy = 0.5589


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step



Fold 3/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.5326
  Epoch 2: Accuracy = 0.5589
  Epoch 3: Accuracy = 0.5312
  Epoch 4: Accuracy = 0.5246
  Epoch 5: Accuracy = 0.5454


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step



Fold 4/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.5150
  Epoch 2: Accuracy = 0.5436
  Epoch 3: Accuracy = 0.5380
  Epoch 4: Accuracy = 0.5483
  Epoch 5: Accuracy = 0.5576


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step



Fold 5/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.5193
  Epoch 2: Accuracy = 0.5290
  Epoch 3: Accuracy = 0.5380
  Epoch 4: Accuracy = 0.5543
  Epoch 5: Accuracy = 0.5347


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step



Fold 6/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.5036
  Epoch 2: Accuracy = 0.5119
  Epoch 3: Accuracy = 0.5273
  Epoch 4: Accuracy = 0.5418
  Epoch 5: Accuracy = 0.5127


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step



Fold 7/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.4860
  Epoch 2: Accuracy = 0.5124
  Epoch 3: Accuracy = 0.5326
  Epoch 4: Accuracy = 0.5243
  Epoch 5: Accuracy = 0.5362


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step



Fold 8/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.4906
  Epoch 2: Accuracy = 0.5315
  Epoch 3: Accuracy = 0.5354
  Epoch 4: Accuracy = 0.5244
  Epoch 5: Accuracy = 0.5236


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step



Fold 9/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.5220
  Epoch 2: Accuracy = 0.5393
  Epoch 3: Accuracy = 0.5485
  Epoch 4: Accuracy = 0.5555
  Epoch 5: Accuracy = 0.5494


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step



Fold 10/10:
  super().__init__(**kwargs)
  Epoch 1: Accuracy = 0.5158
  Epoch 2: Accuracy = 0.5296
  Epoch 3: Accuracy = 0.5275
  Epoch 4: Accuracy = 0.5558
  Epoch 5: Accuracy = 0.5435


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step



Mean accuracy: 0.5497, Standard Deviation: 0.0223


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from glob import glob
import os
import time

def write_to_log(message):
    """Write a message to the log file."""
    log_file = "posthoc_predictive_score.txt"
    with open(log_file, "a") as f:
        f.write(message + "\n")

def read_svc_file(file_path):
    """Read an SVC file and return its contents as a DataFrame."""
    try:
        df = pd.read_csv(file_path, sep=' ', header=None)
        if df.empty:
            return None
        return df
    except Exception as e:
        return None

def get_matching_augmented_file(original_file, augmented_folder):
    """Get matching augmented file for a given original file."""
    base_name = os.path.splitext(os.path.basename(original_file))[0]
    augmented_file = os.path.join(augmented_folder, f"synthetic_{base_name}.svc")
    return augmented_file if os.path.exists(augmented_file) else None

def process_files(original_folder, augmented_folder):
    """Process original and augmented files and prepare them for analysis."""
    paired_data = []
    for original_file in glob(os.path.join(original_folder, "*.svc")):
        augmented_file = get_matching_augmented_file(original_file, augmented_folder)
        if augmented_file is None:
            continue
            
        original_data = read_svc_file(original_file)
        augmented_data = read_svc_file(augmented_file)
        
        if (original_data is not None and augmented_data is not None and 
            original_data.shape[1] == augmented_data.shape[1]):
            paired_data.append((original_file, augmented_file))

    if not paired_data:
        raise ValueError("No valid data pairs found for processing")
    return paired_data

def prepare_sequences(data, time_steps=4):
    """Prepare sequences for prediction, using all but last timestep as input."""
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:i + time_steps - 1].flatten())
        y.append(data[i + time_steps - 1])
    return np.array(X), np.array(y)

def evaluate_synthetic_data(original_file, synthetic_file):
    """Evaluate a single synthetic dataset against its original counterpart."""
    original_df = read_svc_file(original_file)
    synthetic_df = read_svc_file(synthetic_file)

    if original_df is None or synthetic_df is None:
        return None

    integer_columns = [0, 1, 4, 5]
    scaler = MinMaxScaler()
    original_scaled = scaler.fit_transform(original_df)
    synthetic_scaled = scaler.transform(synthetic_df)

    start_time = time.time()
    X_train, y_train = prepare_sequences(synthetic_scaled)
    X_test, y_test = prepare_sequences(original_scaled)

    if len(X_train) == 0 or len(X_test) == 0:
        return None

    mapes_per_dim = []
    for dim in range(y_train.shape[1]):
        model = XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42
        )

        model.fit(X_train, y_train[:, dim], verbose=False)
        y_pred = model.predict(X_test)

        y_true_original = original_df.iloc[len(original_df)-len(y_test):, dim].values

        temp_data = np.zeros((len(y_pred), original_df.shape[1]))
        temp_data[:, dim] = y_pred
        y_pred_original = scaler.inverse_transform(temp_data)[:, dim]

        if dim in integer_columns:
            y_pred_original = np.round(y_pred_original)

        if dim == 3:
            y_pred_original = np.round(y_pred_original)

        y_pred_original = np.round(y_pred_original).astype(int)
        y_true_original = np.round(y_true_original).astype(int)

        errors = []
        for true, pred in zip(y_true_original, y_pred_original):
            if true == 0:
                errors.append(100 if pred != 0 else 0)
            else:
                errors.append(min(abs((true - pred) / true) * 100, 100))

        mapes_per_dim.append(np.mean(errors))

    elapsed_time = time.time() - start_time
    return np.mean(mapes_per_dim), elapsed_time

def post_hoc_predictive_score(original_folder, augmented_folder):
    """Calculate post-hoc predictive score for all pairs of original and synthetic data."""
    try:
        paired_files = process_files(original_folder, augmented_folder)
        all_results = []
        
        for original_file, synthetic_file in paired_files:
            result = evaluate_synthetic_data(original_file, synthetic_file)
            if result is not None:
                mape, elapsed_time = result
                file_name = os.path.basename(original_file)
                log_message = (f"Processing: {file_name} Completed in {elapsed_time:.1f}s "
                               f"(MAPE: {mape:.2f}%)")
                print(log_message)
                write_to_log(log_message)
                all_results.append(mape)
        
        if not all_results:
            raise ValueError("No valid results calculated")
        
        summary_message = (f"----------------------------------------\n"
                           f"Overall Results:\n"
                           f"Average MAPE: {np.mean(all_results):.2f}%, "
                           f"Standard Deviation: {np.std(all_results):.2f}%")
        print(summary_message)
        write_to_log(summary_message)
        
        return np.mean(all_results), np.std(all_results)
        
    except Exception as e:
        error_message = f"Error: {str(e)}"
        print(error_message)
        write_to_log(error_message)
        return None, None

def main():
    try:
        original_folder = "original_absolute"
        augmented_folder = "../../all_datasets/vaegan_augmented" #Directory ng augmented data
        post_hoc_predictive_score(original_folder, augmented_folder)
            
    except Exception as e:
        error_message = f"Error: {str(e)}"
        print(error_message)
        write_to_log(error_message)

if __name__ == "__main__":
    main()


Processing: collection1u00001s00001_hw00001.svc Completed in 2.6s (MAPE: 7.16%)
Processing: collection1u00001s00001_hw00002.svc Completed in 4.1s (MAPE: 8.86%)
----------------------------------------
Overall Results:
Average MAPE: 8.01%, Standard Deviation: 0.85%
