In [13]:
from pathlib import Path
import os
from util.VisualizeDataset import VisualizeDataset
STUDENT = 'mmr497'

OUTLIERS_PATH = Path('./outliers2/')
INTERMEDIATE_PATH = Path('./intermediate_datafiles/')
os.chdir(f'/home/{STUDENT}/')

In [5]:
import pandas as pd
import numpy as np
import sys
from Chapter3.OutlierDetection import DistanceBasedOutlierDetection
from Chapter3.ImputationMissingValues import ImputationMissingValues as MisVal

In [74]:
intermed_df = pd.read_parquet(f'{INTERMEDIATE_PATH}/ML4QS_combined_results_2.parquet')

sensor_prefixes = [
    'acc_phone_',
    'lin_acc_phone_',
    'gyr_phone_',
    'mag_phone_',
]

folder = 'intermediate_datafiles'
output_folder = 'outliers2'
vehicles = ['train', 'bus', 'metro', 'tram', 'car','walking']

files = [f for f in os.listdir(folder) if f.endswith('.parquet')]
data_by_vehicle = {vehicle: [] for vehicle in vehicles}

for vehicle in vehicles:
    data_by_vehicle[vehicle] = intermed_df[~intermed_df[f'label{vehicle}'].isna()]

for instance_id in intermed_df['id'].unique():
    # Create a mask for the current instance
    instance_mask = intermed_df['id'] == instance_id
    ins_df = intermed_df[instance_mask]

    start_time = ins_df['timestamp'].iloc[0]
    end_time = ins_df['timestamp'].iloc[-1]

    print(start_time)

    start_time_plus_10s = start_time + pd.Timedelta(seconds=10)
    end_time_minus_10s = end_time - pd.Timedelta(seconds=10)

    time_mask = (intermed_df.timestamp >= start_time_plus_10s) & (intermed_df.timestamp <= end_time_minus_10s)

    intermed_df = intermed_df[~instance_mask | (instance_mask & time_mask)]

outlier_detector = DistanceBasedOutlierDetection()

cols_to_check_acc = ['acc_phone_X', 'acc_phone_Y', 'acc_phone_Z']
cols_to_check_lin_acc = ['lin_acc_phone_X', 'lin_acc_phone_Y', 'lin_acc_phone_Z']
cols_to_check_gyr = ['gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z']
cols_to_check_mag = ['mag_phone_X', 'mag_phone_Y', 'mag_phone_Z']
cols_to_check_accuracy = ['location_phone_Horizontal Accuracy', 'location_phone_Vertical Accuracy']

d_function = 'euclidean'
vehicle_params = {
    'train': {'dmin': 0.15, 'fmin': 0.1},
    'bus': {'dmin': 0.5, 'fmin': 0.1},
    'metro': {'dmin': 0.4, 'fmin': 0.1},
    'tram': {'dmin': 0.45, 'fmin': 0.1},
    'car': {'dmin': 0.1, 'fmin': 0.05},
    'walking': {'dmin': 0.65, 'fmin': 0.1},
}

Path(output_folder).mkdir(parents=True, exist_ok=True)

data_by_vehicle_buffer = {vehicle: [] for vehicle in vehicles}

for vehicle in vehicles:
    data_by_vehicle_buffer[vehicle] = intermed_df[~intermed_df[f'label{vehicle}'].isna()]

for vehicle, df in data_by_vehicle.items():
    raw_output_file = os.path.join(output_folder, f"{vehicle}_raw.parquet")
    df.reset_index().to_parquet(raw_output_file, version='2.6', allow_truncated_timestamps=True)
    print(f"Saved raw {vehicle} data to {raw_output_file}")


for vehicle, df in data_by_vehicle_buffer.items():
    print(f"Processing {vehicle} data...")

    vehicle_dmin = vehicle_params[vehicle]['dmin']
    vehicle_fmin = vehicle_params[vehicle]['fmin']

    # TODO: This is too extreme and marks nearly everything as an outlier.

    df = outlier_detector.simple_distance_based(df, cols=cols_to_check_acc, d_function=d_function, dmin=vehicle_dmin, fmin=vehicle_fmin)
    df.rename(columns={'simple_dist_outlier': f'simple_dist_outlier_acc'}, inplace=True)
    df = outlier_detector.simple_distance_based(df, cols=cols_to_check_lin_acc, d_function=d_function, dmin=vehicle_dmin, fmin=vehicle_fmin)
    df.rename(columns={'simple_dist_outlier': f'simple_dist_outlier_lin_acc'}, inplace=True)
    df = outlier_detector.simple_distance_based(df, cols=cols_to_check_gyr, d_function=d_function, dmin=vehicle_dmin, fmin=vehicle_fmin)
    df.rename(columns={'simple_dist_outlier': f'simple_dist_outlier_gyr'}, inplace=True)
    df = outlier_detector.simple_distance_based(df, cols=cols_to_check_mag, d_function=d_function, dmin=vehicle_dmin, fmin=vehicle_fmin)
    df.rename(columns={'simple_dist_outlier': f'simple_dist_outlier_mag'}, inplace=True)

    df_filtered = df

##################################################
##################################################

    output_file = os.path.join(output_folder, f"{vehicle}_filtered_norem.parquet")
    df_filtered.reset_index().to_parquet(output_file, version='2.6', allow_truncated_timestamps=True)
    print(f"Saved filtered {vehicle} data to {output_file}")

2025-06-06 17:01:46.208000
2025-06-04 11:41:49.154000
2025-06-05 06:50:20.206000
2025-06-06 16:55:53.445000
2025-06-06 17:16:37.687000
2025-06-07 11:19:22.599000
2025-06-04 13:10:41.555000
2025-06-04 17:18:50.211000
2025-06-05 14:30:41.740000
2025-06-04 17:34:24.100000
2025-06-04 13:15:49.935000
2025-06-05 06:29:26.426000
2025-06-04 11:51:49.584000
2025-06-07 11:40:18.155000
Saved raw train data to outliers2/train_raw.parquet
Saved raw bus data to outliers2/bus_raw.parquet
Saved raw metro data to outliers2/metro_raw.parquet
Saved raw tram data to outliers2/tram_raw.parquet
Saved raw car data to outliers2/car_raw.parquet
Saved raw walking data to outliers2/walking_raw.parquet
Processing train data...
Calculating simple distance-based criterion.
Calculating simple distance-based criterion.
Calculating simple distance-based criterion.
Calculating simple distance-based criterion.
Saved filtered train data to outliers2/train_filtered_norem.parquet
Processing bus data...
Calculating simple d

In [6]:
dataframes = []
for filtered_frame in OUTLIERS_PATH.glob('*_filtered_norem.parquet'):
    print(f'processing {filtered_frame}...')
    pd.read_parquet(filtered_frame)
    dataframes.append(pd.read_parquet(filtered_frame))

full_df = pd.concat(dataframes).set_index('index')
full_df.head()

processing outliers2/train_filtered_norem.parquet...
processing outliers2/bus_filtered_norem.parquet...
processing outliers2/metro_filtered_norem.parquet...
processing outliers2/tram_filtered_norem.parquet...
processing outliers2/car_filtered_norem.parquet...
processing outliers2/walking_filtered_norem.parquet...


Unnamed: 0_level_0,id,timestamp,acc_phone_X,acc_phone_Y,acc_phone_Z,lin_acc_phone_X,lin_acc_phone_Y,lin_acc_phone_Z,gyr_phone_X,gyr_phone_Y,...,labelbus,labeltram,labelmetro,labelcar,labelwalking,labeltrain,simple_dist_outlier_acc,simple_dist_outlier_lin_acc,simple_dist_outlier_gyr,simple_dist_outlier_mag
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17693,11,2025-06-05 06:29:36.426,-5.018838,0.043647,-8.829938,-0.197644,-0.049849,-0.252683,-0.000614,0.06651,...,,,,,,1.0,False,True,False,False
17694,11,2025-06-05 06:29:36.676,-4.608881,0.007116,-8.363201,0.159741,-0.068216,0.214748,-0.003934,-0.002731,...,,,,,,1.0,False,True,False,False
17695,11,2025-06-05 06:29:36.926,-4.980305,0.069306,-8.602116,-0.17122,-0.036913,-0.02051,-0.001564,-0.032704,...,,,,,,1.0,False,True,False,False
17696,11,2025-06-05 06:29:37.176,-5.124394,0.085036,-8.764474,-0.321541,-0.036868,-0.228248,-0.00142,0.031546,...,,,,,,1.0,False,True,False,False
17697,11,2025-06-05 06:29:37.426,-4.727055,0.04752,-8.373886,0.048727,-0.042923,0.16951,0.001762,-0.02301,...,,,,,,1.0,False,True,False,False


# START IMPUTATION

In [68]:
print(full_df['simple_dist_outlier'].value_counts())

False    19888
True      4120
Name: simple_dist_outlier, dtype: int64


In [57]:
time_gap_threshold = 250

def time_gap(group):
    time_series = pd.to_datetime(group['timestamp'])
    time_gap_dfs = time_series.diff()
    threshold = pd.Timedelta(milliseconds=time_gap_threshold)
    time_gaps = (time_gap_dfs > threshold).sum()
    return time_gaps



In [56]:
full_df[full_df['id'] == 0]

Unnamed: 0_level_0,id,timestamp,acc_phone_X,acc_phone_Y,acc_phone_Z,lin_acc_phone_X,lin_acc_phone_Y,lin_acc_phone_Z,gyr_phone_X,gyr_phone_Y,...,labelbus,labeltram,labelmetro,labelcar,labelwalking,labeltrain,simple_dist_outlier_acc,simple_dist_outlier_lin_acc,simple_dist_outlier_gyr,simple_dist_outlier_mag
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
40,0,2025-06-06 17:01:56.208,3.553154,-1.717915,-9.362400,0.319031,-0.070037,-0.231236,0.110381,0.044360,...,0.0,,,,,,True,False,False,True
41,0,2025-06-06 17:01:56.458,3.495554,-1.875762,-9.179008,-0.005427,-0.190211,-0.167867,0.054238,-0.061896,...,0.0,,,,,,True,False,False,True
42,0,2025-06-06 17:01:56.708,3.567250,-1.838825,-9.308088,0.141831,-0.036367,-0.314287,-0.033836,0.032542,...,0.0,,,,,,True,False,False,True
43,0,2025-06-06 17:01:56.958,3.096692,-1.588142,-9.583927,-0.266431,0.173306,-0.574200,-0.033063,-0.128087,...,0.0,,,,,,True,False,False,True
44,0,2025-06-06 17:01:57.208,2.818373,-1.703527,-9.596851,-0.317475,-0.031701,-0.470327,0.020498,-0.146702,...,0.0,,,,,,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2065,0,2025-06-06 17:10:22.458,3.389296,-2.046323,-9.254608,0.210976,-0.157438,-0.234655,-0.084520,0.035898,...,0.0,,,,,,True,False,False,True
2066,0,2025-06-06 17:10:22.708,3.680700,-2.283935,-9.304397,0.429455,-0.225437,-0.292450,-0.004559,0.220074,...,0.0,,,,,,True,False,False,True
2067,0,2025-06-06 17:10:22.958,3.847838,-2.463513,-8.986500,0.326076,-0.295219,-0.117644,0.038511,0.029276,...,0.0,,,,,,True,False,False,True
2068,0,2025-06-06 17:10:23.208,3.625858,-2.278765,-9.184454,0.127432,-0.041526,-0.326775,-0.077465,0.022687,...,0.0,,,,,,True,False,False,True


In [44]:
imputation_df = full_df.copy()

cols_to_check_acc = ['acc_phone_X', 'acc_phone_Y', 'acc_phone_Z']
cols_to_check_lin_acc = ['lin_acc_phone_X', 'lin_acc_phone_Y', 'lin_acc_phone_Z']
cols_to_check_gyr = ['gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z']
cols_to_check_mag = ['mag_phone_X', 'mag_phone_Y', 'mag_phone_Z']

checking = {'simple_dist_outlier_acc': cols_to_check_acc,
            'simple_dist_outlier_lin_acc': cols_to_check_lin_acc,
            'simple_dist_outlier_gyr': cols_to_check_gyr,
            'simple_dist_outlier_mag': cols_to_check_mag}

bad_sensors = ['proximity', 'location']
drop_cols = []
for sensor in bad_sensors:
    drop_cols.extend(imputation_df.columns[imputation_df.columns.str.contains(sensor)])

imputation_df.drop(drop_cols, axis=1, inplace=True)

columns_to_convert = [col for col in imputation_df.columns
                     if col not in ['id', 'timestamp', 'simple_dist_outlier']
                     and 'label' not in col]

df_with_nans = imputation_df.copy()

for outlier_col, cols_to_check in checking.items():
    df_with_nans.loc[imputation_df[outlier_col], cols_to_check] = np.nan

imputation_df = df_with_nans

imputation_df.isna().sum()

# gaps_per_instance = imputation_df.groupby('id').apply(time_gap)
# total_gaps = gaps_per_instance
#
# total_gaps

id                                 0
timestamp                          0
acc_phone_X                    17939
acc_phone_Y                    17939
acc_phone_Z                    17939
lin_acc_phone_X                12860
lin_acc_phone_Y                12860
lin_acc_phone_Z                12860
gyr_phone_X                     7183
gyr_phone_Y                     7183
gyr_phone_Z                     7183
mag_phone_X                     4120
mag_phone_Y                     4120
mag_phone_Z                     4120
labelbus                       20143
labeltram                      21550
labelmetro                     20383
labelcar                       16915
labelwalking                   21116
labeltrain                     19933
simple_dist_outlier_acc            0
simple_dist_outlier_lin_acc        0
simple_dist_outlier_gyr            0
simple_dist_outlier_mag            0
dtype: int64

In [51]:
imputer = MisVal()
cols_to_impute = [col for col in imputation_df.columns if col not in ['id', 'timestamp'] and 'label' not in col and 'simple_dist_outlier' not in col]

test = imputation_df.copy()

print("NaN values before imputation:")
print(imputation_df[cols_to_impute].isna().sum())

for instance in test.id.unique():
    instance_mask = test.id == instance
    for col in cols_to_impute:
        if test.loc[instance_mask, col].isna().any():
            print(f"Imputing {col} for instance {instance}...")
            print(test.loc[instance_mask, col].isna().sum() / len(test.loc[instance_mask, col]))
            temp_df = test[instance_mask].copy()
            temp_df = imputer.impute_median(dataset=temp_df, col=col)
            test.loc[instance_mask, col] = temp_df[col]

NaN values before imputation:
acc_phone_X        17939
acc_phone_Y        17939
acc_phone_Z        17939
lin_acc_phone_X    12860
lin_acc_phone_Y    12860
lin_acc_phone_Z    12860
gyr_phone_X         7183
gyr_phone_Y         7183
gyr_phone_Z         7183
mag_phone_X         4120
mag_phone_Y         4120
mag_phone_Z         4120
dtype: int64
Imputing acc_phone_X for instance 11...
0.1894478527607362
Imputing acc_phone_Y for instance 11...
0.1894478527607362
Imputing acc_phone_Z for instance 11...
0.1894478527607362
Imputing lin_acc_phone_X for instance 11...
0.6215950920245399
Imputing lin_acc_phone_Y for instance 11...
0.6215950920245399
Imputing lin_acc_phone_Z for instance 11...
0.6215950920245399
Imputing gyr_phone_X for instance 11...
0.013987730061349693
Imputing gyr_phone_Y for instance 11...
0.013987730061349693
Imputing gyr_phone_Z for instance 11...
0.013987730061349693
Imputing mag_phone_X for instance 11...
0.026503067484662576
Imputing mag_phone_Y for instance 11...
0.02650

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Imputing gyr_phone_Z for instance 2...
0.004360952700436095
Imputing mag_phone_X for instance 2...
0.015431063401543106
Imputing mag_phone_Y for instance 2...
0.015431063401543106
Imputing mag_phone_Z for instance 2...
0.015431063401543106
Imputing acc_phone_X for instance 4...
1.0
Imputing acc_phone_Y for instance 4...
1.0
Imputing acc_phone_Z for instance 4...
1.0
Imputing lin_acc_phone_X for instance 4...
0.6428571428571429
Imputing lin_acc_phone_Y for instance 4...
0.6428571428571429
Imputing lin_acc_phone_Z for instance 4...
0.6428571428571429
Imputing mag_phone_X for instance 4...
0.10093167701863354
Imputing mag_phone_Y for instance 4...
0.10093167701863354
Imputing mag_phone_Z for instance 4...
0.10093167701863354
Imputing acc_phone_X for instance 1...
1.0
Imputing acc_phone_Y for instance 1...
1.0
Imputing acc_phone_Z for instance 1...
1.0
Imputing lin_acc_phone_X for instance 1...
0.22995461422087746
Imputing lin_acc_phone_Y for instance 1...
0.22995461422087746
Imputing lin_

In [48]:
test.isna().sum()

id                                 0
timestamp                          0
acc_phone_X                    17041
acc_phone_Y                    17041
acc_phone_Z                    17041
lin_acc_phone_X                 7093
lin_acc_phone_Y                 7093
lin_acc_phone_Z                 7093
gyr_phone_X                     7093
gyr_phone_Y                     7093
gyr_phone_Z                     7093
mag_phone_X                        0
mag_phone_Y                        0
mag_phone_Z                        0
labelbus                       20143
labeltram                      21550
labelmetro                     20383
labelcar                       16915
labelwalking                   21116
labeltrain                     19933
simple_dist_outlier_acc            0
simple_dist_outlier_lin_acc        0
simple_dist_outlier_gyr            0
simple_dist_outlier_mag            0
dtype: int64

In [24]:
visualise_df = imputation_df.copy()

In [25]:
notebook_path = 'imputation.ipynb'

visualise_df.set_index('timestamp', inplace=True)

DataViz = VisualizeDataset(notebook_path)

# TODO: This should work nicely per instance once the outlier problem is fixed.

# Plot all data
for instance in imputation_df.id.unique():
    visual_instance = visualise_df[visualise_df.id == instance]
    print(visual_instance.head())
    DataViz.plot_dataset(visual_instance,
                        ['acc_', 'gyr_', 'lin_' , 'mag_'],
                        ['like', 'like', 'like', 'like', 'like'],
                        ['line', 'line', 'line', 'line', 'line'])

                         id  acc_phone_X  acc_phone_Y  acc_phone_Z  \
timestamp                                                            
2025-06-05 06:29:36.426  11    -5.018838     0.043647    -8.829938   
2025-06-05 06:29:36.676  11    -4.608881     0.007116    -8.363201   
2025-06-05 06:29:36.926  11    -4.980305     0.069306    -8.602116   
2025-06-05 06:29:37.176  11    -5.124394     0.085036    -8.764474   
2025-06-05 06:29:37.426  11    -4.727055     0.047520    -8.373886   

                         lin_acc_phone_X  lin_acc_phone_Y  lin_acc_phone_Z  \
timestamp                                                                    
2025-06-05 06:29:36.426         0.016264        -0.041538          0.02009   
2025-06-05 06:29:36.676         0.016264        -0.041538          0.02009   
2025-06-05 06:29:36.926         0.016264        -0.041538          0.02009   
2025-06-05 06:29:37.176         0.016264        -0.041538          0.02009   
2025-06-05 06:29:37.426         0.016264 

ValueError: Axis limits cannot be NaN or Inf