In [None]:
from pathlib import Path
import os
# Account name: used to build the home-directory path below and passed to
# PhyboxDatasetLoader in the data-loading cell.
STUDENT = 'mmr497'

# Relative directories; they resolve against the working directory that
# os.chdir sets below. OUTLIERS_PATH is not referenced in the visible cells.
OUTLIERS_PATH = Path('./outliers2/')
INTERMEDIATE_PATH = Path('./intermediate_datafiles/')
# NOTE(review): hard-coded absolute path -- this cell only runs on a machine
# where /home/<STUDENT>/ exists.
# The chdir happens before the project-local imports below; presumably those
# modules are resolved relative to this directory, so keep this ordering --
# TODO confirm.
os.chdir(f'/home/{STUDENT}/')
# Experiment directory name, handed to PhyboxDatasetLoader via exp_dir.
EXPERIMENT_DIR = 'ML4QS-Vehicle-2'
from util.VisualizeDataset import VisualizeDataset
from Visualiser import Visualiser as Viz
from outlier_detector import OutlierDetector
from custom_imputer import CustomImputer
from util.util import ignore_actual_time, read_parquet, write_parquet
from DataLoader import PhyboxDatasetLoader

In [None]:
# Instantiate the two plotting helpers. Only EasyViz is used by the plotting
# cells visible in this notebook; DataViz is created but not referenced again here.
# NOTE(review): the string argument looks like a notebook file name -- confirm
# what VisualizeDataset does with it.
DataViz = VisualizeDataset('example_pipeline.ipynb')
EasyViz = Viz()

In [None]:
# If not done already, load in the Phybox data
# Target file for the combined datasets inside the intermediate directory.
combined_parquet = INTERMEDIATE_PATH / 'ML4QS_combined_results_example.parquet'

# Build the loader for this student/experiment and let it create every dataset
# (overwrite=False is forwarded to the loader unchanged).
dataset_loader = PhyboxDatasetLoader(STUDENT, exp_dir=EXPERIMENT_DIR)
datasets = dataset_loader.create_all_datasets(overwrite=False)

# Persist the result so the following cell can read it back from disk.
write_parquet(datasets, combined_parquet)

In [None]:
# Read the combined dataset written by the previous cell back from the
# intermediate directory and preview its first rows.
intermediate_df = read_parquet(
    INTERMEDIATE_PATH.joinpath('ML4QS_combined_results_example.parquet')
)
intermediate_df.head()

In [None]:
# Let the dataset ignore actual timepoints
# NOTE(review): this rebinds intermediate_df to the helper's output, so
# re-running this cell feeds already-processed data back into
# ignore_actual_time. Re-run the load cell first when in doubt.
intermediate_df = ignore_actual_time(intermediate_df)
intermediate_df.head()

In [None]:
# Create the outlier detector on the time-adjusted intermediate frame.
out_detector = OutlierDetector(intermediate_df)
# Use the mixture-model detector and automatically set detected outliers to NaN.
# Note that we can pass a col parameter to this function to only apply the outlier detection to a subset of columns.
# The result is stored as intermed_df, a different variable from intermediate_df above.
intermed_df = out_detector.fit_transform("mixture", outlier_behaviour='nan')

In [None]:
# Plot the results of the outlier detection.
# If you only want to plot specific columns, you can pass a list of columns to the plot_outliers function.
# Because we automatically drop the outlier columns, we use the fitted data from the outlier detector.
outlier_df = out_detector.fitted_data
# outlier_type matches the method name passed to fit_transform in the previous cell.
EasyViz.plot_outliers(outlier_df, outlier_type='mixture', path='outliers_example')

In [None]:
# Build the imputation input: the outlier-cleaned frame minus the sensor
# columns we deem bad.
bad_sensors = ['proximity', 'location']

# Collect every column whose name contains one of the bad-sensor substrings.
drop_cols = [
    col
    for sensor in bad_sensors
    for col in intermed_df.columns[intermed_df.columns.str.contains(sensor)]
]

# drop() returns a new frame, so intermed_df itself is left untouched.
imputation_df = intermed_df.drop(columns=drop_cols)

In [None]:
# Create the imputer object; the constructor argument selects the imputation method.
imputer = CustomImputer('interpolation')
# Fit and transform imputation_df; the result is kept in a separate variable
# so the pre-imputation frame stays available for the comparison plot.
imputed_df = imputer.fit_transform(imputation_df)

In [None]:
# Plot the results of the imputation
# We provide both the imputed dataset and the dataset before imputation to show the difference.
# Again, we can specify which cols we want to use.

# Argument order: imputed frame first, pre-imputation frame second.
EasyViz.plot_imputation(imputed_df, imputation_df, imputation_type='interpolation', path='imputation_example')

In [None]:
# Plot the resulting (imputed) dataframe in full.

EasyViz.plot_dataset(imputed_df, plot_type='full', path='plot_dataset_example')

In [None]:
# You can use plot_dataset for multiple different types of plotting.

# Copy of the imputed data indexed by the original timestamps. set_index
# returns a new frame, so imputed_df keeps its own index for the other plots.
imputed_df_orig_time = imputed_df.set_index('original_time')

# Plot per day (requires the original timestamps)
# Fix: pass the timestamp-indexed frame. Previously it was built but never
# used, and imputed_df was plotted instead.
# NOTE(review): confirm that plot_dataset expects the timestamps as the index
# for plot_type='day'.
EasyViz.plot_dataset(imputed_df_orig_time, plot_type='day', path='plot_dataset_example_day')

# Plot per individual instance
# e.g. Only instances with the label train
example_instances = imputed_df.loc[imputed_df['label'] == 'train', 'id'].unique()
EasyViz.plot_dataset(imputed_df, instances=example_instances, plot_type='instance', path='plot_dataset_example_instance')

# Plot a specific slice
# Works best with original timestamps.
# TODO: Finish
# EasyViz.plot_dataset(imputed_df, time_slice=(), plot_type='slice', path='plot_dataset_example_slice')