In [11]:
from typing import Iterable, List, Dict, Callable
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
PATH_TO_DATA = "/Users/khangtuan/Documents/courses/big_data/-Hydraulic-System-Anomaly-Detection/data"
NUMBER_OF_PROFILES = 2205
PROFILE_MAX_SAMPLE_RATE = 6000
TARGET_NAMES = ["cooler", "valve", "leakage", "accumulator", "stable"]

PLOT_STYLE = "darkgrid"

In [13]:
TARGET_VALUES_MAP = {
    "cooler": {
        3: "close to total failure",
        20: "reduced effifiency",
        100: "full efficiency",
    },
    "valve": {
        100:"optimal switching behavior",
        90: "small lag",
        80: "severe lag",
        73: "close to total failure",
    }, 
    "leakage": {
        0: "no",
	    1: "weak",
	    2: "severe",
    },
    "accumulator": {
        130: "optimal pressure",
    	115: "slightly reduced pressure",
	    100: "severely reduced pressure",
	    90:  "close to total failure",
     },
     "stable":{
        0: "yes",
	    1: "not",
     }
}

In [14]:
sersor_files_config = [
    {"name": "CE", "upsample_coeff": 100},
    {"name": "CP", "upsample_coeff": 100},
    {"name": "EPS1", "upsample_coeff": 1},
    {"name": "FS1", "upsample_coeff": 10},
    {"name": "FS2", "upsample_coeff": 10},
    {"name": "PS1", "upsample_coeff": 1},
    {"name": "PS2", "upsample_coeff": 1},
    {"name": "PS3", "upsample_coeff": 1},
    {"name": "PS4", "upsample_coeff": 1},
    {"name": "PS5", "upsample_coeff": 1},
    {"name": "PS6", "upsample_coeff": 1},
    {"name": "SE", "upsample_coeff": 100},
    {"name": "TS1", "upsample_coeff": 100},
    {"name": "TS2", "upsample_coeff": 100},
    {"name": "TS3", "upsample_coeff": 100},
    {"name": "TS4", "upsample_coeff": 100},
    {"name": "VS1", "upsample_coeff": 100},
]

In [15]:
def get_files_with_resample(config: List[Dict]) -> Iterable[np.ndarray]:
    for file in config:
        data = np.genfromtxt(PATH_TO_DATA + file["name"] + ".txt", dtype=float, delimiter='\t')
        yield np.repeat(data, file["upsample_coeff"], axis=1).flatten()

In [16]:
def load_feature_dataframe(config: List[Dict]) -> pd.DataFrame:
    columns = [file["name"] for file in config]
    data = np.stack(get_files_with_resample(config), axis=-1)
    data_df = pd.DataFrame(data, columns=columns)

    prodile_ids = np.repeat(range(1, NUMBER_OF_PROFILES+1), PROFILE_MAX_SAMPLE_RATE)
    prodile_ids_df = pd.DataFrame(prodile_ids, columns=["profile_id"])

    return pd.concat([prodile_ids_df, data_df], axis=1, sort=False)

In [17]:
def load_targets(filename: str) -> pd.DataFrame:
    conditions_data = np.genfromtxt(PATH_TO_DATA + filename, dtype=int, delimiter='\t')
    conditions_df = pd.DataFrame(conditions_data, columns=TARGET_NAMES)

    prodile_ids = range(1, NUMBER_OF_PROFILES+1)
    prodile_ids_df = pd.DataFrame(prodile_ids, columns=["profile_id"])

    return pd.concat([prodile_ids_df, conditions_df], axis=1, sort=False)

In [18]:
feature_df = load_feature_dataframe(sersor_files_config)
target_df = load_targets("profile.txt") #Note that targets is define for profile, not to points

_ = gc.collect()

TypeError: arrays to stack must be passed as a "sequence" type such as list or tuple.