In [4]:
import yaml
yaml.warnings({'YAMLLoadWarning': False})
import os

# Parse the notebook configuration.
# yaml.safe_load is used because a bare yaml.load(f) (no Loader argument)
# emits YAMLLoadWarning on PyYAML 5.x and raises TypeError on PyYAML >= 6.
# The visible cells only read plain mapping/string values from the config,
# which safe_load handles.
with open("./config.yaml", 'rb') as f:
    config = yaml.safe_load(f)

In [5]:
# Output root directory, read from the IO_OPTION section of config.yaml.
# The same value is assigned again in the "Setting I/O path" cell.
OUTPUT_ROOT = config['IO_OPTION']['OUTPUT_ROOT']

# load library

In [6]:
# python default library
import os
import shutil
import datetime
import sys
import pickle

# general analysis tool-kit
import numpy as np
import pandas as pd
#from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# original library
sys.path.append('/home/hiroki/research/dcase2021_task2/src/functions')
sys.path.append('../')
import common as com
import pytorch_modeler as modeler

# etc
import yaml
yaml.warnings({'YAMLLoadWarning': False})

# ML lib
from sklearn.preprocessing import StandardScaler
from umap import UMAP
from sklearn.mixture import GaussianMixture
#from sklearn.covariance import LedoitWolf
from scipy.spatial.distance import mahalanobis


import librosa
import IPython
import librosa.display

# set up logger (uses the config loaded above)

In [7]:
# One log file per day, named after today's date and placed directly under
# the configured output root. Despite its name, `log_folder` holds the path
# of that log file, not a directory.
log_name = '/{0}.log'.format(datetime.date.today())
log_folder = config['IO_OPTION']['OUTPUT_ROOT'] + log_name
logger = com.setup_logger(log_folder, '00_train.py')

# Setting

In [8]:
# Setting seed
# Fixed seed (42) passed to the project's pytorch_modeler helper.
# NOTE(review): which RNGs set_seed actually seeds is not visible here — confirm in pytorch_modeler.
modeler.set_seed(42)

In [9]:
############################################################################
# Setting I/O path
############################################################################
# input dirs
INPUT_ROOT = config['IO_OPTION']['INPUT_ROOT']
dev_dir = INPUT_ROOT + "/dev_data"
add_dev_dir = INPUT_ROOT + "/add_dev_data"
# machine type: names of the sub-directories of dev_dir.
# os.listdir() returns entries in arbitrary order and also lists plain files,
# while a later cell slices this list by position (machine_types[5:]).
# Keep directories only and sort case-insensitively so the order is
# reproducible across machines and file systems.
machine_types = sorted(
    (entry for entry in os.listdir(dev_dir)
     if os.path.isdir(os.path.join(dev_dir, entry))),
    key=str.lower,
)
# output dirs
OUTPUT_ROOT = config['IO_OPTION']['OUTPUT_ROOT']
MODEL_DIR = OUTPUT_ROOT + '/models'
TB_DIR = OUTPUT_ROOT + '/tb'
OUT_FEATURE_DIR = OUTPUT_ROOT + '/extraction_features'
# os.makedirs creates missing parents, so OUTPUT_ROOT itself is created here too
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(TB_DIR, exist_ok=True)
os.makedirs(OUT_FEATURE_DIR, exist_ok=True)
# copy config next to the outputs so the run can be traced back to its settings
shutil.copy('./config.yaml', OUTPUT_ROOT)

'/media/hiroki/HDD1TB/research/dcase2021_task2/output/MahalanobisAD_by_GMM/config.yaml'

# load data

In [10]:
# Display the entries found under dev_dir (used as machine type names below).
machine_types

['fan', 'gearbox', 'pump', 'slider', 'ToyCar', 'ToyTrain', 'valve']

In [11]:
# Names of the data splits; the same strings are used as keys of ext_data in later cells.
# NOTE(review): data_types itself is not referenced in the visible cells.
data_types = ['train', 'valid_source', 'valid_target']

In [12]:
def load_ext_data(machine_type):
    """Read the pickled feature file of one machine type.

    The file is expected at ``{OUT_FEATURE_DIR}/{machine_type}_features.pkl``.

    Args:
        machine_type: machine type name used to build the file name.

    Returns:
        The object stored in the pickle, as returned by ``pd.read_pickle``.
        Later cells index it as ``ext_data[split]['features']`` and
        ``ext_data[split]['labels']``.
    """
    # pickle can execute arbitrary code on load: only read files produced by this project
    return pd.read_pickle(f'{OUT_FEATURE_DIR}/{machine_type}_features.pkl')

# evaluation

## estimate

## GMM param tune

In [13]:
# Re-display the machine types; the training loop below selects them by position.
machine_types

['fan', 'gearbox', 'pump', 'slider', 'ToyCar', 'ToyTrain', 'valve']

In [17]:
# Candidate numbers of mixture components tried by the GMM training loop below.
n_components_list = [2]

In [18]:
# For each selected machine type and each candidate n_components: fit a
# GaussianMixture on the training features, log its BIC and pickle the model
# to {MODEL_DIR}/{machine_type}/{machine_type}_model_n={n_components}.pkl.
# NOTE(review): the [5:] slice restricts this run to the machine types from
# index 5 onward (presumably the others were fitted in an earlier run) —
# confirm before relying on a full Restart & Run All.
for machine_type in machine_types[5:]:
    logger.info(f'====== {machine_type} ======')
    ext_data = load_ext_data(machine_type)
    # looked up once; the same training features are used for fit() and bic()
    train_features = ext_data['train']['features']
    outdir = f'{MODEL_DIR}/{machine_type}'
    os.makedirs(outdir, exist_ok=True)
    # BIC of every candidate, in the same order as n_components_list
    bic_list = []
    for n_components in n_components_list:
        logger.info(f'n_components = {n_components}')
        gmm = GaussianMixture(n_components=n_components, random_state=42)
        gmm.fit(train_features)
        bic = gmm.bic(train_features)
        # the list was previously created but never filled
        bic_list.append(bic)
        logger.info(f'{bic}')
        pd.to_pickle(gmm, f'{outdir}/{machine_type}_model_n={n_components}.pkl')

2021-04-09 18:27:51,213 - 00_train.py - INFO - n_components = 2
2021-04-09 18:29:19,605 - 00_train.py - INFO - -34582849.173225015
2021-04-09 18:30:35,302 - 00_train.py - INFO - n_components = 2
2021-04-09 18:38:15,374 - 00_train.py - INFO - -34460783.09866527


In [None]:
# BIC of the current `gmm` on the current `ext_data` training features
# (both are whatever the training loop above left bound last).
gmm.bic(ext_data['train']['features'])

In [None]:
# Component label predicted by the current `gmm` for every training sample.
pred = gmm.predict(ext_data['train']['features'])

In [None]:
# Histogram of the predicted component labels of the training samples.
plt.hist(pred)

In [None]:
# Covariances and means of the components of the current fitted `gmm`.
gmm_covs = gmm.covariances_
gmm_means = gmm.means_

In [None]:
gmm_covs = gmm.covariances_
gmm_means = gmm.means_
# Inverse covariance (precision) matrix of every component.
# The original lines used `mean` and `cov_inv`, which no earlier cell defines
# (NameError on a fresh kernel); they are now taken from the fitted GMM.
# Assumes covariance_type='full' (the default used when the GMM is built
# above), so there is one square precision matrix per component.
gmm_precisions = gmm.precisions_


# calc mahalanobis (Anomaly Score)
# NOTE(review): scoring a sample by its distance to the closest GMM component
# is an assumption about the intended anomaly score — confirm.
valid_source_dists = [
    min(mahalanobis(sample, mean, cov_inv)
        for mean, cov_inv in zip(gmm_means, gmm_precisions))
    for sample in ext_data['valid_source']['features']
]
valid_target_dists = [
    min(mahalanobis(sample, mean, cov_inv)
        for mean, cov_inv in zip(gmm_means, gmm_precisions))
    for sample in ext_data['valid_target']['features']
]

In [None]:
# For every machine type: score both validation splits with the Mahalanobis
# distance and log the ROC-AUC of those scores against the split labels.
# NOTE(review): `MVG` is not defined in any visible cell — confirm where it
# comes from before running this on a fresh kernel.
for machine_type in machine_types:
    # distribution parameters stored for this machine type
    # NOTE(review): the 'cov' entry is passed as scipy's inverse-covariance
    # argument — confirm that MVG stores the inverse, not the covariance.
    mvg_params = MVG[machine_type]
    mean, cov_inv = mvg_params['mean'], mvg_params['cov']
    # extracted features of this machine type
    ext_data = load_ext_data(machine_type)
    # anomaly score = Mahalanobis distance of each sample, as numpy arrays
    valid_source_dists = np.array(
        [mahalanobis(sample, mean, cov_inv)
         for sample in ext_data['valid_source']['features']]
    )
    valid_target_dists = np.array(
        [mahalanobis(sample, mean, cov_inv)
         for sample in ext_data['valid_target']['features']]
    )
    # AUC on the source split
    roc_auc = roc_auc_score(ext_data['valid_source']['labels'], valid_source_dists)
    logger.info(f'{machine_type} valid_source AUC : {roc_auc}')
    # AUC on the target split
    roc_auc = roc_auc_score(ext_data['valid_target']['labels'], valid_target_dists)
    logger.info(f'{machine_type} valid_target AUC : {roc_auc}')

In [None]:
# Component label predicted by the current `gmm` for every row of `feats`.
# NOTE(review): `feats` is not defined in any visible cell — confirm its origin.
gmm_section_types = gmm.predict(feats)

In [None]:
# Distribution of `section_types`.
# sns.distplot is deprecated in seaborn (scheduled for removal); sns.displot
# is used instead, matching the gmm_section_types plot in the next cell.
# NOTE(review): `section_types` is not defined in any visible cell — confirm its origin.
sns.displot(section_types)

In [None]:
# Distribution of the GMM-predicted component labels computed above.
sns.displot(gmm_section_types)