# WHOLE + MACHINE (mean of embeddings)

# Weighted-mean search: sweep the fusion weight between the whole and machine embeddings

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.stats import hmean
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score, auc, roc_curve

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Metadata tables for the three splits, read in train / test / eval order.
eval_train, eval_test, eval_eval = map(
    pd.read_csv,
    (
        '../processed_data/evaluation_train.csv',
        '../processed_data/evaluation_test.csv',
        '../processed_data/evaluation_eval.csv',
    ),
)

In [3]:
# Peek at the first cell (row 0, column 0) of the test metadata table.
eval_test.iloc[0, 0]

'data/gearbox/test/section_00_target_test_normal_0027_noAttribute.wav'

In [4]:
def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode inside a ``with`` block, so it is closed
    automatically; no explicit ``close()`` is required afterwards.
    """
    # NOTE: unpickling can execute arbitrary code. Only load files that were
    # produced by this project; never point this at untrusted input.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Embeddings stored without the ``_machine`` suffix, one file per split.
result_train = load_pickle('result_emb/train_emb.pickle')
result_test = load_pickle('result_emb/test_emb.pickle')
result_eval = load_pickle('result_emb/eval_emb.pickle')

# Embeddings stored with the ``_machine`` suffix, one file per split.
result_train_machine = load_pickle('result_emb/train_machine_emb.pickle')
result_test_machine = load_pickle('result_emb/test_machine_emb.pickle')
result_eval_machine = load_pickle('result_emb/eval_machine_emb.pickle')

In [5]:
def length_norm(mat):
    """Scale every row of ``mat`` to unit Euclidean (L2) length.

    Parameters
    ----------
    mat : array-like
        Collection of row vectors. Anything ``np.asarray`` accepts works
        (ndarray, list of lists, ...). A single 1-D vector is treated as one
        vector and normalised as a whole.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``mat`` where each vector along the last
        axis has been divided by its L2 norm.

    Notes
    -----
    A vector whose entries are all zero has norm 0, so the division yields
    NaN for that vector (the same outcome as dividing the row by ``sqrt(0)``
    in a per-row loop). Callers that may pass zero vectors should filter them
    out beforehand.
    """
    mat = np.asarray(mat)
    # One vectorised norm over the last axis replaces the per-row Python loop;
    # keepdims=True keeps a trailing axis of size 1 so the division broadcasts.
    norms = np.linalg.norm(mat, axis=-1, keepdims=True)
    return mat / norms

In [6]:
# Length-normalise the embeddings stored without the ``_machine`` suffix
# (train / test / eval).
result_train_ln, result_test_ln, result_eval_ln = (
    length_norm(emb) for emb in (result_train, result_test, result_eval)
)

# Same normalisation for the ``_machine`` embeddings.
result_train_ln_machine, result_test_ln_machine, result_eval_ln_machine = (
    length_norm(emb)
    for emb in (result_train_machine, result_test_machine, result_eval_machine)
)

In [18]:
def fuse_embeddings(whole, machine, weight):
    """Return the weighted mean ``weight * whole + (1 - weight) * machine``.

    The inputs are the length-normalised embedding matrices; the fused vectors
    are NOT re-normalised afterwards.
    """
    return whole * weight + machine * (1 - weight)


def min_distance_score(query, centers, target_refs):
    """Score each query row by its smallest ``1 - dot product`` to a reference.

    Parameters
    ----------
    query : ndarray, shape (n, d)
        Fused embeddings to score.
    centers : ndarray, shape (k, d)
        K-means centres fitted on the source-domain training embeddings.
    target_refs : ndarray, shape (m, d)
        Fused embeddings of the target-domain training rows.

    Returns
    -------
    ndarray, shape (n,)
        ``min(min_j(1 - q.c_j), min_j(1 - q.t_j))`` per query row. Returned as
        a 1-D array so it can be assigned directly to a DataFrame column.

    Notes
    -----
    NOTE(review): because neither the fused vectors nor the k-means centres
    are unit length, ``1 - dot`` is only a cosine-style distance, not an exact
    cosine distance. Confirm this is intended.
    """
    to_centers = np.min(1 - np.dot(query, centers.transpose()), axis=-1)
    to_targets = np.min(1 - np.dot(query, target_refs.transpose()), axis=-1)
    return np.minimum(to_centers, to_targets)


weights = [0.62, 0.64, 0.65, 0.67]

# Row positions of the source / target training clips. These do not depend on
# the weight, so they are computed once. Positions (not index labels) are used
# because they index NumPy arrays below.
source_train = np.flatnonzero((eval_train['domain'] == 'source').to_numpy())
target_train = np.flatnonzero((eval_train['domain'] == 'target').to_numpy())

for weight in weights:
    print('<<<<<<<[{}]<<<<<<<'.format(weight))
    result_train_mean = fuse_embeddings(result_train_ln, result_train_ln_machine, weight)
    result_test_mean = fuse_embeddings(result_test_ln, result_test_ln_machine, weight)
    result_eval_mean = fuse_embeddings(result_eval_ln, result_eval_ln_machine, weight)

    # Reference set: 16 k-means centres of the source-domain training
    # embeddings plus every target-domain training embedding.
    kmeans = KMeans(n_clusters=16, random_state=42).fit(result_train_mean[source_train])
    centers = kmeans.cluster_centers_
    target_refs = result_train_mean[target_train]

    # The score columns are overwritten on every iteration, so after the loop
    # they hold the scores for the last weight in ``weights``.
    eval_train['anomaly_score'] = min_distance_score(result_train_mean, centers, target_refs)
    eval_test['anomaly_score'] = min_distance_score(result_test_mean, centers, target_refs)
    eval_eval['anomaly_score'] = min_distance_score(result_eval_mean, centers, target_refs)

    aucs, p_aucs = [], []
    aucs_source, p_aucs_source = [], []
    aucs_target, p_aucs_target = [], []
    ths = []

    for machine in eval_test['machine'].unique():
        rows = eval_test[eval_test['machine'] == machine]
        true = rows['label'].values
        scores = rows['anomaly_score'].values

        # Threshold maximising Youden's J statistic (tpr - fpr), kept for
        # inspection after the cell has run.
        fpr, tpr, thresholds = roc_curve(true, scores)
        ths.append(thresholds[np.argmax(tpr - fpr)])

        # Metrics over both domains together. Local names deliberately avoid
        # ``auc`` so the ``auc`` imported from sklearn.metrics is not rebound.
        aucs.append(roc_auc_score(true, scores))
        p_aucs.append(roc_auc_score(true, scores, max_fpr=0.1))

        # Per-domain metrics, appended to both the global lists and the
        # per-machine lists used for the printed summary.
        machine_aucs = []
        machine_p_aucs = []
        for domain, auc_sink, p_auc_sink in (
            ('source', aucs_source, p_aucs_source),
            ('target', aucs_target, p_aucs_target),
        ):
            domain_rows = rows[rows['domain'] == domain]
            domain_true = domain_rows['label'].values
            domain_scores = domain_rows['anomaly_score'].values
            domain_auc = roc_auc_score(domain_true, domain_scores)
            domain_p_auc = roc_auc_score(domain_true, domain_scores, max_fpr=0.1)
            auc_sink.append(domain_auc)
            p_auc_sink.append(domain_p_auc)
            machine_aucs.append(domain_auc)
            machine_p_aucs.append(domain_p_auc)

        # NOTE(review): this per-machine figure averages four values (source
        # AUC, target AUC, source pAUC, target pAUC), whereas the final score
        # below uses the all-domain pAUC instead of the two per-domain pAUCs.
        # Confirm which definition is intended.
        print(f'Machine: {machine}, hmean: {hmean(machine_aucs + machine_p_aucs)*100:.2f}')

    # Harmonic mean over every machine's source AUC, target AUC and
    # all-domain pAUC.
    score = hmean(aucs_source + aucs_target + p_aucs)
    print(f'final score : {score*100:.2f}')

<<<<<<<[0.62]<<<<<<<
Machine: gearbox, hmean: 56.86
Machine: ToyTrain, hmean: 53.99
Machine: ToyCar, hmean: 48.34
Machine: valve, hmean: 58.44
Machine: slider, hmean: 65.60
Machine: fan, hmean: 57.40
Machine: bearing, hmean: 56.79
final score : 56.68
<<<<<<<[0.64]<<<<<<<
Machine: gearbox, hmean: 56.50
Machine: ToyTrain, hmean: 53.99
Machine: ToyCar, hmean: 48.46
Machine: valve, hmean: 58.27
Machine: slider, hmean: 65.54
Machine: fan, hmean: 57.37
Machine: bearing, hmean: 56.20
final score : 56.60
<<<<<<<[0.65]<<<<<<<
Machine: gearbox, hmean: 56.61
Machine: ToyTrain, hmean: 54.02
Machine: ToyCar, hmean: 48.43
Machine: valve, hmean: 58.13
Machine: slider, hmean: 65.77
Machine: fan, hmean: 57.49
Machine: bearing, hmean: 55.89
final score : 56.64
<<<<<<<[0.67]<<<<<<<
Machine: gearbox, hmean: 56.68
Machine: ToyTrain, hmean: 54.15
Machine: ToyCar, hmean: 48.52
Machine: valve, hmean: 58.13
Machine: slider, hmean: 66.06
Machine: fan, hmean: 57.53
Machine: bearing, hmean: 55.50
final score : 56

In [8]:
weights = [0.62, 0.64, 0.65, 0.67]

# Row positions of the source / target training clips. These do not depend on
# the weight, so they are computed once. Positions (not index labels) are used
# because they index NumPy arrays below.
source_train = np.flatnonzero((eval_train['domain'] == 'source').to_numpy())
target_train = np.flatnonzero((eval_train['domain'] == 'target').to_numpy())

# One submission directory (Kim_CAU_task2_1 .. _N) is written per weight.
for i, weight in enumerate(weights):
    result_train_mean = result_train_ln*weight + result_train_ln_machine*(1-weight)
    result_eval_mean = result_eval_ln*weight + result_eval_ln_machine*(1-weight)

    # Reference set: 16 k-means centres of the source-domain training
    # embeddings plus every target-domain training embedding.
    kmeans = KMeans(n_clusters=16, random_state=42).fit(result_train_mean[source_train])
    centers = kmeans.cluster_centers_
    target_refs = result_train_mean[target_train]

    # Score = smallest (1 - dot product) to any reference vector. The arrays
    # are kept 1-D so they can be assigned directly to a DataFrame column.
    # NOTE(review): every target-domain training row is also one of the
    # references, so its own score is 1 - |x|^2 at most; this may bias the
    # training-score quantile used as the threshold below. Confirm intended.
    eval_train['anomaly_score'] = np.minimum(
        np.min(1 - np.dot(result_train_mean, centers.transpose()), axis=-1),
        np.min(1 - np.dot(result_train_mean, target_refs.transpose()), axis=-1),
    )
    eval_eval['anomaly_score'] = np.minimum(
        np.min(1 - np.dot(result_eval_mean, centers.transpose()), axis=-1),
        np.min(1 - np.dot(result_eval_mean, target_refs.transpose()), axis=-1),
    )

    # Create the output directory up front; to_csv does not create missing
    # parent directories.
    out_dir = f'submission/task2/Kim_CAU_task2_{i+1}'
    os.makedirs(out_dir, exist_ok=True)

    for machine in eval_eval['machine'].unique():
        # Decision threshold: 80th percentile of this machine's training scores.
        train_scores = eval_train.loc[eval_train['machine'] == machine, 'anomaly_score']
        if train_scores.empty:
            # Without training rows the quantile is undefined and every
            # decision would silently come out as 0.
            raise ValueError(f'no training rows for machine {machine!r}; cannot derive a threshold')
        threshold = np.quantile(train_scores, 0.8)

        # Build a fresh frame (no in-place edits on a slice of eval_eval):
        # drop the machine column, keep only the file name, sort by it.
        rows = eval_eval[eval_eval['machine'] == machine].drop(columns=['machine'])
        rows = rows.assign(audio_path=rows['audio_path'].apply(lambda x: x.split('/')[-1]))
        rows = rows.sort_values(by='audio_path').reset_index(drop=True)
        rows.to_csv(f'{out_dir}/anomaly_score_{machine}_section_00_test.csv', encoding='utf-8', index=False, header=False)

        # Same rows with the score replaced by a 0/1 decision as last column.
        decisions = rows.drop(columns=['anomaly_score']).assign(
            decisions=(rows['anomaly_score'] > threshold).astype(int)
        )
        decisions.to_csv(f'{out_dir}/decision_result_{machine}_section_00_test.csv', encoding='utf-8', index=False, header=False)

In [9]:
# Summary statistics of the anomaly scores currently stored on eval_train.
# The column is overwritten on every weight iteration, so this reflects the
# last weight that was processed.
eval_train['anomaly_score'].describe()

count    16000.000000
mean         0.501621
std          0.146535
min          0.094755
25%          0.441864
50%          0.464686
75%          0.524569
max          0.892598
Name: anomaly_score, dtype: float64

In [10]:
# Summary statistics of the anomaly scores currently stored on eval_test.
# The column is overwritten on every weight iteration of the search loop, so
# this reflects the last weight that was processed.
eval_test['anomaly_score'].describe()

count    1400.000000
mean        0.507871
std         0.095051
min         0.387952
25%         0.449427
50%         0.472469
75%         0.523302
max         0.893043
Name: anomaly_score, dtype: float64

In [11]:
# Summary statistics of the anomaly scores currently stored on eval_eval.
# The column is overwritten on every weight iteration, so this reflects the
# last weight that was processed.
eval_eval['anomaly_score'].describe()

count    1800.000000
mean        0.474221
std         0.137229
min         0.096101
25%         0.422052
50%         0.460519
75%         0.512726
max         0.867173
Name: anomaly_score, dtype: float64

In [12]:
# Per-machine breakdown of the eval-split anomaly scores, in order of first
# appearance of each machine name.
machine_column = eval_eval['machine']
for machine in machine_column.unique():
    print(machine)
    machine_stats = eval_eval[machine_column == machine]['anomaly_score'].describe()
    print(machine_stats)

ToothBrush
count    200.000000
mean       0.460690
std        0.035413
min        0.416045
25%        0.442232
50%        0.443802
75%        0.470559
max        0.662683
Name: anomaly_score, dtype: float64
Scanner
count    200.000000
mean       0.462111
std        0.006575
min        0.433706
25%        0.460426
50%        0.461020
75%        0.461741
max        0.524528
Name: anomaly_score, dtype: float64
HoveringDrone
count    200.000000
mean       0.455211
std        0.018358
min        0.444622
25%        0.448808
50%        0.450495
75%        0.455517
max        0.634229
Name: anomaly_score, dtype: float64
HairDryer
count    200.000000
mean       0.522754
std        0.124527
min        0.363164
25%        0.425978
50%        0.465295
75%        0.618819
max        0.794475
Name: anomaly_score, dtype: float64
3DPrinter
count    200.000000
mean       0.218738
std        0.076511
min        0.096101
25%        0.119212
50%        0.263072
75%        0.276277
max        0.374105
Nam