In [35]:
import warnings

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.mixture import GaussianMixture

from src.features import build_features_final
from src.config.config import seed_everything, cfg

# Ignore every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides any warnings emitted by the model fits further down — confirm that is intended.
warnings.filterwarnings(action='ignore')
# Project helper invoked with the configured seed; presumably seeds the global RNGs — confirm in src.config.config.
seed_everything(cfg.SEED)

In [38]:
def find_best_n_components(data, max_components=10):
    """Pick the Gaussian-mixture component count with the lowest AIC.

    One ``GaussianMixture`` (seeded with ``cfg.SEED``) is fitted on ``data``
    for each candidate count, scored with ``aic`` on that same data, and the
    count with the smallest score is kept (ties keep the smaller count).

    :param data: sample matrix accepted by ``GaussianMixture.fit``, one row per sample
    :param max_components: largest component count to try; candidates above
        the number of rows in ``data`` are skipped, because scikit-learn
        refuses to fit a mixture with more components than samples
    :return: the selected component count, or 0 when no candidate beat the
        initial ``+inf`` score (e.g. ``max_components`` < 1)
    :raises ValueError: if ``data`` has no rows
    """
    n_samples = len(data)
    if n_samples == 0:
        raise ValueError("data must contain at least one sample")

    best_n_components = 0
    best_aic = float('inf')
    # Never ask for more components than there are samples to fit them on.
    largest_candidate = min(max_components, n_samples)

    for n_components in range(1, largest_candidate + 1):
        gmm = GaussianMixture(n_components=n_components, random_state=cfg.SEED)
        gmm.fit(data)
        aic = gmm.aic(data)
        if aic < best_aic:
            best_aic = aic
            best_n_components = n_components

    print(f"Best n_components: {best_n_components}")
    return best_n_components

In [39]:
# Min-max scaler: fitted on the training frame only, then reused unchanged for the test frame.
scaler = MinMaxScaler()

# Forward slashes resolve on Windows and POSIX alike; the previous backslash
# raw strings only worked on Windows.
train_data = pd.read_csv('data/raw/train_data.csv')
train_data = build_features_final.create_derived_features(train_data)

test_data = pd.read_csv('data/raw/test_data.csv')
test_data = build_features_final.create_derived_features(test_data)

scaled_train_data = scaler.fit_transform(train_data)
scaled_test_data = scaler.transform(test_data)

# fit_transform/transform return bare arrays; restore the column labels.
scaled_train_data = pd.DataFrame(scaled_train_data, columns=train_data.columns)
scaled_test_data = pd.DataFrame(scaled_test_data, columns=train_data.columns)

scaled_train_data_df = scaled_train_data.copy()
scaled_test_data_df = scaled_test_data.copy()

# Columns removed before fitting the mixtures; only the remaining columns are modelled.
drop_feature = ['type', 'motor_hp', 'air_end_temp', 'motor_rpm', 'motor_temp', 'motor_vibe', 'motor_current',
                'air_inflow', "air_flow_pressure", "current_by_vibration", "airflow_per_rotation", "air_to_motor_ratio"]


# 'type' is scaled too, so the group keys are the scaled values; the test frame
# went through the same scaler, so equality against those keys matches the same types.
grouped_train = scaled_train_data.groupby('type')

# Scores are written back by row position so that row i of each output file
# belongs to row i of the corresponding input file. (Concatenating per-group
# results instead yields group order, which only matches the input order when
# the input is already sorted by type.) NaN marks a row that was never scored,
# e.g. a test row whose type does not occur in the training data.
train_anomaly = np.full(len(scaled_train_data), np.nan)
test_anomaly = np.full(len(scaled_test_data), np.nan)

for group_name, group_data in grouped_train:
    # Both frames carry a default 0..n-1 index, so index labels are row positions.
    train_rows = group_data.index.to_numpy()
    test_mask = (scaled_test_data['type'] == group_name).to_numpy()

    train_group = group_data.drop(drop_feature, axis=1).values
    test_group = scaled_test_data.loc[test_mask].drop(drop_feature, axis=1).values

    best_n_components = find_best_n_components(train_group)
    gmm = GaussianMixture(n_components=best_n_components, random_state=cfg.SEED)
    gmm.fit(train_group)

    # Anomaly score = squared Euclidean distance from a sample to the mean of
    # the mixture component it is assigned to.
    train_mean_distance = ((train_group - gmm.means_[gmm.predict(train_group)]) ** 2).sum(axis=1)
    train_anomaly[train_rows] = train_mean_distance

    # predict() rejects an empty matrix, so skip types with no test rows.
    if test_mask.any():
        test_mean_distance = ((test_group - gmm.means_[gmm.predict(test_group)]) ** 2).sum(axis=1)
        test_anomaly[test_mask] = test_mean_distance

    print(f"finish {group_name}type")


pd.DataFrame(train_anomaly, columns=['GaussianMixture']).to_csv('Train_GaussianMixture.csv', index=False)
pd.DataFrame(test_anomaly, columns=['GaussianMixture']).to_csv('Test_GaussianMixture.csv', index=False)

Best n_components: 5
finish 0.0type
Best n_components: 7
finish 0.14285714285714285type
Best n_components: 4
finish 0.2857142857142857type
Best n_components: 4
finish 0.42857142857142855type
Best n_components: 4
finish 0.5714285714285714type
Best n_components: 4
finish 0.7142857142857142type
Best n_components: 3
finish 0.8571428571428571type
Best n_components: 4
finish 1.0type


---

In [42]:
from sklearn.neighbors import NearestNeighbors

def knn_based_outlier_detection(train_data, test_data, k=5):
    """
    Score every point by the spread of its k-nearest-neighbour distances.

    Neighbours are always searched in ``train_data``. A point's score is the
    largest minus the mean of its distances to its k nearest neighbours.

    :param train_data: feature matrix of training data, shape: (n_samples, n_features)
    :param test_data: feature matrix of test data, shape: (n_samples, n_features)
    :param k: number of nearest neighbors
    :return: tuple ``(train_outlier_scores, test_outlier_scores)`` with one
        score per row of ``train_data`` and ``test_data`` respectively
    """
    # k + 1 neighbours are requested because a training point's own nearest
    # "neighbour" in the fitted set is itself.
    nbrs = NearestNeighbors(n_neighbors=k+1).fit(train_data)

    train_distances, _ = nbrs.kneighbors(train_data)
    test_distances, _ = nbrs.kneighbors(test_data)

    # Training rows: column 0 is the zero distance to the point itself, so the
    # k real neighbours are columns 1..k.
    train_neighbor_distances = train_distances[:, 1:]
    # Test rows are not part of the fitted set, so column 0 is already a real
    # neighbour; the k nearest are columns 0..k-1. Dropping column 0 here
    # would discard the closest neighbour and score neighbours 2..k+1 instead.
    test_neighbor_distances = test_distances[:, :k]

    train_mean_distance_to_k_neighbors = train_neighbor_distances.mean(axis=1)
    train_max_distance_to_k_neighbors = train_neighbor_distances.max(axis=1)
    train_outlier_scores = train_max_distance_to_k_neighbors - train_mean_distance_to_k_neighbors

    test_mean_distance_to_k_neighbors = test_neighbor_distances.mean(axis=1)
    test_max_distance_to_k_neighbors = test_neighbor_distances.max(axis=1)
    test_outlier_scores = test_max_distance_to_k_neighbors - test_mean_distance_to_k_neighbors

    return train_outlier_scores, test_outlier_scores


In [None]:
# Persist the KNN outlier scores as single-column CSV files (column 'KNN', no index column).
# NOTE(review): train_outlier_scores / test_outlier_scores are not assigned in any visible
# cell — knn_based_outlier_detection only returns values with those names. Confirm the
# call that produces them runs before this cell.
train_knn_frame = pd.DataFrame(train_outlier_scores, columns=['KNN'])
train_knn_frame.to_csv('Train_KNN.csv', index=False)

test_knn_frame = pd.DataFrame(test_outlier_scores, columns=['KNN'])
test_knn_frame.to_csv('Test_KNN.csv', index=False)