In [1]:
from itertools import islice
import csv
from pathlib import Path

import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope

In [2]:
def conv_name(old_name):
    """Convert a ``<path>.txt,<signature>`` record into ``<path>.kt:<signature>``.

    Args:
        old_name: A record string such as ``'pkg/Foo.txt,bar(Int)'``.

    Returns:
        The text before the separator with ``.kt`` appended, a colon, and the
        text after the separator, e.g. ``'pkg/Foo.kt:bar(Int)'``.

    Raises:
        ValueError: If ``old_name`` does not contain the ``'.txt,'`` separator.
    """
    # Split on the FIRST separator only, so a signature that itself happens to
    # contain '.txt,' no longer breaks the two-way unpacking.
    file_path, separator, method_sign = old_name.partition('.txt,')
    if not separator:
        raise ValueError(
            "expected '<path>.txt,<signature>', got {!r}".format(old_name)
        )
    return file_path + '.kt' + ':' + method_sign

In [3]:
# Inputs: a space-delimited numeric matrix and a parallel text file holding one
# '<path>.txt,<signature>' record per line. Both paths are left blank here.
data_path = ''
names_path = ''
size_limit = 200000  # upper bound on the rows/lines read from each file

X = np.loadtxt(data_path, delimiter=' ', max_rows=size_limit, dtype=np.float32)

with open(names_path, encoding='utf8') as names_file:
    names = [conv_name(i.strip()) for i in islice(names_file, size_limit)]

# The two files are read independently; if their lengths differ, every row
# after the first discrepancy would be paired with the wrong name, so fail
# loudly instead of producing a silently shifted report.
if len(names) != X.shape[0]:
    raise ValueError(
        'data/names length mismatch: {} data rows vs {} names'.format(X.shape[0], len(names))
    )

# Drop every row containing at least one NaN, keeping `names` aligned with `X`.
no_nans_mask = ~np.isnan(X).any(axis=1)
no_nans_indices = np.argwhere(no_nans_mask).flatten()
X = X[no_nans_mask]
names = [names[i] for i in no_nans_indices]

# Project onto 64 principal components. The seed is fixed so that the result
# is reproducible when PCA selects a randomized SVD solver.
pca = PCA(64, random_state=42)
X = pca.fit_transform(X)
print(pca.explained_variance_ratio_.sum())

0.9258978050096849


In [4]:
def save_anomalies(anomaly_indices, save_path, item_names=None):
    """Write the selected items to a two-column CSV file.

    Each output row is ``<index + 1>,<name>``.

    Args:
        anomaly_indices: Iterable of 0-based integer positions into
            ``item_names``.
        save_path: ``pathlib.Path`` of the CSV file to create or overwrite.
        item_names: Sequence of names indexed by ``anomaly_indices``. When
            omitted, the notebook-level ``names`` list is used.

    NOTE(review): the written index is the position in the name list plus one.
    If that list was filtered before detection (e.g. rows with NaNs removed),
    this is not the line number in the original input file -- confirm which
    one consumers of the CSV expect.
    """
    if item_names is None:
        item_names = names  # fall back to the notebook-level list

    # newline='' is required by the csv module so the writer controls line
    # endings itself (otherwise rows end in '\r\r\n' on Windows); utf8 matches
    # the encoding the names file is read with.
    with save_path.open('w', newline='', encoding='utf8') as out_file:
        out_writer = csv.writer(out_file, delimiter=',')
        out_writer.writerows(
            (anomaly_index + 1, item_names[anomaly_index])
            for anomaly_index in anomaly_indices
        )

In [5]:
# Directory that receives the anomaly CSV files (left blank here). It is
# created up front, including missing parents; an existing directory is fine.
saving_dir = Path('')
saving_dir.mkdir(exist_ok=True, parents=True)

In [12]:
# Local Outlier Factor over the projected data, using cosine distance and
# 10 neighbours.
lof_clf = LocalOutlierFactor(
    n_neighbors=10,
    algorithm='auto',
    metric='cosine',
    p=2,
    contamination=0.0005,
    n_jobs=-1,
)
lof_marks = lof_clf.fit_predict(X)

# Rows with a negative label are the ones reported as anomalies.
lof_anomalies = np.argwhere(lof_marks < 0).flatten()
save_anomalies(
    lof_anomalies,
    saving_dir / 'lof_anomalies_cosine_10.csv',
)

In [6]:
# Isolation Forest over the projected data: 200 trees, fixed seed.
if_clf = IsolationForest(
    n_estimators=200,
    contamination=0.0005,
    max_samples='auto',
    random_state=42,
    n_jobs=-1,
)
if_marks = if_clf.fit_predict(X)

# Rows with a negative label are the ones reported as anomalies.
if_anomalies = np.argwhere(if_marks < 0).flatten()
save_anomalies(
    if_anomalies,
    saving_dir / 'if_anomalies.csv',
)

In [7]:
# Elliptic envelope over the projected data. The seed is fixed (same value as
# the IsolationForest cell) so that repeated runs of the notebook produce the
# same anomaly list.
elliptic_clf = EllipticEnvelope(contamination=0.0005, support_fraction=0.7, random_state=42)
elliptic_marks = elliptic_clf.fit_predict(X)

# Rows with a negative label are the ones reported as anomalies.
elliptic_anomalies = np.argwhere(elliptic_marks < 0).flatten()
save_anomalies(elliptic_anomalies, saving_dir / 'elliptic_anomalies.csv')