In [2]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas
import re
import sys
import time
from functools import reduce

# noinspection PyUnresolvedReferences
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import scale
from sklearn.svm import OneClassSVM

# Run configuration: every path below is derived from the dataset name.
dataset_name = "feb18_51"
# Plotting toggle; none of the cells visible here read it.
is_drawing = False

csv_in_path = f"../data/{dataset_name}.csv"
out_path = f"../out-data/{dataset_name}_lof_custom/"
log_path = f"{out_path}methods.log"

# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(out_path, exist_ok=True)
# 'w+' truncates any previous log. The handle is kept open for log() and is
# closed explicitly once the results have been written.
log_file = open(log_path, mode='w+')


def log(s, file=None):
    """Print a message and append it, newline-terminated, to the run log.

    Parameters
    ----------
    s : object
        Message to record. It is rendered with ``str()`` for both destinations,
        so non-string values (numbers, arrays, ...) no longer raise ``TypeError``
        when written to the log.
    file : text file object, optional
        Destination for the persisted copy. Defaults to the module-level
        ``log_file`` opened in the configuration cell.

    Raises
    ------
    ValueError
        If the destination file has already been closed.
    """
    target = log_file if file is None else file
    print(s)
    # flush=True pushes each entry to the file immediately, so the log survives
    # a kernel crash or an interrupted long-running cell.
    print(s, file=target, flush=True)


def parse_timediff(timediff):
    """Split a duration in seconds into hours, minutes and seconds.

    Parameters
    ----------
    timediff : int or float
        Non-negative duration in seconds, e.g. a difference of two
        ``time.time()`` readings.

    Returns
    -------
    tuple
        ``(hours, minutes, seconds)``. Hours and minutes are whole numbers
        returned as ``int``, so they format as "2 h. 37 min." rather than
        "2.0 h. 37.0 min.". Seconds keep the type of the input, so fractional
        seconds are preserved.
    """
    h = int(timediff // 3600)
    m = int(timediff % 3600 // 60)
    s = timediff % 60
    return h, m, s

In [3]:
# Load input
# The file is tab-separated with a header row; QUOTE_NONE makes the parser
# treat quote characters as ordinary data.
# `error_bad_lines=True` was removed from the call: it only restated the
# default (raise on malformed rows) and the keyword no longer exists in
# pandas 2.x, where passing it is a TypeError. Omitting it keeps the same
# strict behaviour on old and new pandas alike.
methods = pandas.read_csv(csv_in_path, header=0, delimiter='\t', quoting=csv.QUOTE_NONE,
                          engine='python')

In [6]:
# Materialise the frame as one array a single time; each `.values` access on
# the DataFrame may rebuild the full array, which is costly at this size.
raw_values = methods.values

# The first two columns are kept untouched as per-row info; every remaining
# column is treated as a numeric feature.
info_columns = np.array(raw_values[:, :2])
print(f"info_columns.shape = {info_columns.shape}")
# NOTE(review): float16 tops out at 65504 and keeps roughly 3 significant
# digits - confirm that the feature ranges fit before relying on this dtype.
X = np.array(raw_values[:, 2:], dtype="float16")
print(f"X.shape = {X.shape}")
n_methods = X.shape[0]

# Vectorised NaN check: one pass in NumPy instead of a Python-level loop over
# every row, and it also works when X has no rows.
has_bad_lines = bool(np.isnan(X).any())
assert not has_bad_lines, "feature matrix contains NaN values"

# Free the large intermediates; only info_columns and X are needed from here on.
del raw_values
del methods

info_columns.shape = (4044790, 2)
X.shape = (4044790, 51)


In [7]:
# Standardise the feature matrix column-wise; the keyword arguments are the
# library defaults, spelled out for the reader.
X = scale(X, axis=0, with_mean=True, with_std=True, copy=True)

In [9]:
# Persist the info columns and the scaled feature matrix as header-less,
# index-less CSV files. The temporary frames are never bound to a name, so
# there is nothing to delete afterwards.
pandas.DataFrame(info_columns).to_csv("../data/info_51.csv", header=False, index=False)
pandas.DataFrame(X).to_csv("../data/X_51_scaled.csv", header=False, index=False)

In [32]:
# Reload the scaled feature matrix from disk and hold it as float32; the
# intermediate DataFrame is consumed directly and never bound to a name.
X = np.array(
    pandas.read_csv("../data/X_51_scaled.csv", header=None, index_col=None),
    dtype="float32",
)

In [33]:
# Project the feature matrix onto its first 20 principal components.
# random_state is pinned because, depending on the scikit-learn version and
# data shape, the default solver can be the randomized SVD; without a seed the
# projection (and every score derived from it) changes between runs.
X_pca = PCA(n_components=20, random_state=0).fit_transform(X)
# The full-width matrix is not used after the projection; release it.
del X

In [35]:
# Detector configuration. `clf_name` is defined here because no other cell
# visible in this notebook defines it; the recorded output ("lof_pca-20_...")
# shows the value it had when the cell was run.
clf_name = "lof"
n_neighbors = 20
contamination = 0.001

# The description is used in output file names. It is built from the actual
# parameters so it cannot drift from the estimator configured below; with the
# current values it is "lof_pca-20_n-20_cont-0.001".
clf_desc = f"{clf_name}_pca-{X_pca.shape[1]}_n-{n_neighbors}_cont-{contamination}"
print(clf_desc)

clf = LocalOutlierFactor(n_neighbors=n_neighbors, algorithm='auto', leaf_size=30,
                         metric='minkowski', p=2, metric_params=None,
                         contamination=contamination, n_jobs=-1)

all_indices = np.arange(0, n_methods)

# Time the fit: on the full dataset this is the expensive step of the notebook
# (the recorded run took more than two and a half hours).
local_start = time.time()

# fit_predict labels each row 1 (inlier) or -1 (outlier) using `contamination`.
marks = clf.fit_predict(X_pca)

hours, minutes, seconds = parse_timediff(time.time() - local_start)
print(f"Elapsed time: {hours} h. {minutes} min. {seconds} sec.\n")

lof_pca-20_n-20_cont-0.001
Elapsed time: 2.0 h. 37.0 min. 42.836100816726685 sec.



ValueError: I/O operation on closed file.

In [46]:
# Take an independent copy of the fitted per-sample scores so later cells can
# work on them without touching the estimator's own array.
neg_lof = np.array(clf.negative_outlier_factor_, copy=True)
neg_lof.shape

(4044790,)

In [45]:
# Store the raw scores next to the other outputs of this run. `out_path`
# already holds "../out-data/{dataset_name}_lof_custom/", so reusing it keeps
# this file in the same directory as the log and the outlier CSV even if the
# output location is changed in the configuration cell.
np.savetxt(f"{out_path}negative_o_f_{clf_desc}.txt", neg_lof)

In [52]:
# Re-label the samples with a custom cut-off on the LOF scores instead of the
# estimator's `contamination` setting.
# `threshold` is a percentile expressed in percent: 0.02 means the lowest
# 0.02 % of the scores (more on ties, because the comparison is inclusive).
threshold = 0.02
# The original cell compared against `lof` / `lof_y_pred`, names that no cell
# in this notebook defines; presumably they were earlier names for `neg_lof`
# (verify if results must match an older run). The percentile is computed once
# rather than once per assignment.
cutoff = np.percentile(neg_lof, threshold)
# -1.0 marks an outlier, 1.0 an inlier; float labels match the dtype the
# original copy of `neg_lof` produced.
marks = np.where(neg_lof <= cutoff, -1.0, 1.0)

In [53]:
# noinspection PyUnboundLocalVariable
# Boolean masks over all samples: positive marks are inliers, negative marks
# are outliers. Vectorised comparisons replace a Python-level loop over every
# mark.
inlier_indices = np.asarray(marks) > 0
outlier_indices = np.asarray(marks) < 0

info_inliers = info_columns[inlier_indices]
info_outliers = info_columns[outlier_indices]
n_inliers = info_inliers.shape[0]
n_outliers = info_outliers.shape[0]
print(f"Inliers:\t{n_inliers:10}/{n_methods:10}\t{(n_inliers * 100 / n_methods):11.7}%")
print(f"Outliers:\t{n_outliers:10}/{n_methods:10}\t{(n_outliers * 100 / n_methods):11.7}%")

# The smaller group is treated as the outliers. The original branch swapped
# `X_inliers` / `X_outliers`, which are not defined anywhere in this notebook,
# and left the info rows and counts (the values that are saved afterwards)
# untouched. Swap everything derived from the labels instead.
if n_outliers > n_inliers:
    inlier_indices, outlier_indices = outlier_indices, inlier_indices
    info_inliers, info_outliers = info_outliers, info_inliers
    n_inliers, n_outliers = n_outliers, n_inliers
    print("\tSwapped 'inliers' and 'outliers', because there were more outliers than inliers!")

Inliers:	   4043796/   4044790	   99.97543%
Outliers:	       994/   4044790	 0.02457482%


In [54]:
# Write the info rows of the detected outliers for this configuration; the
# file name carries the configuration description and the outlier count.
outliers_csv_path = f"{out_path}{clf_desc}_{n_outliers}.csv"
pandas.DataFrame(info_outliers).to_csv(outliers_csv_path, header=False, index=False)

# Release the log handle opened in the configuration cell; log() cannot be
# used after this point without reopening the file.
log_file.close()