In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import sys
import time
from functools import reduce

from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import scale
from sklearn.covariance import EllipticEnvelope

dataset_name = "feb18_51"
is_drawing = False

csv_in_path = f"../data/{dataset_name}.csv"
out_path = f"../out-data/{dataset_name}_ee/"
log_path = f"{out_path}methods.log"

if not os.path.exists(out_path):
    os.makedirs(out_path)

def parse_timediff(timediff):
    h = timediff // 3600
    m = timediff % 3600 // 60
    s = timediff % 60
    return h, m, s

In [None]:
# Load input
methods = pd.read_csv(csv_in_path, header=0, delimiter='\t', quoting=csv.QUOTE_NONE, error_bad_lines=True,
                          engine='python')

In [None]:
info_columns = np.array(methods.values[:, :2])
print(f"info_columns.shape = {info_columns.shape}")
X = np.array(methods.values[:, 2:], dtype="float16")
print(f"X.shape = {X.shape}")
n_methods = X.shape[0]

has_bad_lines = reduce(lambda a, x: a | x, [np.isnan(row).any() for row in X])
assert not has_bad_lines

del methods

In [None]:
X = scale(X)

In [None]:
df_info = pd.DataFrame(info_columns)
df_info.to_csv(f"../data/info_51.csv", header=False, index=False)
df_X = pd.DataFrame(X)
df_X.to_csv(f"../data/X_51_scaled.csv", header=False, index=False)
del df_info
del df_X

In [2]:
df_X = pd.read_csv(f"../data/X_51_scaled.csv", header=None, index_col=None)
X = np.array(df_X, dtype="float32")
del df_X

df_info = pd.read_csv(f"../data/info_51.csv", header=None, index_col=None)
info_columns = np.array(df_info)
del df_info

In [3]:
X_pca = PCA(n_components=20).fit_transform(X)
del X

In [13]:
n_methods = X_pca.shape[0]
contamination = 0.00005
clf_desc = f"ee_cont-{contamination}"
print(f"{clf_desc}")

clf = EllipticEnvelope(contamination=contamination)

all_indices = np.arange(0, n_methods)

local_start = time.time()

clf.fit(X_pca)

hours, minutes, seconds = parse_timediff(time.time() - local_start)
print(f"Elapsed time: {hours} h. {minutes} min. {seconds} sec.\n")

ee_cont-5e-05
Elapsed time: 0.0 h. 21.0 min. 29.905561923980713 sec.



In [14]:
marks = clf.predict(X_pca)

In [None]:
neg_lof=np.array(clf.negative_outlier_factor_)
neg_lof.shape

In [None]:
np.savetxt(f"../out-data/{dataset_name}_lof_custom/negative_o_f_{clf_desc}.txt", neg_lof)

In [None]:
marks = neg_lof.copy()
threshold = 0.02
marks[lof <= np.percentile(lof_y_pred, threshold)] = -1
marks[lof > np.percentile(lof_y_pred, threshold)] = 1

In [15]:
# inlier_indices = np.asarray([mark > 0 for mark in bool_marks])
outlier_indices = np.asarray([mark < 0 for mark in bool_marks])

# info_inliers = info_columns[inlier_indices]
info_outliers = info_columns[outlier_indices]
# n_inliers = info_inliers.shape[0]
n_outliers = info_outliers.shape[0]
# print(f"Inliers:\t{n_inliers:10}/{n_methods:10}\t{(n_inliers * 100 / n_methods):11.7}%")
print(f"Outliers:\t{n_outliers:10}/{n_methods:10}\t{(n_outliers * 100 / n_methods):11.7}%")

Outliers:	       405/   4044790	 0.01001288%


In [10]:
marks

array([1, 1, 1, ..., 1, 1, 1])

In [16]:
# outliers = methods.values[outlier_indices]
dataframe = pd.DataFrame(info_outliers)
dataframe.to_csv(f"{out_path}{clf_desc}_{n_outliers}.csv", header=False, index=False)