In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas
import re
import sys
import time
from functools import reduce

# noinspection PyUnresolvedReferences
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import scale
from sklearn.svm import OneClassSVM

# Run configuration: which dataset to process and where inputs/outputs live.
dataset_name = "feb18_51"
is_drawing = False

# Input table and the output directory are both derived from the dataset name.
csv_in_path = "../data/" + dataset_name + ".csv"
out_path = "../out-data/" + dataset_name + "/lof/"

def parse_timediff(timediff):
    """Split a duration in seconds into hours, minutes and seconds.

    Parameters
    ----------
    timediff : int or float
        Duration in seconds, e.g. the difference of two ``time.time()`` values.

    Returns
    -------
    tuple
        ``(h, m, s)`` where ``h`` and ``m`` are whole numbers returned as
        ``int`` and ``s`` is the remaining seconds, keeping any fractional
        part of the input.
    """
    # Floor division on a float yields an integral float (e.g. 2.0); convert
    # so that reports read "2 h. 37 min." instead of "2.0 h. 37.0 min.".
    h = int(timediff // 3600)
    m = int(timediff % 3600 // 60)
    s = timediff % 60
    return h, m, s

In [2]:
# Load input
# Tab-separated table with a header row; quoting is disabled so quote characters
# are read as ordinary data. `error_bad_lines=True` used to be passed here, but it
# only restated the default (raise on malformed rows) and the keyword was
# deprecated in pandas 1.3 and removed in 2.0, so it is omitted to keep the cell
# runnable across pandas versions.
methods = pandas.read_csv(csv_in_path, header=0, delimiter='\t', quoting=csv.QUOTE_NONE,
                          engine='python')

In [6]:
# Split the loaded table: the first two columns identify a method, the remaining
# columns are the numeric features. Slicing with `iloc` before converting avoids
# materialising the whole mixed-dtype frame as one object array (which
# `methods.values` does) just to take a column slice of it.
info_columns = np.array(methods.iloc[:, :2])
print(f"info_columns.shape = {info_columns.shape}")
# NOTE(review): float16 overflows to inf above 65504 and keeps only about three
# significant decimal digits -- confirm this is acceptable for every feature.
X = np.array(methods.iloc[:, 2:], dtype="float16")
print(f"X.shape = {X.shape}")
n_methods = X.shape[0]

# Vectorised NaN check over the whole matrix (also well defined for an empty X).
has_bad_lines = bool(np.isnan(X).any())
assert not has_bad_lines

# The raw frame is no longer needed; drop the reference to release its memory.
del methods

info_columns.shape = (4044790, 2)
X.shape = (4044790, 51)


In [7]:
# Standardise the feature matrix with sklearn.preprocessing.scale (default arguments).
# NOTE: X is overwritten in place of the raw features, so re-running this cell
# applies the scaling again to already-scaled data.
X = scale(X)

In [9]:
# Persist the identifying columns and the scaled feature matrix as header-less,
# index-less CSV files so later cells can reload them without repeating the
# steps above. No temporary frames are kept around afterwards.
pandas.DataFrame(info_columns).to_csv("../data/info_51.csv", header=False, index=False)
pandas.DataFrame(X).to_csv("../data/X_51_scaled.csv", header=False, index=False)

In [32]:
# Reload the scaled feature matrix written earlier and hold it as float32.
X = np.array(
    pandas.read_csv("../data/X_51_scaled.csv", header=None, index_col=None),
    dtype="float32",
)

In [33]:
# Project the scaled features onto their first 20 principal components.
# random_state is pinned so that the projection is reproducible between runs
# whenever scikit-learn selects a randomized SVD solver.
X_pca = PCA(n_components=20, random_state=0).fit_transform(X)
# Only the projection is used from here on; release the full feature matrix.
del X

In [14]:
# Label for the current configuration; it is printed before fitting and embedded
# in the names of the output files. It is a plain string, so it has to be kept
# in sync by hand with the PCA / LocalOutlierFactor arguments used in this notebook.
clf_desc = f"lof_pca-20_n-20_cont-0.001"

In [35]:
print(clf_desc)

# Local Outlier Factor detector; every argument is spelled out explicitly so the
# configuration can be compared against `clf_desc` at a glance.
clf = LocalOutlierFactor(
    n_neighbors=20,
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    p=2,
    metric_params=None,
    contamination=0.001,
    n_jobs=-1,
)

all_indices = np.arange(n_methods)

# Time the fit; the recorded run of this cell reports roughly 2 h 37 min.
local_start = time.time()
marks = clf.fit_predict(X_pca)
elapsed = time.time() - local_start

hours, minutes, seconds = parse_timediff(elapsed)
print(f"Elapsed time: {hours} h. {minutes} min. {seconds} sec.\n")

lof_pca-20_n-20_cont-0.001
Elapsed time: 2.0 h. 37.0 min. 42.836100816726685 sec.



ValueError: I/O operation on closed file.

In [46]:
# Take an independent copy of the fitted detector's negative outlier factors
# and display its shape as the cell output.
neg_lof = np.array(clf.negative_outlier_factor_, copy=True)
neg_lof.shape

(4044790,)

In [45]:
# Save the negative outlier factors under `out_path`, which is the directory the
# reload cell reads them back from (".../{dataset_name}/lof/"). Previously this
# wrote to ".../{dataset_name}_lof_custom/", so a fresh run could not find the
# file it had just produced. The directory is created first if it is missing.
os.makedirs(out_path, exist_ok=True)
np.savetxt(f"{out_path}negative_o_f_{clf_desc}.txt", neg_lof)

In [3]:
# Reload previously saved negative outlier factors from a fixed location, so the
# thresholding below can run without refitting the detector.
neg_lof_path = "../out-data/feb18_51/lof/negative_o_f_lof_pca-20_n-20_cont-0.001.txt"
neg_lof = np.loadtxt(fname=neg_lof_path)

In [9]:
# Turn the scores into +1 / -1 marks by cutting at a low percentile.
# `threshold` is expressed in percent (np.percentile takes q in [0, 100]), so
# 0.01 marks the lowest 0.01 % of the scores as outliers.
threshold = 0.01
# Compute the cut-off once instead of once per mask.
cutoff = np.percentile(neg_lof, threshold)

marks = neg_lof.copy()
marks[neg_lof <= cutoff] = -1  # The lower, the more abnormal
marks[neg_lof > cutoff] = 1

In [10]:
# Boolean mask of the rows marked as outliers (negative mark). The comparison is
# vectorised rather than evaluated element by element in a Python loop.
outlier_indices = np.asarray(marks) < 0

# Select the flagged rows and report how many there are relative to the total.
outliers = methods[outlier_indices]
n_outliers = outliers.shape[0]
n_methods = methods.shape[0]
print(f"Outliers:\t{n_outliers:10}/{n_methods:10}\t{(n_outliers * 100 / n_methods):11.7}%")

Outliers:	       405/   4044790	 0.01001288%


In [15]:
# Write the outliers found with this configuration to a CSV file whose name
# carries the configuration label and the number of outliers.
dataframe = pandas.DataFrame(outliers)
outliers_csv_path = f"{out_path}{clf_desc}_{n_outliers}.csv"
dataframe.to_csv(outliers_csv_path, header=False, index=False)

In [3]:
# Read the header-less info file for the "ee" outliers and show its last rows.
ee_info_outliers = pandas.read_csv(
    "../out-data/feb18_51/ee/info_only/ee_cont-0.0001_405.csv",
    header=None,
    index_col=None,
)
ee_info_outliers.tail()

Unnamed: 0,0,1
400,76191,repos/czyczk__zzzz-supermarket/src/main/kotlin...
401,79255,repos/diefferson__indoor-android/app/src/main/...
402,87612,repos/pdahlberg__ark2mail/src/main/kotlin/com/...
403,87975,repos/tkakisu__hajiboot-kotlin/src/main/kotlin...
404,87977,repos/tkakisu__hajiboot-kotlin/src/main/kotlin...


In [4]:
# Column 0 of the info file holds method ids; subtracting 1 turns them into the
# row labels of `methods` (in the recorded output, id 76191 sits at label 76190).
# The subtraction is vectorised instead of applied element by element.
ee_outlier_indices = ee_info_outliers.iloc[:, 0] - 1
# NOTE(review): the recorded outputs show different methodName values for the
# same id in the info file and in `methods` -- verify that both were produced
# from the same dataset before relying on this lookup.
ee_outliers = methods.loc[ee_outlier_indices]
ee_outliers.tail()

Unnamed: 0,id,methodName,sloc,relativeLoc,nodeCount,cstHeight,maxLoopNestingDepth,cyclomaticComplexity,designComplexity,numTypecastExpr,...,numOperationReferences,numThrows,numSafeExpressions,numClassLiterals,numCollectionLiterals,numZeroConstants,numOneConstants,numEmptyStringLiterals,numPlusOperations,avgNumWhenEntries
76190,76191,repos/eugenkiss__kotlinfx/kotlinfx-core/src/ma...,2.0,0.000547,116.0,12.0,0.0,1.0,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79254,79255,repos/TPT-Logisim__LogisimKR-Kotlin/src/com/cb...,7.0,0.145833,81.0,11.0,0.0,2.0,2.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87611,87612,repos/chengXyuan__Projects/app/src/main/java/c...,9.0,0.1125,155.0,15.0,0.0,1.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87974,87975,repos/chengXyuan__Projects/app/src/main/java/c...,14.0,0.318182,237.0,16.0,0.0,2.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87976,87977,repos/chengXyuan__Projects/app/src/main/java/c...,1.0,0.022727,38.0,9.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Write the full feature rows of the "ee" outliers next to the info-only file,
# without a header row and without the index column.
ee_outliers.to_csv("../out-data/feb18_51/ee/ee_cont-0.0001_405.csv", header=False, index=False)

In [6]:
# Show the first row labels derived from the first info file, for comparison
# with the ones derived from the second info file.
ee_outlier_indices.head()

0     5392
1     5395
2    11280
3    11281
4    11282
Name: 0, dtype: int64

In [7]:
# Same lookup for a second info file (contamination 5e-05): read it, take the
# ids from column 0 and shift them by one to obtain row labels. The shift is a
# vectorised subtraction rather than a per-element `apply`.
ee_info_outliers_2 = pandas.read_csv(
    "../out-data/feb18_51/ee/info_only/ee_cont-5e-05_405.csv",
    header=None,
    index_col=None,
)
ee_outlier_indices_2 = ee_info_outliers_2.iloc[:, 0] - 1
ee_outlier_indices_2.head()

0     5392
1     5395
2    11280
3    11281
4    11282
Name: 0, dtype: int64