In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas
import re
import sys
import time

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.ensemble import IsolationForest

dataset_name = "feb18_51"

csv_in_path = f"../data/{dataset_name}.csv"
out_path = f"../out-data/{dataset_name}/if/"

info_columns_path = f"../data/info_51.csv"
X_scaled_path = f"../data/X_51_scaled.csv"

if not os.path.exists(out_path):
    os.makedirs(out_path)

def parse_timediff(timediff):
    h = timediff // 3600
    m = timediff % 3600 // 60
    s = timediff % 60
    return h, m, s

In [7]:
# Load input
methods = pandas.read_csv(csv_in_path, header=0, delimiter='\t', quoting=csv.QUOTE_NONE, error_bad_lines=True,
                          engine='python')

In [None]:
# Extract info_columns and X

info_columns = np.array(methods.values[:, :2])
print(f"info_columns.shape = {info_columns.shape}")
X = np.array(methods.values[:, 2:], dtype="float16")
print(f"X.shape = {X.shape}")
n_methods = X.shape[0]

has_bad_lines = reduce(lambda a, x: a | x, [np.isnan(row).any() for row in X])
assert not has_bad_lines

del methods

In [None]:
X = scale(X)

In [None]:
# Save info_columns and scaled X

df_info = pandas.DataFrame(info_columns)
df_info.to_csv(info_columns_path, header=False, index=False)
df_X = pandas.DataFrame(X)
df_X.to_csv(f"../data/X_51_scaled.csv", header=False, index=False)
del df_info
del df_X

In [2]:
# Load scaled X

df_X = pandas.read_csv(X_scaled_path, header=None, index_col=None)
X = np.array(df_X, dtype="float32")
del df_X

In [None]:
# PCA

X_pca = PCA(n_components=20).fit_transform(X)
del X

In [4]:
n_estimators = 200
contamination = 0.0001
clf_desc = f"if_pca-20_n_estimators-{n_estimators}_cont-{contamination}"
print(f"{clf_desc}")

clf = IsolationForest(n_estimators=n_estimators, contamination=contamination, max_samples='auto',
                      random_state=42, n_jobs=-1)

local_start = time.time()

clf.fit(X)

hours, minutes, seconds = parse_timediff(time.time() - local_start)
print(f"Elapsed time: {hours} h. {minutes} min. {seconds} sec.\n")

if_pca-20_n_estimators-200_cont-0.0001
Elapsed time: 0.0 h. 9.0 min. 32.84821391105652 sec.



In [5]:
marks = clf.predict(X)

In [9]:
print(marks.shape)
outlier_indices = np.asarray([mark < 0 for mark in marks])
outliers = methods[outlier_indices]

n_methods = methods.shape[0]
n_outliers = outliers.shape[0]
print(f"Outliers:\t{n_outliers:10}/{n_methods:10}\t{(n_outliers * 100 / n_methods):11.7}%")

(4044790,)
Outliers:	       405/   4044790	 0.01001288%


In [13]:
# Save output of this configuration to file

dataframe = pandas.DataFrame(outliers)
dataframe.to_csv(f"{out_path}{clf_desc}_{n_outliers}.csv", header=False, index=False)

In [14]:
outliers.head()

Unnamed: 0,id,methodName,sloc,relativeLoc,nodeCount,cstHeight,maxLoopNestingDepth,cyclomaticComplexity,designComplexity,numTypecastExpr,...,numOperationReferences,numThrows,numSafeExpressions,numClassLiterals,numCollectionLiterals,numZeroConstants,numOneConstants,numEmptyStringLiterals,numPlusOperations,avgNumWhenEntries
15744,15745,repos/skkinsi__ExReader/app/src/main/java/com/...,148.0,0.986928,3323.0,41.0,2.0,16.0,16.0,0.0,...,51.0,0.0,0.0,0.0,0.0,7.0,8.0,1.0,19.0,0.0
28344,28345,repos/sammy1997__BOSM17/app/src/main/java/com/...,158.0,0.908046,3236.0,59.0,1.0,9.0,6.0,6.0,...,51.0,0.0,0.0,0.0,0.0,13.0,8.0,1.0,0.0,0.0
28346,28347,repos/sammy1997__BOSM17/app/src/main/java/com/...,141.0,0.952703,2792.0,39.0,1.0,11.0,10.0,5.0,...,39.0,0.0,0.0,0.0,0.0,7.0,5.0,0.0,0.0,0.0
28896,28897,repos/filipw01__KotlinTutorial/app/src/main/ja...,164.0,0.816832,3297.0,16.0,1.0,20.0,12.0,2.0,...,51.0,1.0,0.0,0.0,0.0,10.0,22.0,0.0,11.0,3.0
43269,43270,repos/TinJenda__ChatKotlin/app/src/main/java/c...,284.0,0.916149,5211.0,89.0,3.0,1.0,1.0,17.0,...,194.0,0.0,0.0,6.0,0.0,8.0,14.0,9.0,11.0,0.0
