# Load Data

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the pickled DataFrame that every later cell works from; the relative
# path is resolved against the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle("../data/mini_ember_df.p")

# Simple Pre-process

In [2]:
# Discard rows that contain any missing value. Reassigning (rather than
# mutating with inplace=True) keeps the data lineage explicit.
df = df.dropna()

# Keep only rows in which every column lies within 3 standard deviations of
# that column's mean.
# NOTE(review): the filter runs over all columns, including the label column
# "y" -- confirm that is intended.
# NOTE(review): a constant column yields NaN z-scores, which fail the `< 3`
# comparison and would therefore remove every row.
df = df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454 entries, 695061 to 251597
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   strings_entropy  454 non-null    float64
 1   num_strings      454 non-null    int64  
 2   file_size        454 non-null    int64  
 3   num_exports      454 non-null    int64  
 4   num_imports      454 non-null    int64  
 5   sizeof_code      454 non-null    int64  
 6   num_sections     454 non-null    int64  
 7   has_debug        454 non-null    int64  
 8   has_signature    454 non-null    int64  
 9   y                454 non-null    int64  
dtypes: float64(1), int64(9)
memory usage: 39.0 KB
None


# Organize Data

In [3]:
# Feature table: every column except the label column "y".
X = df.drop(columns=["y"])

# Ground-truth labels as a NumPy array built from the "y" column.
y_true = np.array(df["y"].tolist())

# Independent copy of the labels; later cells overwrite part of it while
# y_true is kept intact for scoring.
y_experiment = np.copy(y_true)

In [4]:
# Fixed seed so the same samples are hidden on every run.
rng = np.random.RandomState(seed=42)

# One uniform draw in [0, 1) per sample; draws below 0.3 select the samples
# whose label will be hidden.
random_unlabeled_points = rng.random_sample(len(y_experiment)) < 0.3

# Overwrite the selected labels in place with the sentinel -1.
np.putmask(y_experiment, random_unlabeled_points, -1)

In [5]:
# Bundle the partially-masked labels, the feature table and the full
# ground-truth labels into one dictionary so they can be saved together.
data = dict(y_experiment=y_experiment, X=X, y_true=y_true)

In [6]:
import pickle

# Write the bundle to disk. The context manager guarantees the file handle is
# flushed and closed even if pickling fails; a bare open(...) passed straight
# to pickle.dump is never explicitly closed.
with open("../data/example.p", "wb") as example_file:
    pickle.dump(data, example_file)

# Predict with RFoT

In [5]:
import warnings;
warnings.filterwarnings("ignore")

In [6]:
from RFoT import RFoT

# Constructor settings for RFoT, collected in one place so they are easy to
# scan and tweak. Their exact semantics are defined by the RFoT package.
# NOTE(review): decomp="cp_apr_gpu" together with n_gpus=2 presumably requires
# two available GPUs -- confirm against the RFoT documentation.
rfot_settings = {
    "bin_scale": 1,
    "min_dimensions": 3,
    "max_dimensions": 8,
    "component_purity_tol": 1.0,
    "rank": 2,
    "n_estimators": 200,
    "bin_entry": False,
    "clustering": "ms",
    "decomp": "cp_apr_gpu",
    "n_jobs": 2,
    "n_gpus": 2,
}
model = RFoT(**rfot_settings)

# y_experiment carries -1 for every sample whose label was hidden; the
# returned y_pred is scored against y_true in the Results section.
y_pred = model.predict(X, y_experiment)

100%|██████████| 200/200 [00:30<00:00,  6.48it/s]


# Results

In [7]:
from sklearn.metrics import f1_score

# Positions whose label was hidden (set to -1) before prediction.
unknown_indices = np.flatnonzero(y_experiment == -1)

# Among the hidden samples, the positions (relative to unknown_indices) for
# which the model returned a label instead of -1.
did_predict_indices = np.flatnonzero(y_pred[unknown_indices] != -1)

# Count abstentions over the hidden samples only, so the count and the
# percentage below are computed over the same population.
abstaining_count = len(unknown_indices) - len(did_predict_indices)

# Score only the hidden samples that received a prediction. When the model
# abstained on all of them there is nothing to score, so report NaN instead of
# calling f1_score on empty arrays.
if len(did_predict_indices) > 0:
    f1 = f1_score(
        y_true[unknown_indices][did_predict_indices],
        y_pred[unknown_indices][did_predict_indices],
        average="weighted",
    )
else:
    f1 = float("nan")

# Guard against a run in which no labels were hidden (avoids division by zero).
if len(unknown_indices) > 0:
    percent_abstaining = (abstaining_count / len(unknown_indices)) * 100
else:
    percent_abstaining = 0.0

print("------------------------")
print("Num. of Abstaining", abstaining_count)
print("Percent Abstaining", percent_abstaining, "%")
print("F1=", f1)

------------------------
Num. of Abstaining 124
Percent Abstaining 89.20863309352518 %
F1= 0.8666666666666667
