In [1]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import data_import
from sklearn.cluster import KMeans
from data_preprocessing import KMeansDimensionalityReduction

Load data from the `npy` objects 

In [2]:
# Import data from Contagio dataset
X_tot, df_tot, X_train, X_test, y_train, y_test = data_import.import_data_train_test('Contagio')

Visualize the scatter matrix to see which features are correlated:

In [3]:
#scatter_matrix(df_tot, c=df_tot['malware'], figsize=(30,20))

## Dimensionality reduction using MAIORCA approach

In [4]:
KMDimRed = KMeansDimensionalityReduction(df_tot)
df_tot_km, X_tot_km, X_train_km = KMDimRed.fit_ben_mal_kmeans(df_tot, X_tot, X_train)
X_test_km = KMDimRed.transform_ben_mal_kmeans(X_test)

KNN using original features:

In [5]:
from sklearn.neighbors import KNeighborsClassifier
import model_evaluator
knn_clf = KNeighborsClassifier(n_neighbors=1)
knn_clf.fit(X_train, y_train)
y_test_predicted = knn_clf.predict(X_test).reshape(-1,1)
print('Accuracy on testing: %f%%' % (model_evaluator.compute_accuracy(y_test, y_test_predicted)*100))
y_train_predicted = knn_clf.predict(X_train).reshape(-1,1)
print('Accuracy on training: %f%%' % (model_evaluator.compute_accuracy(y_train, y_train_predicted)*100))

Accuracy on testing: 99.678377%
Accuracy on training: 99.993815%


KNN using kmeans extracted features BEN or MAL:

In [6]:
from sklearn.neighbors import KNeighborsClassifier
import model_evaluator
knn_clf = KNeighborsClassifier(n_neighbors=1)
knn_clf.fit(X_train_km, y_train)
y_test_predicted = knn_clf.predict(X_test_km).reshape(-1,1)
print('Accuracy on testing: %f%%' % (model_evaluator.compute_accuracy(y_test, y_test_predicted)*100))
y_train_predicted = knn_clf.predict(X_train_km).reshape(-1,1)
print('Accuracy on training: %f%%' % (model_evaluator.compute_accuracy(y_train, y_train_predicted)*100))

Accuracy on testing: 99.703117%
Accuracy on training: 99.907218%


In [8]:
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
pca = PCA(n_components=2)
X_test =  pca.fit(X_train).transform(X_test)
X_train = pca.fit(X_train).transform(X_train)
fig1 = px.scatter(x=X_train[y_train.values.ravel()==False, 0], y=X_train[y_train.values.ravel()==False, 1])
fig1.update_traces(marker=dict(color='green'))
fig2 = px.scatter(x=X_train[y_train.values.ravel()==True, 0], y=X_train[y_train.values.ravel()==True, 1])
fig2.update_traces(marker=dict(color='red'))
f = X_train[y_train.values.ravel()==False, 0].shape[0]
t = X_train[y_train.values.ravel()==True, 0].shape[0]
#plt.scatter(X_train[y_train.values.ravel()==False, 0], np.ones(f), alpha=0.05)
#plt.scatter(X_train[y_train.values.ravel()==True, 0], np.ones(t), alpha=0.05)
#px.xlim([-75, 0])
#px.ylim([-50, 50])
fig3 = go.Figure(data=fig1.data + fig2.data)
fig3.show()

AttributeError: 'numpy.ndarray' object has no attribute 'values'

# Class separability analysis
$S_b = \frac{1}{N}\sum_{i=1}^{c}n_c(\mu_i-\mu)(\mu_i-\mu)^T$\
$S_w = frac{1}{N}\sum_{i=1}^{c}\sum_{j=1}^{n_c}\,(x_{ij}-\mu_i)(x_{ij}-\mu_i)^T$\
Property: $S_w + S_b = C$\
1. Class Scatter Matrices (CSM)\
$J = \frac{tr\{S_b\}}{tr\{S_w\}}$\
J is an unbounded measure. The larger the value
of J the smaller the within class scatter as
compared to the between class scatter.

In [None]:
def computeSb(X, y):
    X_1 = X[y==True, :]
    X_0 = X[y==False, :]
    N_0 = X_0.shape[0]
    N_1 = X_1.shape[0]
    N = X.shape[0]
    mu_1 = np.mean(X_1, axis=0).reshape(-1,1)
    mu_0 = np.mean(X_0, axis=0).reshape(-1,1)
    mu = np.mean(X, axis=0).reshape(-1,1)
    return 1/N * (N_0 * np.dot(mu_0-mu, (mu_0-mu).T) + N_1 * np.dot(mu_1-mu, (mu_1-mu).T))

In [None]:
def computeSw(X, y):
    X_1 = X[y==True, :]
    X_0 = X[y==False, :]
    N_0 = X_0.shape[0]
    N_1 = X_1.shape[0]
    N = X.shape[0]
    return 1/N * (N_0 * np.cov(X_0.T, bias=True) + N_1 * np.cov(X_1.T, bias=True))

In [None]:
Sw = computeSw(X_train,y_train)
Sb = computeSb(X_train,y_train)
J = np.trace(Sb) / np.trace(Sw)

In [None]:
J

0.11538703760378899