In [1]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [5]:
# Load the feature matrices and label arrays that were saved as .npy files.
X_tot = np.load('../data/X_tot.npy')
X_train = np.load('../data/X_train.npy')
X_test = np.load('../data/X_test.npy')
y_train = np.load('../data/y_train.npy')
y_test = np.load('../data/y_test.npy')
# Pickled DataFrame; its 'malware' column is used further down to colour plots.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df_tot = pd.read_pickle('../data/df_tot.pandas')

In [None]:
# scatter_matrix needs a DataFrame, but X_tot comes from np.load and is a plain
# ndarray, so wrap it first. Points are coloured by the 'malware' column.
# NOTE(review): assumes df_tot has one row per row of X_tot, in the same order.
# The trailing semicolon hides the repr of the returned array of axes.
scatter_matrix(pd.DataFrame(X_tot), c=df_tot['malware'], figsize=(30,20));

In [None]:
# Project both splits onto the first two principal components of the training
# set. The model is fitted exactly once so train and test share a single basis
# (refitting before each transform is wasteful and, with a randomized solver,
# could yield slightly different components for the two splits).
pca = PCA(n_components=2)
pca.fit(X_train)
# NOTE(review): X_train / X_test are rebound to their 2-D projections here, so
# every later cell sees the projected data and this cell must not be re-run
# without reloading the arrays first.
X_test = pca.transform(X_test)
X_train = pca.transform(X_train)

# y_train comes from np.load, i.e. it is an ndarray and has no `.values`.
# np.asarray accepts ndarrays and pandas objects alike; ravel() turns a column
# vector into the 1-D mask that row indexing requires.
labels = np.asarray(y_train).ravel()
is_false = labels == False  # noqa: E712 - element-wise comparison on an array
is_true = labels == True  # noqa: E712

# Per-class sample counts (not used by any cell visible in this notebook).
f = int(np.count_nonzero(is_false))
t = int(np.count_nonzero(is_true))

fig, ax = plt.subplots()
ax.scatter(X_train[is_false, 0], X_train[is_false, 1], alpha=0.1, label='y_train == False')
ax.scatter(X_train[is_true, 0], X_train[is_true, 1], alpha=0.1, label='y_train == True')
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_title('Training set projected onto the first two principal components')
ax.legend()
# Fixed viewport; points outside this window are clipped from the figure.
ax.set_xlim([-100, 1000])
ax.set_ylim([-50, 50]);

# Class separability analysis
$S_b = \frac{1}{N}\sum_{i=1}^{c}n_i(\mu_i-\mu)(\mu_i-\mu)^T$\
$S_w = \frac{1}{N}\sum_{i=1}^{c}\sum_{j=1}^{n_i}\,(x_{ij}-\mu_i)(x_{ij}-\mu_i)^T$\
Property: $S_w + S_b = C$, where $C$ is the total covariance matrix of the data\
1. Class Scatter Matrices (CSM)\
$J = \frac{tr\{S_b\}}{tr\{S_w\}}$\
J is an unbounded measure: the larger the value
of J, the smaller the within-class scatter
relative to the between-class scatter.

In [115]:
def computeSb(X, y):
    """Return the between-class scatter matrix of ``X`` grouped by ``y``.

    Computes ``S_b = (1/N) * sum_i n_i (mu_i - mu)(mu_i - mu)^T`` where the sum
    runs over every distinct label in ``y``, ``n_i`` and ``mu_i`` are the size
    and mean of class ``i``, and ``mu`` is the mean of all samples.

    Parameters
    ----------
    X : array-like, shape (N, d)
        One sample per row.
    y : array-like with N entries
        Class label of each sample. Any shape is accepted (e.g. ``(N,)`` or
        ``(N, 1)``); it is flattened before use. Boolean and 0/1 labels give
        the same result.

    Returns
    -------
    numpy.ndarray, shape (d, d)

    Raises
    ------
    ValueError
        If ``X`` has no rows or ``y`` does not hold one label per row.
    """
    X = np.asarray(X)
    # A column-vector mask cannot be combined with ``:`` when indexing rows,
    # so always work with a flat label vector.
    labels = np.asarray(y).ravel()
    N = X.shape[0]
    if N == 0:
        raise ValueError("X must contain at least one sample")
    if labels.shape[0] != N:
        raise ValueError("y must contain exactly one label per row of X")

    mu = np.mean(X, axis=0).reshape(-1, 1)
    Sb = np.zeros((mu.shape[0], mu.shape[0]))
    # Iterating over the labels that actually occur means an absent class
    # simply contributes nothing instead of poisoning the result with NaN.
    for cls in np.unique(labels):
        X_c = X[labels == cls, :]
        diff = np.mean(X_c, axis=0).reshape(-1, 1) - mu
        Sb = Sb + X_c.shape[0] * np.dot(diff, diff.T)
    return Sb / N

In [116]:
def computeSw(X, y):
    """Return the within-class scatter matrix of ``X`` grouped by ``y``.

    Computes ``S_w = (1/N) * sum_i sum_j (x_ij - mu_i)(x_ij - mu_i)^T`` where
    the outer sum runs over every distinct label in ``y`` and ``mu_i`` is the
    mean of class ``i``. This equals the class-size-weighted average of the
    biased per-class covariance matrices.

    Parameters
    ----------
    X : array-like, shape (N, d)
        One sample per row.
    y : array-like with N entries
        Class label of each sample. Any shape is accepted (e.g. ``(N,)`` or
        ``(N, 1)``); it is flattened before use. Boolean and 0/1 labels give
        the same result.

    Returns
    -------
    numpy.ndarray, shape (d, d)
        Always two-dimensional, including when ``d == 1``.

    Raises
    ------
    ValueError
        If ``X`` has no rows or ``y`` does not hold one label per row.
    """
    X = np.asarray(X)
    # A column-vector mask cannot be combined with ``:`` when indexing rows,
    # so always work with a flat label vector.
    labels = np.asarray(y).ravel()
    N = X.shape[0]
    if N == 0:
        raise ValueError("X must contain at least one sample")
    if labels.shape[0] != N:
        raise ValueError("y must contain exactly one label per row of X")

    d = X.shape[1]
    Sw = np.zeros((d, d))
    # Summing the raw scatter of each class (instead of n_i * np.cov) keeps the
    # result (d, d) for a single feature and skips classes that do not occur.
    for cls in np.unique(labels):
        X_c = X[labels == cls, :]
        centered = X_c - np.mean(X_c, axis=0)
        Sw = Sw + np.dot(centered.T, centered)
    return Sw / N

In [117]:
# Class-separability criterion on the training data: ratio of the traces of
# the between-class and within-class scatter matrices.
Sb = computeSb(X_train, y_train)
Sw = computeSw(X_train, y_train)
J = Sb.trace() / Sw.trace()

In [118]:
# Display the separability score J = trace(Sb) / trace(Sw) computed above.
J

0.11538703760378899