# CS5830 Project 5: Naive Bayes

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest

# Models
from sklearn.naive_bayes import GaussianNB

In [None]:
# Get datasets from https://whyisyoung.github.io/BODMAS/
# np.load on an .npz returns a lazily-read NpzFile that keeps the archive open;
# the context manager closes it once both arrays have been read into memory.
with np.load('data/bodmas.npz') as bodmas_np:
    X = pd.DataFrame(bodmas_np['X'])
    y = pd.DataFrame(bodmas_np['y'])
metadata = pd.read_csv('data/bodmas_metadata.csv')
categories = pd.read_csv('data/bodmas_malware_category.csv')

# Every later step pairs features and labels row by row, so fail early if the
# two arrays do not describe the same number of samples.
assert len(X) == len(y), 'feature matrix and label vector differ in length'
X

In [None]:
# Give the single label column a readable name and plot how many samples fall
# into each class.
y = y.rename(columns={0: 'label'})
sns.catplot(data=y, x='label', kind='count')

# Keep only the columns in which at least one row differs from the first row,
# i.e. discard features whose value is identical for every sample.
differs_from_first_row = X.ne(X.iloc[0])
varying_columns = differs_from_first_row.any()
X = X.loc[:, varying_columns]
X

In [None]:
# Hold out 20% of the samples for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same partition, and therefore the same scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Flatten the single-column label frames into 1-D arrays.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Apply scaling: the scaler is fitted on the training split only and then
# reused unchanged on the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature selection: keep the 50 best-scoring features, chosen from the
# training split only, and apply the same column selection to the test split.
kbest = SelectKBest(k=50)
X_train = kbest.fit_transform(X_train, y_train)
X_test = kbest.transform(X_test)

In [None]:
# Wrap the selected feature matrices in DataFrames so column-wise pandas
# operations (correlation, drop by label) are available.
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
corr = X_train.corr().abs()
sns.heatmap(corr, cmap='coolwarm')

# Remove one of each pair of features with a high correlation as they may indicate feature redundancy
# https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on
threshold = 0.75
# Only the strict upper triangle is inspected, so each pair is looked at once
# and the diagonal (self-correlation) is ignored.
strict_upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(strict_upper_mask)
# A column is dropped when it correlates above the threshold with any column
# that precedes it.
exceeds_threshold = (upper > threshold).any()
to_drop = exceeds_threshold.index[exceeds_threshold.to_numpy()].tolist()
display(to_drop)

X_train_dropped = X_train.drop(columns=to_drop)
X_test_dropped = X_test.drop(columns=to_drop)

In [None]:
# Gaussian Naive Bayes - binary classification, evaluated twice with the same
# estimator: first on the best 50 features, then with the highly-correlated
# features dropped. Each fit() call retrains the model from scratch.
gnb = GaussianNB()
feature_sets = (
    (X_train, X_test),                  # best 50 features
    (X_train_dropped, X_test_dropped),  # highly-correlated features removed
)
for train_features, test_features in feature_sets:
    gnb.fit(train_features, y_train)
    y_pred = gnb.predict(test_features)
    scores = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label=1)
    display(scores)

In [None]:
# One step further, see how well we can predict the malware category

# Attach the category to every metadata row. how='left' keeps samples that have
# no category entry; validate= makes pandas raise if a hash occurs more than
# once in `categories`, which would duplicate rows and break the positional
# pairing with X below.
y = pd.merge(metadata, categories, left_on='sha', right_on='sha256',
             how='left', validate='many_to_one')
y = y.drop(columns=['sha', 'sha256', 'timestamp', 'family'])

# Samples without a category entry get the placeholder class 'none'.
y = y.fillna('none')
display(y)

# Rows of X are paired with the merged rows by position, not by hash.
# NOTE(review): this assumes the metadata CSV lists samples in the same order
# as the feature matrix - confirm against the dataset documentation.
X_cat = X.iloc[y.index]

# Hold out 20% of the samples; seeded so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_cat, y, test_size=0.2, random_state=42
)
# Flatten the label frames into 1-D arrays.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Apply scaling: fitted on the training split only, reused on the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature selection: keep the 500 best-scoring features, chosen from the
# training split only.
kbest = SelectKBest(k=500)
X_train = kbest.fit_transform(X_train, y_train)
X_test = kbest.transform(X_test)

In [None]:
# Gaussian Naive Bayes on best 500 features - multi-class classification
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)
# Precision / recall / F-score averaged over classes, weighted by class support.
weighted_scores = precision_recall_fscore_support(y_test, y_pred, average='weighted')
display(weighted_scores)

# Side-by-side table of true vs. predicted category with a per-row hit flag.
compare = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
compare = compare.assign(Correct=compare['Actual'].eq(compare['Predicted']))
display(compare)