In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

In [2]:
# Load the dataset
mal_mem_df = pd.read_csv('data.csv')
mal_mem_df.shape

# Converting data types
mal_mem_df["Class"] = mal_mem_df["Class"].astype("category")

In [3]:
# Separate features and target
y = mal_mem_df["Class"]
X = mal_mem_df.drop(columns=["Category", "Class"])

In [4]:
# Split the dataset into training and testing sets, using the usual 80:20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [5]:
# Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42, min_samples_leaf=2, min_samples_split=5)
dt_model.fit(X_train, y_train)
if 1:
  print("Decision Tree Classifier ran successfully")

Decision Tree Classifier ran successfully


In [6]:
# Split the category to capture the first word, which signifies the category type
def category(column):
    return column.split("-")[0] if "-" in column else column

In [7]:
# Split the category to capture the second word, which signifies the category name
def category_name(column):
    return column.split("-")[1] if "-" in column else column

In [8]:
# Create new column category to store mawlare type
mal_mem_df["category"] = mal_mem_df["Category"].apply(category)

In [9]:
mal_mem_df["category"].value_counts()

category
Benign        29298
Spyware       10020
Ransomware     9791
Trojan         9487
Name: count, dtype: int64

In [10]:
# Creating a column with the names of the variants
mal_mem_df["category_name"] = mal_mem_df["Category"].apply(category_name)
mal_mem_df["category_name"].value_counts()

category_name
Benign          29298
Transponder      2410
Gator            2200
Shade            2128
Ako              2000
180solutions     2000
CWS              2000
Refroso          2000
Scar             2000
Conti            1988
Emotet           1967
Maze             1958
Zeus             1950
Pysa             1717
Reconyc          1570
TIBS             1410
Name: count, dtype: int64

In [11]:
# Define the label encoder
def label_encoder(column):
    lencode = LabelEncoder().fit(column)
    print(column.name, lencode.classes_)
    return lencode.transform(column)

In [12]:
# Encoding the columns
mal_mem_df["category"] = label_encoder(mal_mem_df["category"])
mal_mem_df["category_name"] = label_encoder(mal_mem_df["category_name"])
mal_mem_df["class"] = label_encoder(mal_mem_df["Class"])

category ['Benign' 'Ransomware' 'Spyware' 'Trojan']
category_name ['180solutions' 'Ako' 'Benign' 'CWS' 'Conti' 'Emotet' 'Gator' 'Maze'
 'Pysa' 'Reconyc' 'Refroso' 'Scar' 'Shade' 'TIBS' 'Transponder' 'Zeus']
Class ['Benign' 'Malware']


In [13]:
# Drop the Category and Class columns
mal_mem_df.drop(["Category", "Class"], axis=1, inplace=True)

In [14]:
mal_mem_df.tail()

Unnamed: 0,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,handles.nport,...,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,category,category_name,class
58591,37,15,10.108108,0,215.486487,1453,39.27027,7973,215.486487,0,...,24,116,0,120,86,0,8,1,12,1
58592,37,14,9.945946,0,190.216216,1347,36.405405,7038,190.216216,0,...,24,116,0,116,88,0,8,1,12,1
58593,38,15,9.842105,0,210.026316,1448,38.105263,7982,215.72973,0,...,24,116,0,120,88,0,8,1,12,1
58594,37,15,10.243243,0,215.513513,1452,39.243243,7974,215.513513,0,...,24,116,0,120,87,0,8,1,12,1
58595,38,15,9.868421,0,213.026316,1487,39.131579,8095,213.026316,0,...,24,116,0,120,86,0,8,1,12,1


In [15]:
# Separate features and target
X = mal_mem_df.drop("category_name", axis=1)
y = mal_mem_df["category_name"]

In [16]:
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [17]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [23]:
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

# Initialize the RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

# Resample the dataset
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Now X_resampled and y_resampled will have a balanced class distribution


In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
import numpy as np

clf1 = LogisticRegression(penalty='l2',C=0.001,solver='lbfgs',random_state=1, max_iter = 1000)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

print('5-fold cross validation:\n')

labels = ['Logistic Regression', 'Random Forest', 'Naive Bayes']

for clf, label in zip([clf1, clf2, clf3], labels):

    scores = model_selection.cross_val_score(clf, X_resampled, y_resampled, cv=5, scoring='roc_auc_ovr')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

5-fold cross validation:

Accuracy: 0.85 (+/- 0.00) [Logistic Regression]
Accuracy: 0.96 (+/- 0.00) [Random Forest]
Accuracy: 0.91 (+/- 0.00) [Naive Bayes]


In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


# Define your classifiers
log_reg = LogisticRegression(penalty='l2',C=0.001,solver='lbfgs',random_state=1)
forest = RandomForestClassifier(random_state=1)
bayes = GaussianNB()
ensemble = VotingClassifier(estimators=[
    ('lr', log_reg),
    ('rf', forest),
    ('gnb', bayes)
], voting='soft')

# List of classifiers and their labels
classifiers = [log_reg, forest, bayes, ensemble]
labels = ['Logistic Regression', 'Random Forest', 'Gaussian Naive Bayes', 'Ensemble']

# Perform cross-validation and print accuracy
for clf, label in zip(classifiers, labels):
    scores = cross_val_score(clf, X_resampled, y_resampled, cv=5, scoring='roc_auc_ovr')
    print(f"Accuracy: {scores.mean():.2f} (+/- {scores.std():.2f}) [{label}]")


# Fit the ensemble model
ensemble.fit(X_resampled, y_resampled)

# Predict probabilities and evaluate
y_pred_probs_ensemble = ensemble.predict_proba(X_test)
roc_auc_ensemble = roc_auc_score(y_test, y_pred_probs_ensemble, multi_class="ovr")

print(f"ROC AUC for Ensemble: {roc_auc_ensemble:.2f}")

# Fit the ensemble model
ensemble.fit(X_resampled, y_resampled)

# Predict probabilities and evaluate
y_pred_probs_ensemble = ensemble.predict_proba(X_resampled)
roc_auc_ensemble = roc_auc_score(y_resampled, y_pred_probs_ensemble, multi_class="ovr")

print(f"ROC AUC for Ensemble: {roc_auc_ensemble:.2f}")


# Step 2: Predict probabilities for the positive class (class 1)
y_pred_probs_test = ensemble.predict_proba(X_test) # For binary classification, class 1 is usually the second column

# Step 3: Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs_test)
# Step 5: Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc_ensemble:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Diagonal line representing random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

Accuracy: 0.85 (+/- 0.00) [Logistic Regression]
Accuracy: 0.96 (+/- 0.00) [Random Forest]
Accuracy: 0.91 (+/- 0.00) [Gaussian Naive Bayes]
Accuracy: 0.95 (+/- 0.00) [Ensemble]
ROC AUC for Ensemble: 0.97
ROC AUC for Ensemble: 0.99


ValueError: multiclass format is not supported