In [25]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

In [26]:
# Load the dataset
mal_mem_df = pd.read_csv('data.csv')
mal_mem_df.shape

# Converting data types
mal_mem_df["Class"] = mal_mem_df["Class"].astype("category")

# Check for duplicate rows based on all columns
duplicates = mal_mem_df[mal_mem_df.duplicated(keep=False)]  # keep=False marks all duplicates
print(f"Number of duplicate rows: {len(duplicates)}")


Number of duplicate rows: 1019


In [27]:
duplicates = mal_mem_df[mal_mem_df.duplicated(keep=False)]
print(duplicates.sort_values(by=list(mal_mem_df.columns)))

                                                Category  pslist.nproc  \
8840                                              Benign            39   
18719                                             Benign            39   
25931                                             Benign            39   
1728                                              Benign            39   
6338                                              Benign            39   
...                                                  ...           ...   
40759  Spyware-Gator-1cf7f77c2a5f81862bd1c781c3873ab7...            38   
40760  Spyware-Gator-1cf7f77c2a5f81862bd1c781c3873ab7...            37   
40761  Spyware-Gator-1cf7f77c2a5f81862bd1c781c3873ab7...            37   
40762  Spyware-Gator-1cf7f77c2a5f81862bd1c781c3873ab7...            37   
40763  Spyware-Gator-1cf7f77c2a5f81862bd1c781c3873ab7...            37   

       pslist.nppid  pslist.avg_threads  pslist.nprocs64bit  \
8840             12           12.974359         

In [28]:
# Get all duplicates (including first occurrences)
duplicates = mal_mem_df[mal_mem_df.duplicated(keep=False)]

# Sort to group identical rows together
duplicates_sorted = duplicates.sort_values(by=list(mal_mem_df.columns))
print(duplicates_sorted.head(20))  # Preview the first 20 duplicates

      Category  pslist.nproc  pslist.nppid  pslist.avg_threads  \
8840    Benign            39            12           12.974359   
18719   Benign            39            12           12.974359   
25931   Benign            39            12           12.974359   
1728    Benign            39            12           12.974359   
6338    Benign            39            12           12.974359   
16895   Benign            39            12           12.974359   
17294   Benign            39            12           12.974359   
1502    Benign            39            12           13.307692   
1505    Benign            39            12           13.307692   
7967    Benign            39            12           13.307692   
12688   Benign            39            12           13.307692   
13127   Benign            39            12           13.307692   
16256   Benign            39            12           13.307692   
21942   Benign            39            12           13.307692   
23397   Be

In [29]:
# Remove all but the first occurrence
df = mal_mem_df.copy()
df = df.drop_duplicates()
print(f"Rows removed: {len(mal_mem_df) - len(df)}")

Rows removed: 534


In [30]:
print(mal_mem_df.shape)
df.shape

(58596, 57)


(58062, 57)

In [31]:
# Separate features and target
y = df["Class"]
X = df.drop(columns=["Category", "Class"])

In [32]:
# Split the dataset into training and testing sets, using the usual 80:20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [33]:
# Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42, min_samples_leaf=2, min_samples_split=5)
dt_model.fit(X_train, y_train)
if 1:
  print("Decision Tree Classifier ran successfully")

Decision Tree Classifier ran successfully


In [34]:
# Split the category to capture the first word, which signifies the category type
def category(column):
    return column.split("-")[0] if "-" in column else column

In [35]:
# Split the category to capture the second word, which signifies the category name
def category_name(column):
    return column.split("-")[1] if "-" in column else column

In [36]:
# Create new column category to store mawlare type
# Use .loc to avoid the warning
df["category"] = df["Category"].apply(category)

In [37]:
df["category"].value_counts()

category
Benign        29231
Spyware        9815
Ransomware     9529
Trojan         9487
Name: count, dtype: int64

In [38]:
# Creating a column with the names of the variants
df["category_name"] = df["Category"].apply(category_name)
df["category_name"].value_counts()

category_name
Benign          29231
Transponder      2410
Shade            2128
Ako              2000
180solutions     2000
CWS              2000
Refroso          2000
Scar             2000
Gator            1995
Conti            1988
Emotet           1967
Zeus             1950
Maze             1754
Pysa             1659
Reconyc          1570
TIBS             1410
Name: count, dtype: int64

In [39]:
# Define the label encoder
def label_encoder(column):
    lencode = LabelEncoder().fit(column)
    print(column.name, lencode.classes_)
    return lencode.transform(column)

In [40]:
df.tail(5)

Unnamed: 0,Category,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,...,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,Class,category,category_name
58591,Ransomware-Shade-fa03be3078d1b9840f06745f160eb...,37,15,10.108108,0,215.486487,1453,39.27027,7973,215.486487,...,24,116,0,120,86,0,8,Malware,Ransomware,Shade
58592,Ransomware-Shade-f56687137caf9a67678cde91e4614...,37,14,9.945946,0,190.216216,1347,36.405405,7038,190.216216,...,24,116,0,116,88,0,8,Malware,Ransomware,Shade
58593,Ransomware-Shade-faddeea111a25da4d0888f3044ae9...,38,15,9.842105,0,210.026316,1448,38.105263,7982,215.72973,...,24,116,0,120,88,0,8,Malware,Ransomware,Shade
58594,Ransomware-Shade-f866c086af2e1d8ebaa6f2c863157...,37,15,10.243243,0,215.513513,1452,39.243243,7974,215.513513,...,24,116,0,120,87,0,8,Malware,Ransomware,Shade
58595,Ransomware-Shade-955d9af38346c1755527bd196668e...,38,15,9.868421,0,213.026316,1487,39.131579,8095,213.026316,...,24,116,0,120,86,0,8,Malware,Ransomware,Shade


In [41]:
# Encoding the columns
# Recommended approach using .loc[]
df['category'] = label_encoder(df['category'])
df['category_name'] = label_encoder(df['category_name'])
df['Class'] = label_encoder(df['Class'])

category ['Benign' 'Ransomware' 'Spyware' 'Trojan']
category_name ['180solutions' 'Ako' 'Benign' 'CWS' 'Conti' 'Emotet' 'Gator' 'Maze'
 'Pysa' 'Reconyc' 'Refroso' 'Scar' 'Shade' 'TIBS' 'Transponder' 'Zeus']
Class ['Benign' 'Malware']


In [42]:
# Drop the Category and Class columns
df.drop(["Category", "Class"], axis=1, inplace=True)

In [43]:
df.tail()

Unnamed: 0,pslist.nproc,pslist.nppid,pslist.avg_threads,pslist.nprocs64bit,pslist.avg_handlers,dlllist.ndlls,dlllist.avg_dlls_per_proc,handles.nhandles,handles.avg_handles_per_proc,handles.nport,...,svcscan.fs_drivers,svcscan.process_services,svcscan.shared_process_services,svcscan.interactive_process_services,svcscan.nactive,callbacks.ncallbacks,callbacks.nanonymous,callbacks.ngeneric,category,category_name
58591,37,15,10.108108,0,215.486487,1453,39.27027,7973,215.486487,0,...,26,24,116,0,120,86,0,8,1,12
58592,37,14,9.945946,0,190.216216,1347,36.405405,7038,190.216216,0,...,26,24,116,0,116,88,0,8,1,12
58593,38,15,9.842105,0,210.026316,1448,38.105263,7982,215.72973,0,...,26,24,116,0,120,88,0,8,1,12
58594,37,15,10.243243,0,215.513513,1452,39.243243,7974,215.513513,0,...,26,24,116,0,120,87,0,8,1,12
58595,38,15,9.868421,0,213.026316,1487,39.131579,8095,213.026316,0,...,26,24,116,0,120,86,0,8,1,12


In [44]:
# Separate features and target
X = df.drop("category_name", axis=1)
y = df["category_name"]

In [45]:
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [46]:
# Split the dataset into training and testing sets, using the usual 80:20 split
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# 1. Initial Split: Separate TEST set (completely untouched)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_scaled, y, 
    test_size=0.2,  # 20% for final test
    stratify=y,      # Preserve class distribution
    random_state=42
)

# 2. Secondary Split: Training and Validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.25,  # 25% of remaining = 20% of total
    stratify=y_train_val,
    random_state=42
)

In [47]:
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

# Initialize the RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

# Resample the dataset
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Now X_resampled and y_resampled will have a balanced class distribution


In [48]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
import numpy as np

clf1 = LogisticRegression(penalty='l2',C=0.001,solver='saga',random_state=1, max_iter = 1000)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

print('5-fold cross validation:\n')

labels = ['Logistic Regression', 'Random Forest', 'Naive Bayes']

for clf, label in zip([clf1, clf2, clf3], labels):

    scores = model_selection.cross_val_score(clf, X_resampled, y_resampled, cv=5, scoring='roc_auc_ovr')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

5-fold cross validation:

Accuracy: 0.84 (+/- 0.00) [Logistic Regression]
Accuracy: 0.95 (+/- 0.00) [Random Forest]
Accuracy: 0.91 (+/- 0.00) [Naive Bayes]


In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


# Define your classifiers
log_reg = LogisticRegression(penalty='l2',C=0.001,solver='saga',random_state=1, max_iter=1000)
forest = RandomForestClassifier(random_state=1,n_estimators=200,max_depth=10,class_weight='balanced')
bayes = GaussianNB()
ensemble = VotingClassifier(estimators=[
    ('lr', log_reg),
    ('rf', forest),
    ('gnb', bayes)
], voting='soft')

# List of classifiers and their labels
classifiers = [log_reg, forest, bayes, ensemble]
labels = ['Logistic Regression', 'Random Forest', 'Gaussian Naive Bayes', 'Ensemble']

# Perform cross-validation and print accuracy
for clf, label in zip(classifiers, labels):
    scores = cross_val_score(clf, X_resampled, y_resampled, cv=5, scoring='roc_auc_ovr')
    print(f"Accuracy: {scores.mean():.2f} (+/- {scores.std():.2f}) [{label}]")

# Fit the ensemble model
ensemble.fit(X_resampled, y_resampled)

# Predict probabilities and evaluate
y_pred_probs_ensemble = ensemble.predict_proba(X_train_val)
roc_auc_ensemble = roc_auc_score(y_train_val, y_pred_probs_ensemble, multi_class="ovr")

print(f"ROC AUC for Ensemble (cv dataset): {roc_auc_ensemble:.2f}")

# Fit the ensemble model
ensemble.fit(X_resampled, y_resampled)

# Predict probabilities and evaluate
y_pred_probs_ensemble = ensemble.predict_proba(X_test)
roc_auc_ensemble = roc_auc_score(y_test, y_pred_probs_ensemble, multi_class="ovr")

print(f"ROC AUC for Ensemble: {roc_auc_ensemble:.2f}")




Accuracy: 0.84 (+/- 0.00) [Logistic Regression]
Accuracy: 0.95 (+/- 0.00) [Random Forest]
Accuracy: 0.91 (+/- 0.00) [Gaussian Naive Bayes]
Accuracy: 0.93 (+/- 0.00) [Ensemble]
ROC AUC for Ensemble (cv dataset): 0.97
ROC AUC for Ensemble: 0.97
