In [185]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [186]:
# Load the dataset from the project-relative CSV file (comma-separated, which is
# the pandas default) and display the resulting frame.
df = pd.read_csv('./dataset/data_file.csv')
df

Unnamed: 0,FileName,md5Hash,Machine,DebugSize,DebugRVA,MajorImageVersion,MajorOSVersion,ExportRVA,ExportSize,IatVRA,MajorLinkerVersion,MinorLinkerVersion,NumberOfSections,SizeOfStackReserve,DllCharacteristics,ResourceSize,BitcoinAddresses,Benign
0,0124e21d-018c-4ce0-92a3-b9e205a76bc0.dll,79755c51e413ed3c6be4635fd729a6e1,332,0,0,0,4,0,0,8192,8,0,3,1048576,34112,672,0,1
1,05c8318f98a5d301d80000009c316005.vertdll.dll,95e19f3657d34a432eada93221b0ea16,34404,84,121728,10,10,126576,4930,0,14,10,8,262144,16864,1024,0,1
2,06054fba-5619-4a86-a861-ffb0464bef5d.dll,85c32641d77a54e19ba8ea4ab305c791,332,0,0,0,4,0,0,8192,8,0,3,1048576,34112,672,0,1
3,075822ac99a5d301660400009c316005.adhapi.dll,62e3b959d982ef534b66f819fe15f085,34404,84,19904,10,10,21312,252,18160,14,10,6,262144,16736,1040,0,1
4,090607dd9ba5d301ca0900009c316005.SensorsNative...,ae38c5f7d313ad0ff3bfb8826476767f,34404,84,97728,10,10,105792,1852,70592,14,10,7,262144,16736,1096,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62480,VirusShare_a43ceb5e5fffc793e0205d15a0606cb0,a43ceb5e5fffc793e0205d15a0606cb0,332,0,0,1,4,0,0,4096,6,0,3,1048576,0,23504,0,0
62481,VirusShare_0190dafc8304111a00fccf57340ea6a4,0190dafc8304111a00fccf57340ea6a4,332,0,0,7,10,0,0,0,7,0,7,1048576,0,15704,0,0
62482,VirusShare_0f3ca55979aaf59158d6b01140696e44,0f3ca55979aaf59158d6b01140696e44,332,0,0,0,4,0,0,404908,2,50,11,1048576,0,2364,0,0
62483,VirusShare_fca5ce35f1690db6babca5aa5d559535,fca5ce35f1690db6babca5aa5d559535,332,0,0,0,4,14448,70,4096,8,0,4,1048576,0,130296,0,0


In [187]:
# Column names of the loaded dataset.
header_names = [
    'FileName', 'md5Hash',
    'Machine', 'DebugSize', 'DebugRVA',
    'MajorImageVersion', 'MajorOSVersion',
    'ExportRVA', 'ExportSize', 'IatVRA',
    'MajorLinkerVersion', 'MinorLinkerVersion',
    'NumberOfSections', 'SizeOfStackReserve',
    'DllCharacteristics', 'ResourceSize',
    'BitcoinAddresses', 'Benign',
]

# Integer-coded columns that are treated as categories rather than magnitudes.
categorical_columns = [
    'Machine',
    'MajorImageVersion',
    'MajorOSVersion',
    'MajorLinkerVersion',
    'MinorLinkerVersion',
    'DllCharacteristics',
    'NumberOfSections',
]
df = df.astype(dict.fromkeys(categorical_columns, 'category'))

# Binary columns; 'Benign' is used as the target by the experiments below.
binary_columns = ['Benign', 'BitcoinAddresses']

# Numeric features: every column still stored as int64 (the categorical ones no
# longer are), minus the binary columns.
integer_columns = df.select_dtypes(include=['int64']).columns
numeric_columns = integer_columns.difference(binary_columns)

## Step 3: Data Preprocessing and Data Processing

Based on our data exploration, we observed that the two variables 'FileName' and 'md5Hash' have a unique value for each sample; we therefore remove them from the dataset beforehand.

In [188]:
# Remove the per-sample identifier columns.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: executing it a second time is a no-op rather than a KeyError.
df = df.drop(columns=['FileName', 'md5Hash'], errors='ignore')

Based on our understanding of the meaning of 'MinorLinkerVersion' and 'MajorLinkerVersion', we combine them into a single feature called 'LinkerVersion'.

In [189]:
# Build 'LinkerVersion' as the string "<major>.<minor>" and store it as a
# categorical feature.
linker_major = df['MajorLinkerVersion'].astype(str)
linker_minor = df['MinorLinkerVersion'].astype(str)
df['LinkerVersion'] = (linker_major + '.' + linker_minor).astype('category')

# The two source columns are replaced by the combined feature.
merged_columns = ['MajorLinkerVersion', 'MinorLinkerVersion']
df.drop(columns=merged_columns, inplace=True)

# Keep the list of categorical feature names in sync with the frame.
categorical_columns = [
    col for col in categorical_columns if col not in merged_columns
] + ['LinkerVersion']

Following what we observed during data exploration, we try removing the 'BitcoinAddresses' feature.

In [190]:
# Remove the 'BitcoinAddresses' feature.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: executing it a second time is a no-op rather than a KeyError.
df = df.drop(columns=['BitcoinAddresses'], errors='ignore')

In [191]:
# Preview the first five rows of the frame after the preprocessing steps above.
df.head()

Unnamed: 0,Machine,DebugSize,DebugRVA,MajorImageVersion,MajorOSVersion,ExportRVA,ExportSize,IatVRA,NumberOfSections,SizeOfStackReserve,DllCharacteristics,ResourceSize,Benign,LinkerVersion
0,332,0,0,0,4,0,0,8192,3,1048576,34112,672,1,8.0
1,34404,84,121728,10,10,126576,4930,0,8,262144,16864,1024,1,14.1
2,332,0,0,0,4,0,0,8192,3,1048576,34112,672,1,8.0
3,34404,84,19904,10,10,21312,252,18160,6,262144,16736,1040,1,14.1
4,34404,84,97728,10,10,105792,1852,70592,7,262144,16736,1096,1,14.1


### Test 1: Consider only numerical variables (LogisticRegression)

In [193]:
# Features: numeric columns only. Target: the 'Benign' label.
X = df[numeric_columns]
y = df['Benign']

# Standardize the numeric columns with StandardScaler, then fit a logistic
# regression with its default settings.
preprocessor = ColumnTransformer([
    ('num_scaler', StandardScaler(), numeric_columns),
])
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Metrics collected on every fold of the 10-fold cross-validation.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
}
results = cross_validate(
    pipeline, X, y,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
    return_estimator=True,
)

# Per-fold spread for accuracy and precision, fold average for recall.
print(f"Minimum accuracy: {results['test_accuracy'].min()}")
print(f"Maximum accuracy: {results['test_accuracy'].max()}")
print(f"Mean accuracy: {results['test_accuracy'].mean()}")
print(f"Minimum precision: {results['test_precision'].min()}")
print(f"Maximum precision: {results['test_precision'].max()}")
print(f"Mean precision: {results['test_precision'].mean()}")
print(f"Mean recall: {results['test_recall'].mean()}")


Minimum accuracy: 0.7173495518565941
Maximum accuracy: 0.8938860435339309
Mean accuracy: 0.79580936022747
Minimum precision: 0.7593201754385965
Maximum precision: 0.9053840063341251
Mean precision: 0.8622142072679845
Mean recall: 0.6253489280534129


### Test 2: Consider only categorical variables (LogisticRegression)

In [195]:
# Features: categorical columns only. Target: the 'Benign' label.
X = df[categorical_columns]
y = df['Benign']

# One-hot encode the categorical columns; categories not seen while fitting a
# fold are ignored at prediction time (handle_unknown='ignore').
preprocessor = ColumnTransformer([
    ('cat_transform', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
])
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Metrics collected on every fold of the 10-fold cross-validation.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
}
results = cross_validate(
    pipeline, X, y,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
    return_estimator=True,
)

# Per-fold spread for accuracy and precision, fold average for recall.
print(f"Minimum accuracy: {results['test_accuracy'].min()}")
print(f"Maximum accuracy: {results['test_accuracy'].max()}")
print(f"Mean accuracy: {results['test_accuracy'].mean()}")
print(f"Minimum precision: {results['test_precision'].min()}")
print(f"Maximum precision: {results['test_precision'].max()}")
print(f"Mean precision: {results['test_precision'].mean()}")
print(f"Mean recall: {results['test_recall'].mean()}")

Minimum accuracy: 0.9788732394366197
Maximum accuracy: 0.991678668586974
Mean accuracy: 0.9859486045296058
Minimum precision: 0.9775606225117626
Maximum precision: 0.9855126300148589
Mean precision: 0.9821866367836634
Mean recall: 0.9855073398119101


### Test 3: Consider the entire dataset (LogisticRegression)

In [196]:
# Features: every column except the target. Target: the 'Benign' label.
X = df.drop(columns=['Benign'])
y = df['Benign']

# Numeric columns are standardized, categorical columns are one-hot encoded.
numeric_transform = Pipeline(steps=[
    ('scaler', StandardScaler()),
])
categorical_transform = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num_transform', numeric_transform, numeric_columns),
    ('cat_transform', categorical_transform, categorical_columns),
])

# Feature selection is driven by a logistic regression wrapped in SelectFromModel
# (no threshold is passed, so the library default applies); the retained features
# feed the final logistic-regression classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=1000))),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Metrics collected on every fold of the 10-fold cross-validation.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
}
results = cross_validate(
    pipeline, X, y,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
    return_estimator=True,
)

# Per-fold spread for accuracy and precision, fold average for recall.
print(f"Minimum accuracy: {results['test_accuracy'].min()}")
print(f"Maximum accuracy: {results['test_accuracy'].max()}")
print(f"Mean accuracy: {results['test_accuracy'].mean()}")
print(f"Minimum precision: {results['test_precision'].min()}")
print(f"Maximum precision: {results['test_precision'].max()}")
print(f"Mean precision: {results['test_precision'].mean()}")
print(f"Mean recall: {results['test_recall'].mean()}")

Minimum accuracy: 0.9822343149807938
Maximum accuracy: 0.9919987197951672
Mean accuracy: 0.9863807274464811
Minimum precision: 0.9779145546705286
Maximum precision: 0.9855180096546602
Mean precision: 0.9822738548084246
Mean recall: 0.986429454348013


### Test 4: Consider only categorical features (RandomForest)

In [203]:
# Fixed seed for the random forest so the reported metrics are reproducible
# across kernel restarts (the forest is otherwise seeded differently on each run).
RANDOM_STATE = 42

# Features: categorical columns only. Target: the 'Benign' label.
X = df[categorical_columns]
y = df['Benign']

# One-hot encode the categorical columns; categories not seen while fitting a
# fold are ignored at prediction time (handle_unknown='ignore').
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_transform', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

# 10-fold cross-validation collecting accuracy, precision, recall and F1 per fold.
results = cross_validate(pipeline, X, y,
                         scoring = {
                             'accuracy': make_scorer(accuracy_score),
                             'precision': make_scorer(precision_score),
                             'recall': make_scorer(recall_score),
                             'f1score': make_scorer(f1_score)},
                         return_estimator=True, cv=10, n_jobs=-1)

# Per-fold spread for accuracy and precision, fold averages for recall and F1.
print(f"Minimum accuracy: {results['test_accuracy'].min()}")
print(f"Maximum accuracy: {results['test_accuracy'].max()}")
print(f"Mean accuracy: {results['test_accuracy'].mean()}")
print(f"Minimum precision: {results['test_precision'].min()}")
print(f"Maximum precision: {results['test_precision'].max()}")
print(f"Mean precision: {results['test_precision'].mean()}")
print(f"Mean recall: {results['test_recall'].mean()}")
print(f"Mean f1-score: {results['test_f1score'].mean()}")

Minimum accuracy: 0.9871979516722675
Maximum accuracy: 0.9942390782525204
Mean accuracy: 0.9906057875790217
Minimum precision: 0.9839181286549707
Maximum precision: 0.9918729220539343
Mean precision: 0.9876229774821983
Mean recall: 0.9907811804632933
Mean f1-score: 0.9891909985361462


### Test 5: Consider entire dataset (Random Forest)

In [201]:
# Fixed seed for both random forests so the selected features and the reported
# metrics are reproducible across kernel restarts.
RANDOM_STATE = 42

# Features: every column except the target. Target: the 'Benign' label.
X = df.drop(columns=['Benign'])
y = df['Benign']

# Numeric columns are standardized, categorical columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transform', Pipeline(steps=[
            ('scaler', StandardScaler())
        ]), numeric_columns),
        ('cat_transform', Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_columns)
    ])

# Feature selection is driven by a random forest wrapped in SelectFromModel
# (no threshold is passed, so the library default applies); the retained features
# feed the final random-forest classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(
        estimator=RandomForestClassifier(random_state=RANDOM_STATE))),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

# 10-fold cross-validation collecting accuracy, precision, recall and F1 per fold.
results = cross_validate(pipeline, X, y,
                         scoring = {
                             'accuracy': make_scorer(accuracy_score),
                             'precision': make_scorer(precision_score),
                             'recall': make_scorer(recall_score),
                             'f1score': make_scorer(f1_score)},
                         return_estimator=True, cv=10, n_jobs=-1)

# Per-fold spread for accuracy and precision, fold averages for recall and F1.
print(f"Minimum accuracy: {results['test_accuracy'].min()}")
print(f"Maximum accuracy: {results['test_accuracy'].max()}")
print(f"Mean accuracy: {results['test_accuracy'].mean()}")
print(f"Minimum precision: {results['test_precision'].min()}")
print(f"Maximum precision: {results['test_precision'].max()}")
print(f"Mean precision: {results['test_precision'].mean()}")
print(f"Mean recall: {results['test_recall'].mean()}")
print(f"Mean f1-score: {results['test_f1score'].mean()}")

Minimum accuracy: 0.9947191550648103
Maximum accuracy: 0.9975996159385502
Mean accuracy: 0.9960310576708921
Minimum precision: 0.9952047215049797
Maximum precision: 0.9981529368304396
Mean precision: 0.9969674714501278
Mean recall: 0.9938786344065311
Mean f1-score: 0.9954196679741119


### Test 6: Consider only numerical features (Random Forest)

In [199]:
# Fixed seed for the random forest so the reported metrics are reproducible
# across kernel restarts (the forest is otherwise seeded differently on each run).
RANDOM_STATE = 42

# Features: numeric columns only. Target: the 'Benign' label.
X = df[numeric_columns]
y = df['Benign']

# Standardize the numeric columns with StandardScaler before the classifier.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_scaler', StandardScaler(), numeric_columns)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

# 10-fold cross-validation collecting accuracy, precision and recall per fold.
results = cross_validate(pipeline, X, y,
                         scoring = {
                             'accuracy': make_scorer(accuracy_score),
                             'precision': make_scorer(precision_score),
                             'recall': make_scorer(recall_score)},
                         return_estimator=True, cv=10, n_jobs=-1)

# Per-fold spread for accuracy and precision, fold average for recall.
print(f"Minimum accuracy: {results['test_accuracy'].min()}")
print(f"Maximum accuracy: {results['test_accuracy'].max()}")
print(f"Mean accuracy: {results['test_accuracy'].mean()}")
print(f"Minimum precision: {results['test_precision'].min()}")
print(f"Maximum precision: {results['test_precision'].max()}")
print(f"Mean precision: {results['test_precision'].mean()}")
print(f"Mean recall: {results['test_recall'].mean()}")

Minimum accuracy: 0.9764724711907811
Maximum accuracy: 0.9923175416133163
Mean accuracy: 0.9851804739462541
Minimum precision: 0.9837735849056604
Maximum precision: 0.9910044977511244
Mean precision: 0.9870972848382398
Mean recall: 0.9786483342745441
