In [1]:
import pandas as pd
# Import required libraries for Random Forest feature selection and XGBoost
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
import nbformat
from sklearn.utils.class_weight import compute_class_weight
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import label_binarize
from sklearn.metrics import (
    confusion_matrix, precision_recall_curve, average_precision_score,
    precision_score, recall_score, f1_score, matthews_corrcoef, classification_report,
    accuracy_score, make_scorer
)
import time
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import joblib

In [2]:


dataset_dir = Path('.') / 'Dataset'
# Use pathlib Path objects with pandas (pandas accepts path-like objects)
df_pe_header = pd.read_csv(dataset_dir / 'pe_header.csv')
df_pe_section = pd.read_csv(dataset_dir / 'pe_section.csv')
df_dlls_import = pd.read_csv(dataset_dir / 'dlls_imported.csv')
df_api_call = pd.read_csv(dataset_dir / 'top_500_api_functions.csv')

print("Original shapes:")
print(f"  PE_Header: {df_pe_header.shape}")
print(f"  PE_Section: {df_pe_section.shape}")
print(f"  DLLs_Imported: {df_dlls_import.shape}")
print(f"  API_Functions: {df_api_call.shape}")

# Standardize PE_Header (excluding 'Type' and 'SHA256' columns)
scaler_header = StandardScaler()
header_cols_to_standardize = [col for col in df_pe_header.columns if col not in ['Type', 'SHA256']]
df_pe_header[header_cols_to_standardize] = scaler_header.fit_transform(df_pe_header[header_cols_to_standardize])

# Standardize PE_Section (excluding 'Type' and 'SHA256' columns)
scaler_section = StandardScaler()
section_cols_to_standardize = [col for col in df_pe_section.columns if col not in ['Type', 'SHA256']]
df_pe_section[section_cols_to_standardize] = scaler_section.fit_transform(df_pe_section[section_cols_to_standardize])

print("\n✓ PE_Header standardized (mean=0, std=1)")
print("✓ PE_Section standardized (mean=0, std=1)")
print("✓ DLLs_Imported and API_Functions contain binary values (0 or 1) - no standardization needed")

# Create unified list of dataframes for merging
dfs = [df_pe_header, df_pe_section, df_dlls_import, df_api_call]


Original shapes:
  PE_Header: (29807, 54)
  PE_Section: (29760, 92)
  DLLs_Imported: (29498, 631)
  API_Functions: (29505, 502)

✓ PE_Header standardized (mean=0, std=1)
✓ PE_Section standardized (mean=0, std=1)
✓ DLLs_Imported and API_Functions contain binary values (0 or 1) - no standardization needed

✓ PE_Header standardized (mean=0, std=1)
✓ PE_Section standardized (mean=0, std=1)
✓ DLLs_Imported and API_Functions contain binary values (0 or 1) - no standardization needed


In [3]:
#append all columns (except Type and SHA256) dfs[2] and dfs[1] to dfs[0] and make sure SHA256 matches
for df in dfs[1:]:
    dfs[0] = dfs[0].merge(df.drop(columns=['Type']), on='SHA256', how='inner')
main_df = dfs[0]
display(main_df.head())



Unnamed: 0,SHA256,Type,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,...,releasemutex,copyfileexw,getprivateprofilesectionw,setclassword,getconsolescreenbufferinfo,getsavefilenamew,vbavar2vec,setdefaultcommconfigw,invertrect,gdipcloneimage
0,dacbe8cb72dd746539792a50e84965fefef73feaa07b5d...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0
1,d3dc7512ce75db33b2c3063fa99245e9ca9fe3b086462f...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0
2,b350fac81533f02981dc2176ed17163177d92d9405758e...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0
3,dfee618043a47b7b09305df0ca460559d9f567ee246c7b...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0
4,c7b2e4e4fb2fcc44c953673ff57c3d14bdf5d2008f35e9...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0


In [4]:
#write main_df to csv
main_df.to_csv(dataset_dir / 'merged_data.csv', index=False)

In [5]:
# Explore the data structure
print("Dataset shape:", main_df.shape)
print("\nColumns:", main_df.columns.tolist())
print("\nData types:")
print(main_df.dtypes.value_counts())

# Identify the target column (should be 'class' for wine dataset)
target_col = 'class' if 'class' in main_df.columns else 'Type'
print(f"\nTarget variable distribution (column: {target_col}):")
print(main_df[target_col].value_counts())
print("\nFirst few rows:")
display(main_df.head())

# Check for missing values
print(f"\nMissing values per column:")
missing_values = main_df.isnull().sum()
print(f"Total columns with missing values: {(missing_values > 0).sum()}")
print(f"Total missing values: {missing_values.sum()}")

# Show columns with most missing values (top 10)
if missing_values.sum() > 0:
    print("\nTop 10 columns with missing values:")
    print(missing_values[missing_values > 0].sort_values(ascending=False).head(10))

# Class distribution percentage plot on bar chart
class_distribution = main_df[target_col].value_counts(normalize=True) * 100
fig = go.Figure(data=[go.Bar(x=class_distribution.index, y=class_distribution.values)])
fig.update_layout(title='Class Distribution (%)', xaxis_title='Class', yaxis_title='Percentage')
fig.show()

# Separate features and target (exclude SHA256, target column, and 'class' if it exists)
cols_to_drop = [target_col, 'SHA256']
if 'class' in main_df.columns and 'class' != target_col:
    cols_to_drop.append('class')
    
X = main_df.drop(columns=cols_to_drop)
y = main_df[target_col]

print(f"\nDropped columns: {cols_to_drop}")
print(f"Original data shape: {X.shape}")
print(f"Missing values in X: {X.isnull().sum().sum()}")

# Handle missing values - fill with 0 (appropriate for malware features)
X_clean = X.fillna(0)
print(f"Missing values after cleaning: {X_clean.isnull().sum().sum()}")

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"Encoded classes: {list(label_encoder.classes_)}")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Dataset shape: (29499, 1273)

Columns: ['SHA256', 'Type', 'e_magic', 'e_cblp', 'e_cp', 'e_crlc', 'e_cparhdr', 'e_minalloc', 'e_maxalloc', 'e_ss', 'e_sp', 'e_csum', 'e_ip', 'e_cs', 'e_lfarlc', 'e_ovno', 'e_oemid', 'e_oeminfo', 'e_lfanew', 'Machine', 'NumberOfSections', 'TimeDateStamp', 'PointerToSymbolTable', 'NumberOfSymbols', 'SizeOfOptionalHeader', 'Characteristics', 'Magic', 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode', 'SizeOfInitializedData', 'SizeOfUninitializedData', 'AddressOfEntryPoint', 'BaseOfCode', 'ImageBase', 'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion', 'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion', 'MajorSubsystemVersion', 'MinorSubsystemVersion', 'Reserved1', 'SizeOfImage', 'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics', 'SizeOfStackReserve', 'SizeOfHeapReserve', 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'text_Misc_VirtualSize', 'text_VirtualAddress', 'text_SizeOfRawData', 'text

Unnamed: 0,SHA256,Type,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,...,releasemutex,copyfileexw,getprivateprofilesectionw,setclassword,getconsolescreenbufferinfo,getsavefilenamew,vbavar2vec,setdefaultcommconfigw,invertrect,gdipcloneimage
0,dacbe8cb72dd746539792a50e84965fefef73feaa07b5d...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0
1,d3dc7512ce75db33b2c3063fa99245e9ca9fe3b086462f...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0
2,b350fac81533f02981dc2176ed17163177d92d9405758e...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0
3,dfee618043a47b7b09305df0ca460559d9f567ee246c7b...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0
4,c7b2e4e4fb2fcc44c953673ff57c3d14bdf5d2008f35e9...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0,0,0,0,0,0,0,0,0,0



Missing values per column:
Total columns with missing values: 0
Total missing values: 0



Dropped columns: ['Type', 'SHA256']
Original data shape: (29499, 1271)
Missing values in X: 0
Missing values after cleaning: 0
Encoded classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
Training data shape: (23599, 1271)
Testing data shape: (5900, 1271)
Training data shape: (23599, 1271)
Testing data shape: (5900, 1271)


In [6]:



print("class weight balancing...")
print(f"Target class distribution:")
unique_classes, counts = np.unique(y_train, return_counts=True)
for cls, count in zip(unique_classes, counts):
    print(f"  Class {cls} ({label_encoder.classes_[cls]}): {count} samples ({count/len(y_train)*100:.1f}%)")

# Compute per-class weights (balanced)
classes = np.unique(y_train)
print(f"Unique classes: {classes}")
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, cw))

print(f"\nComputed class weights:")
for cls, weight in class_weight.items():
    print(f"  Class {cls} ({label_encoder.classes_[cls]}): {weight:.3f}")

# Create sample weights for training data
sample_weight_train = np.array([class_weight[label] for label in y_train])

print(f"\nSample weight statistics:")
print(f"  Min weight: {sample_weight_train.min():.3f}")
print(f"  Max weight: {sample_weight_train.max():.3f}")
print(f"  Mean weight: {sample_weight_train.mean():.3f}")
num_classes = len(classes)

class weight balancing...
Target class distribution:
  Class 0 (0): 1502 samples (6.4%)
  Class 1 (1): 4017 samples (17.0%)
  Class 2 (2): 3714 samples (15.7%)
  Class 3 (3): 3964 samples (16.8%)
  Class 4 (4): 4061 samples (17.2%)
  Class 5 (5): 3382 samples (14.3%)
  Class 6 (6): 2959 samples (12.5%)
Unique classes: [0 1 2 3 4 5 6]

Computed class weights:
  Class 0 (0): 2.245
  Class 1 (1): 0.839
  Class 2 (2): 0.908
  Class 3 (3): 0.850
  Class 4 (4): 0.830
  Class 5 (5): 0.997
  Class 6 (6): 1.139

Sample weight statistics:
  Min weight: 0.830
  Max weight: 2.245
  Mean weight: 1.000


In [7]:
# XGBoost for Feature Selection
print("Training XGBoost for feature importance...")

# Train XGBoost with optimal parameters for feature selection
xgb_feature_selector = xgb.XGBClassifier(
    objective='multi:softprob',  # For multiclass probability output
    num_class=num_classes,
    eval_metric='mlogloss',
    n_estimators=200,
    max_depth=12,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=1,
    reg_lambda=0.3,
    reg_alpha=0.3,
    random_state=999,
    n_jobs=-1,
    #early_stopping_rounds=50,
    verbosity=1
)

# Handle class imbalance for feature selection
classes = np.unique(y_train)
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, cw))
sample_weight_train = np.array([class_weight[label] for label in y_train])

xgb_feature_selector.fit(X_train, y_train, sample_weight=sample_weight_train)

# Get feature importances
feature_names = X.columns.tolist()
feature_importances = xgb_feature_selector.feature_importances_

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)
print("\nFeature importances obtained from XGBoost.")
display(feature_importance_df)

# Create a single combined plot with individual and cumulative importance
# Calculate cumulative importance
cumsum_importance = np.cumsum(feature_importance_df['importance'].values)

# Create figure with single y-axis
fig = go.Figure()

# Add bar chart for individual feature importances
fig.add_trace(
    go.Bar(
        x=feature_importance_df['feature'].values,
        y=feature_importance_df['importance'].values,
        name='Individual feature importance',
        marker=dict(color='lightblue', line=dict(color='darkblue', width=0.5)),
        opacity=0.7,
        text=[f'{val:.4f}' for val in feature_importance_df['importance'].values],
        textposition='inside',
        textfont=dict(size=10, color='black')
    )
)

# Add line chart for cumulative importance
fig.add_trace(
    go.Scatter(
        x=feature_importance_df['feature'].values,
        y=cumsum_importance,
        mode='lines+markers',
        name='Cumulative feature importance',
        line=dict(color='red', width=3),
        marker=dict(size=6, color='darkred')
    )
)

# Update axes labels
fig.update_xaxes(title_text="Feature name (ranked by importance)")
fig.update_yaxes(title_text="Feature importance")

# Update layout
fig.update_layout(
    title_text="XGBoost Feature Importance Analysis - Individual and Cumulative",
    height=600,
    width=900,
    hovermode='x unified',
    legend=dict(
        orientation="v",
        yanchor="middle",
        y=0.5,
        xanchor="right",
        x=0.99
    )
)

fig.show()

# Print statistics about feature importance
print(f"\nTotal number of features: {len(feature_importance_df)}")
print("\nCumulative importance by number of features:")
for i in range(0, 1273, 50):
    if i <= len(feature_importance_df):
        cumsum = feature_importance_df.head(i)['importance'].sum()
        print(f"  Top {i:2d} features contribute: {cumsum:.4f} ({cumsum*100:.1f}%)")


Training XGBoost for feature importance...

Feature importances obtained from XGBoost.

Feature importances obtained from XGBoost.


Unnamed: 0,feature,importance
775,eventsinkaddref,0.152113
773,getmessagea,0.062509
939,exit,0.055885
793,strxfrm,0.055753
779,cryptacquirecontexta,0.053193
...,...,...
1226,getwindowextex,0.000000
1232,strcmpi,0.000000
546,api-ms-win-core-timezone-private-l1-1-0.dll,0.000000
1269,invertrect,0.000000



Total number of features: 1271

Cumulative importance by number of features:
  Top  0 features contribute: 0.0000 (0.0%)
  Top 50 features contribute: 0.7890 (78.9%)
  Top 100 features contribute: 0.8571 (85.7%)
  Top 150 features contribute: 0.8980 (89.8%)
  Top 200 features contribute: 0.9259 (92.6%)
  Top 250 features contribute: 0.9459 (94.6%)
  Top 300 features contribute: 0.9612 (96.1%)
  Top 350 features contribute: 0.9732 (97.3%)
  Top 400 features contribute: 0.9826 (98.3%)
  Top 450 features contribute: 0.9901 (99.0%)
  Top 500 features contribute: 0.9959 (99.6%)
  Top 550 features contribute: 0.9995 (99.9%)
  Top 600 features contribute: 1.0000 (100.0%)
  Top 650 features contribute: 1.0000 (100.0%)
  Top 700 features contribute: 1.0000 (100.0%)
  Top 750 features contribute: 1.0000 (100.0%)
  Top 800 features contribute: 1.0000 (100.0%)
  Top 850 features contribute: 1.0000 (100.0%)
  Top 900 features contribute: 1.0000 (100.0%)
  Top 950 features contribute: 1.0000 (100.0

In [8]:
# Select top features based on XGBoost importance
# Let's use top n_features
n_features = 1000
top_features = feature_importance_df.head(n_features)['feature'].tolist()

print(f"Selected top {n_features} features for XGBoost training")

# Create reduced feature sets
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

print(f"\nOriginal feature set: {X_train.shape[1]} features")
print(f"Selected feature set: {X_train_selected.shape[1]} features")
print(f"Reduction: {(1 - X_train_selected.shape[1]/X_train.shape[1])*100:.1f}%")

Selected top 1000 features for XGBoost training

Original feature set: 1271 features
Selected feature set: 1000 features
Reduction: 21.3%

Original feature set: 1271 features
Selected feature set: 1000 features
Reduction: 21.3%


In [None]:
# Bayesian Optimization for Random Forest Hyperparameters with MCC Optimization
print("="*80)
print("BAYESIAN OPTIMIZATION FOR RANDOM FOREST HYPERPARAMETERS")
print("Optimization Metric: Matthews Correlation Coefficient (MCC)")
print("="*80)

from sklearn.ensemble import RandomForestClassifier

# Define MCC scorer for optimization
mcc_scorer = make_scorer(matthews_corrcoef)

# Define the search space for Bayesian Optimization
# Random Forest hyperparameters optimized for large datasets
search_spaces = {
    'n_estimators': Integer(100, 500),           # Number of trees in the forest
    'max_depth': Integer(10, 50),                # Maximum depth of the trees
    'min_samples_split': Integer(2, 20),         # Minimum samples required to split an internal node
    'min_samples_leaf': Integer(1, 10),          # Minimum samples required to be at a leaf node
    'max_features': Categorical(['sqrt', 'log2', None]),  # Number of features to consider for best split
}

print("\nSearch Space for Bayesian Optimization:")
for param, space in search_spaces.items():
    print(f"  {param}: {space}")

# Create the base Random Forest model
base_rf = RandomForestClassifier(
    class_weight='balanced_subsample',
    random_state=999,
    n_jobs=-1,
    verbose=0
)

print("\nNote: Random Forest is an ensemble of decision trees")

print(f"\nCross-Validation Strategy: 2-Fold Stratified CV")
print(f"Number of optimization iterations: 30")

# Bayesian Optimization with BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=base_rf,
    search_spaces=search_spaces,
    n_iter=30,  # More iterations for complex hyperparameter space
    cv=2,
    scoring=mcc_scorer,
    n_jobs=-1,
    verbose=2,
    random_state=999,
    return_train_score=True
)

print("\nStarting Bayesian Optimization...")
start_time = time.time()

# Fit with sample weights to handle class imbalance
bayes_search.fit(X_train_selected, y_train, sample_weight=sample_weight_train)

optimization_time = time.time() - start_time
print(f"\nOptimization completed in {optimization_time:.2f} seconds ({optimization_time/60:.2f} minutes)")

# Get best parameters and score
best_params = bayes_search.best_params_
best_score = bayes_search.best_score_

print("\n" + "="*80)
print("OPTIMIZATION RESULTS")
print("="*80)
print(f"\nBest MCC Score (CV): {best_score:.4f}")
print(f"\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# Get the best estimator
best_model = bayes_search.best_estimator_

print("\n" + "="*80)
print("BEST MODEL SUMMARY")
print("="*80)
print(f"Best Random Forest Model Configuration:")
print(best_model)

# Show top 10 parameter combinations
print("\n" + "="*80)
print("TOP 10 PARAMETER COMBINATIONS")
print("="*80)
cv_results = pd.DataFrame(bayes_search.cv_results_)
top_10 = cv_results.nlargest(10, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
for idx, row in top_10.iterrows():
    print(f"\nRank {int(row['rank_test_score'])}:")
    print(f"  MCC Score: {row['mean_test_score']:.4f} (+/- {row['std_test_score']:.4f})")
    print(f"  Parameters: {row['params']}")

BAYESIAN OPTIMIZATION FOR RANDOM FOREST HYPERPARAMETERS
Optimization Metric: Matthews Correlation Coefficient (MCC)

Search Space for Bayesian Optimization:
  n_estimators: Integer(low=100, high=500, prior='uniform', transform='identity')
  max_depth: Integer(low=10, high=50, prior='uniform', transform='identity')
  min_samples_split: Integer(low=2, high=20, prior='uniform', transform='identity')
  min_samples_leaf: Integer(low=1, high=10, prior='uniform', transform='identity')
  max_features: Categorical(categories=('sqrt', 'log2', None), prior=None)

Note: Random Forest is an ensemble of decision trees

Cross-Validation Strategy: 2-Fold Stratified CV
Number of optimization iterations: 10

Starting Bayesian Optimization...
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 

In [10]:
# 5-Fold Cross-Validation with Best Model
print("\n" + "="*80)
print("5-FOLD CROSS-VALIDATION WITH BEST MODEL")
print("="*80)



# Define custom scoring metrics for cross-validation
scoring_metrics = {
    'mcc': mcc_scorer,
    'accuracy': 'accuracy',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'f1_macro': 'f1_macro'
}

print("\nPerforming 5-fold cross-validation with best model...")
# Define 5-fold stratified cross-validation
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=999)

cv_results_best = cross_validate(
    best_model, 
    X_train_selected, 
    y_train,
    cv=cv_strategy,
    scoring=scoring_metrics,
    return_train_score=True,
    n_jobs=-1
)

print("\nCross-Validation Results (5-Fold):")
print("-"*60)
for metric in ['mcc', 'accuracy', 'precision_macro', 'recall_macro', 'f1_macro']:
    test_scores = cv_results_best[f'test_{metric}']
    train_scores = cv_results_best[f'train_{metric}']
    print(f"{metric.upper()}:")
    print(f"  Test:  {test_scores.mean():.4f} (+/- {test_scores.std():.4f})")
    print(f"  Train: {train_scores.mean():.4f} (+/- {train_scores.std():.4f})")

# Retrain best model on full training data
print("\n" + "="*80)
print("TRAINING BEST MODEL ON FULL TRAINING SET")
print("="*80)

start_time = time.time()
best_model.fit(X_train_selected, y_train, sample_weight=sample_weight_train)
training_time = time.time() - start_time

print(f"Training completed in {training_time:.2f} seconds")

# Make predictions
y_pred = best_model.predict(X_test_selected)
y_pred_proba = best_model.predict_proba(X_test_selected)

print("Predictions completed!")



5-FOLD CROSS-VALIDATION WITH BEST MODEL

Performing 5-fold cross-validation with best model...

Cross-Validation Results (5-Fold):
------------------------------------------------------------
MCC:
  Test:  0.8576 (+/- 0.0017)
  Train: 0.9571 (+/- 0.0012)
ACCURACY:
  Test:  0.8780 (+/- 0.0014)
  Train: 0.9630 (+/- 0.0011)
PRECISION_MACRO:
  Test:  0.8839 (+/- 0.0015)
  Train: 0.9651 (+/- 0.0009)
RECALL_MACRO:
  Test:  0.8794 (+/- 0.0022)
  Train: 0.9654 (+/- 0.0011)
F1_MACRO:
  Test:  0.8783 (+/- 0.0013)
  Train: 0.9631 (+/- 0.0011)

TRAINING BEST MODEL ON FULL TRAINING SET

Cross-Validation Results (5-Fold):
------------------------------------------------------------
MCC:
  Test:  0.8576 (+/- 0.0017)
  Train: 0.9571 (+/- 0.0012)
ACCURACY:
  Test:  0.8780 (+/- 0.0014)
  Train: 0.9630 (+/- 0.0011)
PRECISION_MACRO:
  Test:  0.8839 (+/- 0.0015)
  Train: 0.9651 (+/- 0.0009)
RECALL_MACRO:
  Test:  0.8794 (+/- 0.0022)
  Train: 0.9654 (+/- 0.0011)
F1_MACRO:
  Test:  0.8783 (+/- 0.0013)
  Tra

In [11]:
# Comprehensive Metrics Calculation for Final Report
print("\n" + "="*80)
print("FINAL MODEL EVALUATION REPORT")
print("="*80)


# 1. Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("\n1. CONFUSION MATRIX")
print("-"*60)
print(cm)

# 2. Calculate all required metrics
class_names = label_encoder.classes_
class_names_str = [str(name) for name in class_names]
n_classes = len(class_names)

# Binarize the output for multi-class metrics
y_test_binarized = label_binarize(y_test, classes=list(range(n_classes)))

# Per-class metrics
precision_per_class = []
recall_per_class = []
f1_per_class = []
tnr_per_class = []  # True Negative Rate (Specificity)
npv_per_class = []  # Negative Predictive Value
fpr_per_class = []  # False Positive Rate
fnr_per_class = []  # False Negative Rate

print("\n" + "="*80)
print("PER-CLASS METRICS CALCULATION")
print("="*80)

for i in range(n_classes):
    # True Positives, False Positives, True Negatives, False Negatives
    tp = cm[i, i]
    fp = cm[:, i].sum() - tp
    fn = cm[i, :].sum() - tp
    tn = cm.sum() - tp - fp - fn
    
    # Precision (Positive Predictive Value)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    precision_per_class.append(precision)
    
    # Recall (True Positive Rate / Sensitivity)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    recall_per_class.append(recall)
    
    # F1 Score
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    f1_per_class.append(f1)
    
    # True Negative Rate (Specificity)
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
    tnr_per_class.append(tnr)
    
    # Negative Predictive Value
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0
    npv_per_class.append(npv)
    
    # False Positive Rate
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fpr_per_class.append(fpr)
    
    # False Negative Rate
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fnr_per_class.append(fnr)

# Calculate macro averages
macro_precision = np.mean(precision_per_class)
macro_recall = np.mean(recall_per_class)  # Macro TPR
macro_f1 = np.mean(f1_per_class)
macro_tnr = np.mean(tnr_per_class)
macro_npv = np.mean(npv_per_class)
macro_fpr = np.mean(fpr_per_class)
macro_fnr = np.mean(fnr_per_class)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_value = matthews_corrcoef(y_test, y_pred)

# Calculate Precision-Recall curves and AUC-PR for each class
pr_curves_data = []
pr_auc_scores = []

for i in range(n_classes):
    precision_curve, recall_curve, _ = precision_recall_curve(
        y_test_binarized[:, i], y_pred_proba[:, i]
    )
    pr_auc = average_precision_score(y_test_binarized[:, i], y_pred_proba[:, i])
    pr_auc_scores.append(pr_auc)
    pr_curves_data.append({
        'class': class_names_str[i],
        'precision': precision_curve,
        'recall': recall_curve,
        'auc': pr_auc
    })

# Macro-average AUC-PR
macro_pr_auc = np.mean(pr_auc_scores)

print("\nMetrics calculated successfully!")
print(f"Number of test samples: {len(y_test)}")
print(f"Number of classes: {n_classes}")


FINAL MODEL EVALUATION REPORT

1. CONFUSION MATRIX
------------------------------------------------------------
[[349   1   3  14   1   4   3]
 [  2 954   0   6  31   5   7]
 [  1   5 922   1   0   0   0]
 [ 13   8   5 804  12  70  79]
 [  0  31   4  25 899  13  43]
 [  3   1   4  14   3 583 237]
 [  2   0   0  18   2  28 690]]

PER-CLASS METRICS CALCULATION

Metrics calculated successfully!
Number of test samples: 5900
Number of classes: 7


In [12]:
# FINAL REPORT - Visualizations and Metrics
print("\n" + "="*80)
print("FINAL EVALUATION REPORT")
print("="*80)


# 1. CONFUSION MATRIX HEATMAP
print("\n1. Confusion Matrix")
print("-"*60)
print(cm)

fig_cm = go.Figure(data=go.Heatmap(
    z=cm,
    x=[f'{class_names_str[i]}' for i in range(n_classes)],
    y=[f'{class_names_str[i]}' for i in range(n_classes)],
    colorscale='Blues',
    text=cm,
    texttemplate="%{text}",
    textfont={"size": 14},
    hovertemplate='True: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

fig_cm.update_layout(
    title='Confusion Matrix',
    xaxis_title='Predicted Class',
    yaxis_title='True Class',
    width=600,
    height=500,
    font=dict(size=12)
)

fig_cm.show()

# 2. PRECISION-RECALL CURVES
print("\n2. Precision-Recall Curves")
print("-"*60)

fig_pr = go.Figure()

# Color palette for classes
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'cyan', 'magenta']

# Plot PR curve for each class
for i, pr_data in enumerate(pr_curves_data):
    fig_pr.add_trace(go.Scatter(
        x=pr_data['recall'],
        y=pr_data['precision'],
        mode='lines',
        name=f"{pr_data['class']} (AUC={pr_data['auc']:.3f})",
        line=dict(color=colors[i % len(colors)], width=2)
    ))

# Add macro-average line
fig_pr.add_trace(go.Scatter(
    x=[0, 1],
    y=[macro_pr_auc, macro_pr_auc],
    mode='lines',
    name=f'Macro Avg AUC-PR={macro_pr_auc:.3f}',
    line=dict(color='black', width=3, dash='dash')
))

fig_pr.update_layout(
    title='Precision-Recall Curves (All Classes)',
    xaxis_title='Recall',
    yaxis_title='Precision',
    width=900,
    height=600,
    legend=dict(x=0.02, y=0.02),
    xaxis=dict(range=[0, 1]),
    yaxis=dict(range=[0, 1])
)

fig_pr.show()

print(f"Macro-Average AUC-PR: {macro_pr_auc:.4f}")
for i, pr_data in enumerate(pr_curves_data):
    print(f"  {pr_data['class']}: {pr_data['auc']:.4f}")

# 3. FINAL METRICS SUMMARY
print("\n" + "="*80)
print("FINAL METRICS SUMMARY (Macro Averages)")
print("="*80)

metrics_summary = {
    '3. Macro-Average Area Under Precision-Recall Curve': macro_pr_auc,
    '4. Macro TPR (True Positive Rate / Recall / Sensitivity)': macro_recall,
    '5. Macro Precision': macro_precision,
    '6. Macro Positive F1-Score': macro_f1,
    '7. Macro TNR (True Negative Rate / Specificity)': macro_tnr,
    '8. Macro NPV (Negative Predictive Value)': macro_npv,
    '9. Macro FPR (False Positive Rate)': macro_fpr,
    '10. Macro FNR (False Negative Rate)': macro_fnr,
    '11. MCC (Matthews Correlation Coefficient)': mcc_value
}

for idx, (metric_name, value) in enumerate(metrics_summary.items(), 1):
    print(f"{metric_name}: {value:.4f}")

# Create a comprehensive summary table
print("\n" + "="*80)
print("COMPREHENSIVE METRICS TABLE")
print("="*80)
print(f"{'Metric':<50} {'Value':<10}")
print("="*80)
for metric_name, value in metrics_summary.items():
    print(f"{metric_name:<50} {value:<10.4f}")
print("="*80)

# Per-class breakdown (additional information)
print("\n" + "="*80)
print("PER-CLASS METRICS BREAKDOWN")
print("="*80)
print(f"{'Class':<15} {'Precision':<12} {'Recall/TPR':<12} {'F1':<10} {'TNR':<10} {'NPV':<10} {'FPR':<10} {'FNR':<10}")
print("-"*100)
for i in range(n_classes):
    print(f"{class_names_str[i]:<15} {precision_per_class[i]:<12.4f} {recall_per_class[i]:<12.4f} "
          f"{f1_per_class[i]:<10.4f} {tnr_per_class[i]:<10.4f} {npv_per_class[i]:<10.4f} "
          f"{fpr_per_class[i]:<10.4f} {fnr_per_class[i]:<10.4f}")
print("-"*100)
print(f"{'MACRO AVG':<15} {macro_precision:<12.4f} {macro_recall:<12.4f} "
      f"{macro_f1:<10.4f} {macro_tnr:<10.4f} {macro_npv:<10.4f} "
      f"{macro_fpr:<10.4f} {macro_fnr:<10.4f}")
print("="*100)


FINAL EVALUATION REPORT

1. Confusion Matrix
------------------------------------------------------------
[[349   1   3  14   1   4   3]
 [  2 954   0   6  31   5   7]
 [  1   5 922   1   0   0   0]
 [ 13   8   5 804  12  70  79]
 [  0  31   4  25 899  13  43]
 [  3   1   4  14   3 583 237]
 [  2   0   0  18   2  28 690]]



2. Precision-Recall Curves
------------------------------------------------------------


Macro-Average AUC-PR: 0.9397
  0: 0.9810
  1: 0.9868
  2: 0.9976
  3: 0.9370
  4: 0.9705
  5: 0.8829
  6: 0.8219

FINAL METRICS SUMMARY (Macro Averages)
3. Macro-Average Area Under Precision-Recall Curve: 0.9397
4. Macro TPR (True Positive Rate / Recall / Sensitivity): 0.8845
5. Macro Precision: 0.8887
6. Macro Positive F1-Score: 0.8816
7. Macro TNR (True Negative Rate / Specificity): 0.9803
8. Macro NPV (Negative Predictive Value): 0.9803
9. Macro FPR (False Positive Rate): 0.0197
10. Macro FNR (False Negative Rate): 0.1155
11. MCC (Matthews Correlation Coefficient): 0.8626

COMPREHENSIVE METRICS TABLE
Metric                                             Value     
3. Macro-Average Area Under Precision-Recall Curve 0.9397    
4. Macro TPR (True Positive Rate / Recall / Sensitivity) 0.8845    
5. Macro Precision                                 0.8887    
6. Macro Positive F1-Score                         0.8816    
7. Macro TNR (True Negative Rate / Specificity)    0.9803    
8. Macro NP

In [13]:
# Save the trained model and related components
print("\n" + "="*80)
print("SAVING MODEL AND COMPONENTS")
print("="*80)

import joblib
from pathlib import Path

# Create a directory for saved models
models_dir = Path('.') / 'saved_models' / 'random_forest'
models_dir.mkdir(parents=True, exist_ok=True)

# Save the best model
model_path = models_dir / 'random_forest_best_model.joblib'
joblib.dump(best_model, model_path)
print(f"✓ Model saved to: {model_path}")

# Save the label encoder (needed to decode predictions)
encoder_path = models_dir / 'random_forest_label_encoder.joblib'
joblib.dump(label_encoder, encoder_path)
print(f"✓ Label encoder saved to: {encoder_path}")

# Save the top features list (needed to select correct features from new data)
features_path = models_dir / 'random_forest_top_features.joblib'
joblib.dump(top_features, features_path)
print(f"✓ Top features list saved to: {features_path}")

# Save the scaler objects (needed to standardize new data the same way)
scaler_header_path = models_dir / 'random_forest_scaler_header.joblib'
joblib.dump(scaler_header, scaler_header_path)
print(f"✓ PE Header scaler saved to: {scaler_header_path}")

scaler_section_path = models_dir / 'random_forest_scaler_section.joblib'
joblib.dump(scaler_section, scaler_section_path)
print(f"✓ PE Section scaler saved to: {scaler_section_path}")

# Save metadata about the model
metadata = {
    'n_features': len(top_features),
    'n_classes': n_classes,
    'class_names': list(label_encoder.classes_),
    'best_params': best_params,
    'best_mcc_score': best_score,
    'test_mcc_score': mcc_value,
    'test_macro_f1': macro_f1,
    'test_macro_precision': macro_precision,
    'test_macro_recall': macro_recall
}

metadata_path = models_dir / 'random_forest_metadata.joblib'
joblib.dump(metadata, metadata_path)
print(f"✓ Model metadata saved to: {metadata_path}")

print("\n" + "="*80)
print("MODEL COMPONENTS SAVED SUCCESSFULLY")
print("="*80)
print(f"\nSaved files:")
print(f"  1. {model_path.name} - Trained Random Forest model")
print(f"  2. {encoder_path.name} - Label encoder for class names")
print(f"  3. {features_path.name} - List of {len(top_features)} selected features")
print(f"  4. {scaler_header_path.name} - Scaler for PE Header features")
print(f"  5. {scaler_section_path.name} - Scaler for PE Section features")
print(f"  6. {metadata_path.name} - Model performance and configuration")
print(f"\nAll files saved in: {models_dir.absolute()}")

# Print file sizes
print("\n" + "="*80)
print("FILE SIZES")
print("="*80)
total_size = 0
for file_path in [model_path, encoder_path, features_path, scaler_header_path, scaler_section_path, metadata_path]:
    if file_path.exists():
        size_mb = file_path.stat().st_size / (1024 * 1024)
        total_size += size_mb
        print(f"{file_path.name:<30} {size_mb:>8.2f} MB")

print(f"{'='*30} {total_size:>8.2f} MB")
print(f"\nTotal size of saved model components: {total_size:.2f} MB")


SAVING MODEL AND COMPONENTS
✓ Model saved to: saved_models\random_forest\random_forest_best_model.joblib
✓ Label encoder saved to: saved_models\random_forest\random_forest_label_encoder.joblib
✓ Top features list saved to: saved_models\random_forest\random_forest_top_features.joblib
✓ PE Header scaler saved to: saved_models\random_forest\random_forest_scaler_header.joblib
✓ PE Section scaler saved to: saved_models\random_forest\random_forest_scaler_section.joblib
✓ Model metadata saved to: saved_models\random_forest\random_forest_metadata.joblib

MODEL COMPONENTS SAVED SUCCESSFULLY

Saved files:
  1. random_forest_best_model.joblib - Trained Random Forest model
  2. random_forest_label_encoder.joblib - Label encoder for class names
  3. random_forest_top_features.joblib - List of 1000 selected features
  4. random_forest_scaler_header.joblib - Scaler for PE Header features
  5. random_forest_scaler_section.joblib - Scaler for PE Section features
  6. random_forest_metadata.joblib - Mo

## Loading the Saved Random Forest Model

Below is an example of how to load the saved model and use it for predictions on new data.

In [14]:
# Example: Load the saved Random Forest model and components
import joblib
from pathlib import Path

# Path to saved models
models_dir = Path('.') / 'saved_models' / 'random_forest'

# Load all components
loaded_model = joblib.load(models_dir / 'random_forest_best_model.joblib')
loaded_encoder = joblib.load(models_dir / 'random_forest_label_encoder.joblib')
loaded_features = joblib.load(models_dir / 'random_forest_top_features.joblib')
loaded_scaler_header = joblib.load(models_dir / 'random_forest_scaler_header.joblib')
loaded_scaler_section = joblib.load(models_dir / 'random_forest_scaler_section.joblib')
loaded_metadata = joblib.load(models_dir / 'random_forest_metadata.joblib')

print("Model loaded successfully!")
print(f"\nModel type: {type(loaded_model).__name__}")
print(f"Number of features: {loaded_metadata['n_features']}")
print(f"Number of classes: {loaded_metadata['n_classes']}")
print(f"Class names: {loaded_metadata['class_names']}")
print(f"\nBest hyperparameters: {loaded_metadata['best_params']}")
print(f"Best CV MCC score: {loaded_metadata['best_mcc_score']:.4f}")
print(f"Test MCC score: {loaded_metadata['test_mcc_score']:.4f}")
print(f"Test Macro F1 score: {loaded_metadata['test_macro_f1']:.4f}")

Model loaded successfully!

Model type: RandomForestClassifier
Number of features: 1000
Number of classes: 7
Class names: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

Best hyperparameters: OrderedDict([('max_depth', 27), ('n_estimators', 342)])
Best CV MCC score: 0.8467
Test MCC score: 0.8626
Test Macro F1 score: 0.8816


In [15]:
# Example: Make predictions on new data using the loaded model
# This demonstrates how to prepare new data and get predictions

# For demonstration, let's use a few samples from the test set
example_indices = [0, 10, 20, 30, 40]  # Just a few examples
example_data = X_test.iloc[example_indices].copy()
example_labels = y_test.iloc[example_indices].copy()

# Make predictions
predictions = loaded_model.predict(example_data)

# Decode the predictions to get class names
predicted_classes = loaded_encoder.inverse_transform(predictions)
true_classes = loaded_encoder.inverse_transform(example_labels)

# Display results
print("Example Predictions:")
print("="*80)
for i, (true_class, pred_class) in enumerate(zip(true_classes, predicted_classes)):
    match = "✓" if true_class == pred_class else "✗"
    print(f"Sample {i+1}: True={true_class:<15} Predicted={pred_class:<15} {match}")

# Get prediction probabilities
proba = loaded_model.predict_proba(example_data)
print("\n" + "="*80)
print("Prediction Probabilities:")
print("="*80)
for i in range(len(example_data)):
    print(f"\nSample {i+1} (True: {true_classes[i]}, Predicted: {predicted_classes[i]}):")
    for j, class_name in enumerate(loaded_encoder.classes_):
        print(f"  {class_name:<15}: {proba[i][j]*100:>6.2f}%")

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'