In [9]:
import pandas as pd
# Import required libraries for Random Forest feature selection and XGBoost
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.utils.class_weight import compute_class_weight
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import label_binarize
from sklearn.metrics import (
    confusion_matrix, precision_recall_curve, average_precision_score,
    precision_score, recall_score, f1_score, matthews_corrcoef, classification_report,
    accuracy_score, make_scorer
)
import time


In [10]:
# Load and standardize datasets
from sklearn.preprocessing import StandardScaler

# Load the datasets
df_pe_header = pd.read_csv('D:\\AI-code\\Personal-AI-Projects\\PEMalwareClassification\\Dataset\\PE_Header.csv')
df_pe_section = pd.read_csv('D:\\AI-code\\Personal-AI-Projects\\PEMalwareClassification\\Dataset\\PE_Section.csv')
df_dlls_import = pd.read_csv('D:\\AI-code\\Personal-AI-Projects\\PEMalwareClassification\\Dataset\\DLLs_Imported.csv')
#df_api_call = pd.read_csv('D:\\AI-code\\Personal-AI-Projects\\PEMalwareClassification\\Dataset\\API_Functions.csv')

print("Original shapes:")
print(f"  PE_Header: {df_pe_header.shape}")
print(f"  PE_Section: {df_pe_section.shape}")
print(f"  DLLs_Imported: {df_dlls_import.shape}")
#print(f"  API_Functions: {df_api_call.shape}")

# Standardize PE_Header (excluding 'Type' and 'SHA256' columns)
scaler_header = StandardScaler()
header_cols_to_standardize = [col for col in df_pe_header.columns if col not in ['Type', 'SHA256']]
df_pe_header[header_cols_to_standardize] = scaler_header.fit_transform(df_pe_header[header_cols_to_standardize])

# Standardize PE_Section (excluding 'Type' and 'SHA256' columns)
scaler_section = StandardScaler()
section_cols_to_standardize = [col for col in df_pe_section.columns if col not in ['Type', 'SHA256']]
df_pe_section[section_cols_to_standardize] = scaler_section.fit_transform(df_pe_section[section_cols_to_standardize])

print("\n✓ PE_Header standardized (mean=0, std=1)")
print("✓ PE_Section standardized (mean=0, std=1)")
print("✓ DLLs_Imported and API_Functions contain binary values (0 or 1) - no standardization needed")

# Create unified list of dataframes for merging
dfs = [df_pe_header, df_pe_section, df_dlls_import] #df_api_call]


Original shapes:
  PE_Header: (29807, 54)
  PE_Section: (29760, 92)
  DLLs_Imported: (29498, 631)

✓ PE_Header standardized (mean=0, std=1)
✓ PE_Section standardized (mean=0, std=1)
✓ DLLs_Imported and API_Functions contain binary values (0 or 1) - no standardization needed

✓ PE_Header standardized (mean=0, std=1)
✓ PE_Section standardized (mean=0, std=1)
✓ DLLs_Imported and API_Functions contain binary values (0 or 1) - no standardization needed


In [11]:
#append all columns (except Type and SHA256) dfs[2] and dfs[1] to dfs[0] and make sure SHA256 matches
for df in dfs[1:]:
    dfs[0] = dfs[0].merge(df.drop(columns=['Type']), on='SHA256', how='left')
main_df = dfs[0]


In [12]:
# Explore the data structure
print("Dataset shape:", main_df.shape)
print("\nColumns:", main_df.columns.tolist())
print("\nData types:")
print(main_df.dtypes.value_counts())

# Identify the target column (should be 'class' for wine dataset)
target_col = 'class' if 'class' in main_df.columns else 'Type'
print(f"\nTarget variable distribution (column: {target_col}):")
print(main_df[target_col].value_counts())
print("\nFirst few rows:")
display(main_df.head())

# Check for missing values
print(f"\nMissing values per column:")
missing_values = main_df.isnull().sum()
print(f"Total columns with missing values: {(missing_values > 0).sum()}")
print(f"Total missing values: {missing_values.sum()}")

# Show columns with most missing values (top 10)
if missing_values.sum() > 0:
    print("\nTop 10 columns with missing values:")
    print(missing_values[missing_values > 0].sort_values(ascending=False).head(10))

# Class distribution percentage plot on bar chart
class_distribution = main_df[target_col].value_counts(normalize=True) * 100
fig = go.Figure(data=[go.Bar(x=class_distribution.index, y=class_distribution.values)])
fig.update_layout(title='Class Distribution (%)', xaxis_title='Class', yaxis_title='Percentage')
fig.show()

# Separate features and target (exclude SHA256, target column, and 'class' if it exists)
cols_to_drop = [target_col, 'SHA256']
if 'class' in main_df.columns and 'class' != target_col:
    cols_to_drop.append('class')
    
X = main_df.drop(columns=cols_to_drop)
y = main_df[target_col]

print(f"\nDropped columns: {cols_to_drop}")
print(f"Original data shape: {X.shape}")
print(f"Missing values in X: {X.isnull().sum().sum()}")

# Handle missing values - fill with 0 (appropriate for malware features)
X_clean = X.fillna(0)
print(f"Missing values after cleaning: {X_clean.isnull().sum().sum()}")

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"Encoded classes: {list(label_encoder.classes_)}")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Dataset shape: (29810, 773)

Columns: ['SHA256', 'Type', 'e_magic', 'e_cblp', 'e_cp', 'e_crlc', 'e_cparhdr', 'e_minalloc', 'e_maxalloc', 'e_ss', 'e_sp', 'e_csum', 'e_ip', 'e_cs', 'e_lfarlc', 'e_ovno', 'e_oemid', 'e_oeminfo', 'e_lfanew', 'Machine', 'NumberOfSections', 'TimeDateStamp', 'PointerToSymbolTable', 'NumberOfSymbols', 'SizeOfOptionalHeader', 'Characteristics', 'Magic', 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode', 'SizeOfInitializedData', 'SizeOfUninitializedData', 'AddressOfEntryPoint', 'BaseOfCode', 'ImageBase', 'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion', 'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion', 'MajorSubsystemVersion', 'MinorSubsystemVersion', 'Reserved1', 'SizeOfImage', 'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics', 'SizeOfStackReserve', 'SizeOfHeapReserve', 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'text_Misc_VirtualSize', 'text_VirtualAddress', 'text_SizeOfRawData', 'text_

Unnamed: 0,SHA256,Type,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,...,odbccp32.dll,api-ms-win-crt-environment-l1-1-0.dll,api-ms-win-core-memory-l1-1-3.dll,api-ms-win-core-datetime-l1-1-0.dll,api-ms-win-core-psapi-ansi-l1-1-0.dll,api-ms-win-core-fibers-l1-1-0.dll,api-ms-win-core-file-l2-1-0.dll,api-ms-win-core-sysinfo-l1-2-0.dll,dbgeng.dll,d3d11.dll
0,dacbe8cb72dd746539792a50e84965fefef73feaa07b5d...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,d3dc7512ce75db33b2c3063fa99245e9ca9fe3b086462f...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,b350fac81533f02981dc2176ed17163177d92d9405758e...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dfee618043a47b7b09305df0ca460559d9f567ee246c7b...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,c7b2e4e4fb2fcc44c953673ff57c3d14bdf5d2008f35e9...,0,0.0,-0.070347,-0.114082,-0.110971,-0.030865,-0.113599,0.116206,-0.109909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Missing values per column:
Total columns with missing values: 719
Total missing values: 223247

Top 10 columns with missing values:
scnpst32.dll                                        313
api-ms-win-core-timezone-private-l1-1-0.dll         313
dwmapi.dll                                          313
wbemcomn.dll                                        313
api-ms-win-core-registry-l2-2-0.dll                 313
api-ms-win-core-shlwapi-obsolete-l1-2-0.dll         313
api-ms-win-core-localization-obsolete-l1-3-0.dll    313
api-ms-win-core-version-l1-1-0.dll                  313
mswsock.dll                                         313
api-ms-win-core-file-l2-1-1.dll                     313
dtype: int64



Dropped columns: ['Type', 'SHA256']
Original data shape: (29810, 771)
Missing values in X: 223247
Missing values after cleaning: 0
Encoded classes: [0, 1, 2, 3, 4, 5, 6]
Missing values after cleaning: 0
Encoded classes: [0, 1, 2, 3, 4, 5, 6]
Training data shape: (23848, 771)
Testing data shape: (5962, 771)
Training data shape: (23848, 771)
Testing data shape: (5962, 771)


In [13]:



print("class weight balancing...")
print(f"Target class distribution:")
unique_classes, counts = np.unique(y_train, return_counts=True)
for cls, count in zip(unique_classes, counts):
    print(f"  Class {cls} ({label_encoder.classes_[cls]}): {count} samples ({count/len(y_train)*100:.1f}%)")

# Compute per-class weights (balanced)
classes = np.unique(y_train)
print(f"Unique classes: {classes}")
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, cw))

print(f"\nComputed class weights:")
for cls, weight in class_weight.items():
    print(f"  Class {cls} ({label_encoder.classes_[cls]}): {weight:.3f}")

# Create sample weights for training data
sample_weight_train = np.array([class_weight[label] for label in y_train])

print(f"\nSample weight statistics:")
print(f"  Min weight: {sample_weight_train.min():.3f}")
print(f"  Max weight: {sample_weight_train.max():.3f}")
print(f"  Mean weight: {sample_weight_train.mean():.3f}")
num_classes = len(classes)

class weight balancing...
Target class distribution:
  Class 0 (0): 1502 samples (6.3%)
  Class 1 (1): 4038 samples (16.9%)
  Class 2 (2): 3891 samples (16.3%)
  Class 3 (3): 3978 samples (16.7%)
  Class 4 (4): 4083 samples (17.1%)
  Class 5 (5): 3391 samples (14.2%)
  Class 6 (6): 2965 samples (12.4%)
Unique classes: [0 1 2 3 4 5 6]

Computed class weights:
  Class 0 (0): 2.268
  Class 1 (1): 0.844
  Class 2 (2): 0.876
  Class 3 (3): 0.856
  Class 4 (4): 0.834
  Class 5 (5): 1.005
  Class 6 (6): 1.149

Sample weight statistics:
  Min weight: 0.834
  Max weight: 2.268
  Mean weight: 1.000


In [None]:
# XGBoost for Feature Selection
print("Training XGBoost for feature importance...")

# Train XGBoost with optimal parameters for feature selection
xgb_feature_selector = xgb.XGBClassifier(
    objective='multi:softprob',  # For multiclass probability output
    num_class=num_classes,
    eval_metric='mlogloss',
    n_estimators=200,
    max_depth=12,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=1,
    reg_lambda=0.3,
    reg_alpha=0.3,
    random_state=42,
    n_jobs=-1,
    #early_stopping_rounds=50,
    verbosity=1
)

# Handle class imbalance for feature selection
classes = np.unique(y_train)
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, cw))
sample_weight_train = np.array([class_weight[label] for label in y_train])

xgb_feature_selector.fit(X_train, y_train, sample_weight=sample_weight_train)

# Get feature importances
feature_names = X.columns.tolist()
feature_importances = xgb_feature_selector.feature_importances_

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)
print("\nFeature importances obtained from XGBoost.")
display(feature_importance_df)

# Create a single combined plot with individual and cumulative importance
# Calculate cumulative importance
cumsum_importance = np.cumsum(feature_importance_df['importance'].values)

# Create figure with single y-axis
fig = go.Figure()

# Add bar chart for individual feature importances
fig.add_trace(
    go.Bar(
        x=feature_importance_df['feature'].values,
        y=feature_importance_df['importance'].values,
        name='Individual feature importance',
        marker=dict(color='lightblue', line=dict(color='darkblue', width=0.5)),
        opacity=0.7,
        text=[f'{val:.4f}' for val in feature_importance_df['importance'].values],
        textposition='inside',
        textfont=dict(size=10, color='black')
    )
)

# Add line chart for cumulative importance
fig.add_trace(
    go.Scatter(
        x=feature_importance_df['feature'].values,
        y=cumsum_importance,
        mode='lines+markers',
        name='Cumulative feature importance',
        line=dict(color='red', width=3),
        marker=dict(size=6, color='darkred')
    )
)

# Update axes labels
fig.update_xaxes(title_text="Feature name (ranked by importance)")
fig.update_yaxes(title_text="Feature importance")

# Update layout
fig.update_layout(
    title_text="XGBoost Feature Importance Analysis - Individual and Cumulative",
    height=600,
    width=900,
    hovermode='x unified',
    legend=dict(
        orientation="v",
        yanchor="middle",
        y=0.5,
        xanchor="right",
        x=0.99
    )
)

fig.show()

# Print statistics about feature importance
print(f"\nTotal number of features: {len(feature_importance_df)}")
print("\nCumulative importance by number of features:")
for i in range(0, 1000, 50):
    if i <= len(feature_importance_df):
        cumsum = feature_importance_df.head(i)['importance'].sum()
        print(f"  Top {i:2d} features contribute: {cumsum:.4f} ({cumsum*100:.1f}%)")


Training XGBoost for feature importance...

Feature importances obtained from XGBoost.

Feature importances obtained from XGBoost.


Unnamed: 0,feature,importance
284,ncrypt.dll,0.195167
504,msvbvm60.dll,0.076267
685,mfc42.dll,0.075722
398,msvcp90.dll,0.052532
81,bss_SizeOfRawData,0.041053
...,...,...
362,api-ms-win-core-string-obsolete-l1-1-0.dll,0.000000
363,api-ms-win-core-kernel32-legacy-l1-1-1.dll,0.000000
365,ninput.dll,0.000000
366,api-ms-win-core-registry-l1-1-1.dll,0.000000



Total number of features: 771

Cumulative importance by number of features:
  Top  0 features contribute: 0.0000 (0.0%)
  Top 50 features contribute: 0.8445 (84.4%)
  Top 100 features contribute: 0.9389 (93.9%)
  Top 150 features contribute: 0.9870 (98.7%)
  Top 200 features contribute: 1.0000 (100.0%)
  Top 250 features contribute: 1.0000 (100.0%)
  Top 300 features contribute: 1.0000 (100.0%)
  Top 350 features contribute: 1.0000 (100.0%)
  Top 400 features contribute: 1.0000 (100.0%)
  Top 450 features contribute: 1.0000 (100.0%)
  Top 500 features contribute: 1.0000 (100.0%)
  Top 550 features contribute: 1.0000 (100.0%)
  Top 600 features contribute: 1.0000 (100.0%)
  Top 650 features contribute: 1.0000 (100.0%)
  Top 700 features contribute: 1.0000 (100.0%)
  Top 750 features contribute: 1.0000 (100.0%)


In [None]:
# Select top features based on XGBoost importance
# Let's use top n_features
n_features = 100
top_features = feature_importance_df.head(n_features)['feature'].tolist()

print(f"Selected top {n_features} features for XGBoost training")
print("Top 10 selected features:")
for i, feature in enumerate(top_features[:10]):
    importance = feature_importance_df[feature_importance_df['feature'] == feature]['importance'].values[0]
    print(f"{i+1:2d}. {feature}: {importance:.4f}")

# Create reduced feature sets
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

print(f"\nOriginal feature set: {X_train.shape[1]} features")
print(f"Selected feature set: {X_train_selected.shape[1]} features")
print(f"Reduction: {(1 - X_train_selected.shape[1]/X_train.shape[1])*100:.1f}%")

Selected top 100 features for XGBoost training
Top 10 selected features:
 1. ncrypt.dll: 0.1952
 2. msvbvm60.dll: 0.0763
 3. mfc42.dll: 0.0757
 4. msvcp90.dll: 0.0525
 5. bss_SizeOfRawData: 0.0411
 6. e_ovno: 0.0382
 7. MajorOperatingSystemVersion: 0.0361
 8. e_cblp: 0.0188
 9. rdata_Characteristics: 0.0182
10. SectionAlignment: 0.0165

Original feature set: 771 features
Selected feature set: 100 features
Reduction: 87.0%


In [None]:
# Bayesian Optimization for XGBoost Hyperparameters with MCC Optimization
print("="*80)
print("BAYESIAN OPTIMIZATION FOR XGBOOST HYPERPARAMETERS")
print("Optimization Metric: Matthews Correlation Coefficient (MCC)")
print("="*80)



# Define MCC scorer for optimization
mcc_scorer = make_scorer(matthews_corrcoef)

# Define the search space for Bayesian Optimization
search_spaces = {
    'n_estimators': Integer(50, 300),
    'max_depth': Integer(3, 20),
    'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'reg_lambda': Real(0.01, 1.0, prior='log-uniform'),
    'reg_alpha': Real(0.01, 1.0, prior='log-uniform')
}

print("\nSearch Space for Bayesian Optimization:")
for param, space in search_spaces.items():
    print(f"  {param}: {space}")

# Create the base XGBoost model
base_xgb = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=num_classes,
    eval_metric='mlogloss',
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=43,
    n_jobs=-1,
    verbosity=0
)

# NO CROSS-VALIDATION: Single train/validation split for maximum speed
print("\nCross-Validation Strategy: Single Train/Validation Split (NO CV)")
print(f"Number of optimization iterations: 30")
print("⚡ FAST MODE: Each hyperparameter combination evaluated only ONCE")

# Bayesian Optimization with BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=base_xgb,
    search_spaces=search_spaces,
    n_iter=30,  # Number of parameter settings sampled
    cv=2,  # Minimum value (single split) - effectively trains once per hyperparameter set
    scoring=mcc_scorer,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    return_train_score=True
)

print("\nStarting Bayesian Optimization...")
print("This should be much faster without cross-validation...")
start_time = time.time()

# Fit with sample weights
bayes_search.fit(X_train_selected, y_train, sample_weight=sample_weight_train)

optimization_time = time.time() - start_time
print(f"\nOptimization completed in {optimization_time:.2f} seconds ({optimization_time/60:.2f} minutes)")

# Get best parameters and score
best_params = bayes_search.best_params_
best_score = bayes_search.best_score_

print("\n" + "="*80)
print("OPTIMIZATION RESULTS")
print("="*80)
print(f"\nBest MCC Score (single validation): {best_score:.4f}")
print(f"\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# Get the best estimator
best_model = bayes_search.best_estimator_

print("\n" + "="*80)
print("BEST MODEL SUMMARY")
print("="*80)
print(f"Best XGBoost Model Configuration:")
print(best_model)

# Show top 5 parameter combinations
print("\n" + "="*80)
print("TOP 5 PARAMETER COMBINATIONS")
print("="*80)
cv_results = pd.DataFrame(bayes_search.cv_results_)
top_5 = cv_results.nlargest(5, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
for idx, row in top_5.iterrows():
    print(f"\nRank {int(row['rank_test_score'])}:")
    print(f"  MCC Score: {row['mean_test_score']:.4f} (+/- {row['std_test_score']:.4f})")
    print(f"  Parameters: {row['params']}")





BAYESIAN OPTIMIZATION FOR XGBOOST HYPERPARAMETERS
Optimization Metric: Matthews Correlation Coefficient (MCC)

Search Space for Bayesian Optimization:
  n_estimators: Integer(low=50, high=300, prior='uniform', transform='identity')
  max_depth: Integer(low=3, high=15, prior='uniform', transform='identity')
  learning_rate: Real(low=0.01, high=0.3, prior='log-uniform', transform='identity')
  subsample: Real(low=0.5, high=1.0, prior='uniform', transform='identity')
  colsample_bytree: Real(low=0.5, high=1.0, prior='uniform', transform='identity')
  reg_lambda: Real(low=0.01, high=2.0, prior='log-uniform', transform='identity')
  reg_alpha: Real(low=0.01, high=2.0, prior='log-uniform', transform='identity')

Cross-Validation Strategy: Single Train/Validation Split (NO CV)
Number of optimization iterations: 30
⚡ FAST MODE: Each hyperparameter combination evaluated only ONCE

Starting Bayesian Optimization...
This should be much faster without cross-validation...

Search Space for Bayesia

In [17]:
# Train Best Model and Make Predictions (NO CROSS-VALIDATION)
print("\n" + "="*80)
print("TRAINING BEST MODEL ON FULL TRAINING SET")
print("="*80)

print("\nSkipping cross-validation for speed...")
print("Training directly on full training set with best hyperparameters...")

start_time = time.time()
best_model.fit(X_train_selected, y_train, sample_weight=sample_weight_train)
training_time = time.time() - start_time

print(f"Training completed in {training_time:.2f} seconds")

# Make predictions
print("\nMaking predictions on test set...")
y_pred = best_model.predict(X_test_selected)
y_pred_proba = best_model.predict_proba(X_test_selected)

print("Predictions completed!")

# Quick evaluation on training and test sets
from sklearn.metrics import accuracy_score

train_pred = best_model.predict(X_train_selected)
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, y_pred)
train_mcc = matthews_corrcoef(y_train, train_pred)
test_mcc = matthews_corrcoef(y_test, y_pred)

print("\n" + "="*80)
print("QUICK PERFORMANCE SUMMARY")
print("="*80)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:     {test_acc:.4f}")
print(f"Training MCC:      {train_mcc:.4f}")
print(f"Test MCC:          {test_mcc:.4f}")
print("\n(Detailed metrics will be shown in the final evaluation report)")



TRAINING BEST MODEL ON FULL TRAINING SET

Skipping cross-validation for speed...
Training directly on full training set with best hyperparameters...
Training completed in 20.84 seconds

Making predictions on test set...
Training completed in 20.84 seconds

Making predictions on test set...
Predictions completed!
Predictions completed!

QUICK PERFORMANCE SUMMARY
Training Accuracy: 0.9178
Test Accuracy:     0.8774
Training MCC:      0.9046
Test MCC:          0.8571

(Detailed metrics will be shown in the final evaluation report)

QUICK PERFORMANCE SUMMARY
Training Accuracy: 0.9178
Test Accuracy:     0.8774
Training MCC:      0.9046
Test MCC:          0.8571

(Detailed metrics will be shown in the final evaluation report)


In [18]:
# Comprehensive Metrics Calculation for Final Report
print("\n" + "="*80)
print("FINAL MODEL EVALUATION REPORT")
print("="*80)


# 1. Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("\n1. CONFUSION MATRIX")
print("-"*60)
print(cm)

# 2. Calculate all required metrics
class_names = label_encoder.classes_
class_names_str = [str(name) for name in class_names]
n_classes = len(class_names)

# Binarize the output for multi-class metrics
y_test_binarized = label_binarize(y_test, classes=list(range(n_classes)))

# Per-class metrics
precision_per_class = []
recall_per_class = []
f1_per_class = []
tnr_per_class = []  # True Negative Rate (Specificity)
npv_per_class = []  # Negative Predictive Value
fpr_per_class = []  # False Positive Rate
fnr_per_class = []  # False Negative Rate

print("\n" + "="*80)
print("PER-CLASS METRICS CALCULATION")
print("="*80)

for i in range(n_classes):
    # True Positives, False Positives, True Negatives, False Negatives
    tp = cm[i, i]
    fp = cm[:, i].sum() - tp
    fn = cm[i, :].sum() - tp
    tn = cm.sum() - tp - fp - fn
    
    # Precision (Positive Predictive Value)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    precision_per_class.append(precision)
    
    # Recall (True Positive Rate / Sensitivity)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    recall_per_class.append(recall)
    
    # F1 Score
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    f1_per_class.append(f1)
    
    # True Negative Rate (Specificity)
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
    tnr_per_class.append(tnr)
    
    # Negative Predictive Value
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0
    npv_per_class.append(npv)
    
    # False Positive Rate
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fpr_per_class.append(fpr)
    
    # False Negative Rate
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fnr_per_class.append(fnr)

# Calculate macro averages
macro_precision = np.mean(precision_per_class)
macro_recall = np.mean(recall_per_class)  # Macro TPR
macro_f1 = np.mean(f1_per_class)
macro_tnr = np.mean(tnr_per_class)
macro_npv = np.mean(npv_per_class)
macro_fpr = np.mean(fpr_per_class)
macro_fnr = np.mean(fnr_per_class)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_value = matthews_corrcoef(y_test, y_pred)

# Calculate Precision-Recall curves and AUC-PR for each class
pr_curves_data = []
pr_auc_scores = []

for i in range(n_classes):
    precision_curve, recall_curve, _ = precision_recall_curve(
        y_test_binarized[:, i], y_pred_proba[:, i]
    )
    pr_auc = average_precision_score(y_test_binarized[:, i], y_pred_proba[:, i])
    pr_auc_scores.append(pr_auc)
    pr_curves_data.append({
        'class': class_names_str[i],
        'precision': precision_curve,
        'recall': recall_curve,
        'auc': pr_auc
    })

# Macro-average AUC-PR
macro_pr_auc = np.mean(pr_auc_scores)

print("\nMetrics calculated successfully!")
print(f"Number of test samples: {len(y_test)}")
print(f"Number of classes: {n_classes}")


FINAL MODEL EVALUATION REPORT

1. CONFUSION MATRIX
------------------------------------------------------------
[[360   1   2   6   4   2   0]
 [  5 951   4  13  27   4   5]
 [  2   3 961   6   1   0   0]
 [ 16   6   3 815  17  75  63]
 [  4  49   0  22 887  22  37]
 [  1   0   0   7  10 593 237]
 [  2   4   0  10   4  57 664]]

PER-CLASS METRICS CALCULATION

Metrics calculated successfully!
Number of test samples: 5962
Number of classes: 7


In [19]:
# FINAL REPORT - Visualizations and Metrics
print("\n" + "="*80)
print("FINAL EVALUATION REPORT")
print("="*80)


# 1. CONFUSION MATRIX HEATMAP
print("\n1. Confusion Matrix")
print("-"*60)
print(cm)

fig_cm = go.Figure(data=go.Heatmap(
    z=cm,
    x=[f'{class_names_str[i]}' for i in range(n_classes)],
    y=[f'{class_names_str[i]}' for i in range(n_classes)],
    colorscale='Blues',
    text=cm,
    texttemplate="%{text}",
    textfont={"size": 14},
    hovertemplate='True: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

fig_cm.update_layout(
    title='Confusion Matrix',
    xaxis_title='Predicted Class',
    yaxis_title='True Class',
    width=600,
    height=500,
    font=dict(size=12)
)

fig_cm.show()

# 2. PRECISION-RECALL CURVES
print("\n2. Precision-Recall Curves")
print("-"*60)

fig_pr = go.Figure()

# Color palette for classes
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'cyan', 'magenta']

# Plot PR curve for each class
for i, pr_data in enumerate(pr_curves_data):
    fig_pr.add_trace(go.Scatter(
        x=pr_data['recall'],
        y=pr_data['precision'],
        mode='lines',
        name=f"{pr_data['class']} (AUC={pr_data['auc']:.3f})",
        line=dict(color=colors[i % len(colors)], width=2)
    ))

# Add macro-average line
fig_pr.add_trace(go.Scatter(
    x=[0, 1],
    y=[macro_pr_auc, macro_pr_auc],
    mode='lines',
    name=f'Macro Avg AUC-PR={macro_pr_auc:.3f}',
    line=dict(color='black', width=3, dash='dash')
))

fig_pr.update_layout(
    title='Precision-Recall Curves (All Classes)',
    xaxis_title='Recall',
    yaxis_title='Precision',
    width=900,
    height=600,
    legend=dict(x=0.02, y=0.02),
    xaxis=dict(range=[0, 1]),
    yaxis=dict(range=[0, 1])
)

fig_pr.show()

print(f"Macro-Average AUC-PR: {macro_pr_auc:.4f}")
for i, pr_data in enumerate(pr_curves_data):
    print(f"  {pr_data['class']}: {pr_data['auc']:.4f}")

# 3. FINAL METRICS SUMMARY
print("\n" + "="*80)
print("FINAL METRICS SUMMARY (Macro Averages)")
print("="*80)

metrics_summary = {
    '3. Macro-Average Area Under Precision-Recall Curve': macro_pr_auc,
    '4. Macro TPR (True Positive Rate / Recall / Sensitivity)': macro_recall,
    '5. Macro Precision': macro_precision,
    '6. Macro Positive F1-Score': macro_f1,
    '7. Macro TNR (True Negative Rate / Specificity)': macro_tnr,
    '8. Macro NPV (Negative Predictive Value)': macro_npv,
    '9. Macro FPR (False Positive Rate)': macro_fpr,
    '10. Macro FNR (False Negative Rate)': macro_fnr,
    '11. MCC (Matthews Correlation Coefficient)': mcc_value
}

for idx, (metric_name, value) in enumerate(metrics_summary.items(), 1):
    print(f"{metric_name}: {value:.4f}")

# Create a comprehensive summary table
print("\n" + "="*80)
print("COMPREHENSIVE METRICS TABLE")
print("="*80)
print(f"{'Metric':<50} {'Value':<10}")
print("="*80)
for metric_name, value in metrics_summary.items():
    print(f"{metric_name:<50} {value:<10.4f}")
print("="*80)

# Per-class breakdown (additional information)
print("\n" + "="*80)
print("PER-CLASS METRICS BREAKDOWN")
print("="*80)
print(f"{'Class':<15} {'Precision':<12} {'Recall/TPR':<12} {'F1':<10} {'TNR':<10} {'NPV':<10} {'FPR':<10} {'FNR':<10}")
print("-"*100)
for i in range(n_classes):
    print(f"{class_names_str[i]:<15} {precision_per_class[i]:<12.4f} {recall_per_class[i]:<12.4f} "
          f"{f1_per_class[i]:<10.4f} {tnr_per_class[i]:<10.4f} {npv_per_class[i]:<10.4f} "
          f"{fpr_per_class[i]:<10.4f} {fnr_per_class[i]:<10.4f}")
print("-"*100)
print(f"{'MACRO AVG':<15} {macro_precision:<12.4f} {macro_recall:<12.4f} "
      f"{macro_f1:<10.4f} {macro_tnr:<10.4f} {macro_npv:<10.4f} "
      f"{macro_fpr:<10.4f} {macro_fnr:<10.4f}")
print("="*100)


FINAL EVALUATION REPORT

1. Confusion Matrix
------------------------------------------------------------
[[360   1   2   6   4   2   0]
 [  5 951   4  13  27   4   5]
 [  2   3 961   6   1   0   0]
 [ 16   6   3 815  17  75  63]
 [  4  49   0  22 887  22  37]
 [  1   0   0   7  10 593 237]
 [  2   4   0  10   4  57 664]]



2. Precision-Recall Curves
------------------------------------------------------------


Macro-Average AUC-PR: 0.9355
  0: 0.9851
  1: 0.9834
  2: 0.9987
  3: 0.9400
  4: 0.9669
  5: 0.8566
  6: 0.8175

FINAL METRICS SUMMARY (Macro Averages)
3. Macro-Average Area Under Precision-Recall Curve: 0.9355
4. Macro TPR (True Positive Rate / Recall / Sensitivity): 0.8819
5. Macro Precision: 0.8800
6. Macro Positive F1-Score: 0.8773
7. Macro TNR (True Negative Rate / Specificity): 0.9797
8. Macro NPV (Negative Predictive Value): 0.9795
9. Macro FPR (False Positive Rate): 0.0203
10. Macro FNR (False Negative Rate): 0.1181
11. MCC (Matthews Correlation Coefficient): 0.8571

COMPREHENSIVE METRICS TABLE
Metric                                             Value     
3. Macro-Average Area Under Precision-Recall Curve 0.9355    
4. Macro TPR (True Positive Rate / Recall / Sensitivity) 0.8819    
5. Macro Precision                                 0.8800    
6. Macro Positive F1-Score                         0.8773    
7. Macro TNR (True Negative Rate / Specificity)    0.9797    
8. Macro NP