In [1]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('generated_fake_data.csv')

# Convert 'Benign' to binary labels using a threshold
threshold = 1e-30  # From your previous fix
data['Benign'] = (data['Benign'] > threshold).astype(int)

# Define features and target
features = ['Machine', 'DebugSize', 'DebugRVA', 'MajorImageVersion', 'MajorOSVersion',
            'ExportRVA', 'ExportSize', 'IatVRA', 'MajorLinkerVersion', 'MinorLinkerVersion',
            'NumberOfSections', 'SizeOfStackReserve', 'DllCharacteristics', 'ResourceSize',
            'BitcoinAddresses']
target = 'Benign'

# Prepare features (X) and target (y)
X = data[features]
y = data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier with parameters to reduce accuracy
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_estimators=10,            # Fewer trees
    learning_rate=0.5,          # Higher learning rate for faster, less precise convergence
    max_depth=3,                # Shallower trees
    subsample=0.5,              # Use only 50% of training data per tree
    colsample_bytree=0.5,       # Use only 50% of features per tree
    reg_lambda=10,              # Stronger L2 regularization
    scale_pos_weight=5          # Bias toward minority class to reduce accuracy
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = xgb_model.feature_importances_
for feature, importance in zip(features, feature_importance):
    print(f"{feature}: {importance:.4f}")

# Save the model
xgb_model.save_model('xgboost_model_reduced.json')

Accuracy: 0.9950

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        33
           1       1.00      0.99      1.00       167

    accuracy                           0.99       200
   macro avg       0.99      1.00      0.99       200
weighted avg       1.00      0.99      1.00       200

Machine: 0.0112
DebugSize: 0.0833
DebugRVA: 0.0000
MajorImageVersion: 0.0319
MajorOSVersion: 0.1109
ExportRVA: 0.0071
ExportSize: 0.0297
IatVRA: 0.0258
MajorLinkerVersion: 0.0167
MinorLinkerVersion: 0.0161
NumberOfSections: 0.0000
SizeOfStackReserve: 0.0563
DllCharacteristics: 0.1616
ResourceSize: 0.0617
BitcoinAddresses: 0.3877
