In [1]:
# train.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Import classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

# --------------------------
# Load and Prepare the Data
# --------------------------
# Read the whole dataset from 'data.csv'; change the path here to use another file.
df = pd.read_csv('data.csv')

# Label column, followed by the ten feature columns the models are trained on.
target = 'Benign'
features = [
    'Machine',
    'DebugSize',
    'MajorImageVersion',
    'ExportSize',
    'IatVRA',
    'NumberOfSections',
    'SizeOfStackReserve',
    'DllCharacteristics',
    'ResourceSize',
    'BitcoinAddresses',
]

# Feature matrix and label vector.
X = df[features]
y = df[target]

# Hold out 20% of the rows for validation. The split is seeded (42) and
# stratified on the label so both parts keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# --------------------------
# Define the Models
# --------------------------
# Candidate classifiers keyed by the name used in the printed reports.
# Every estimator is seeded with random_state=42.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # ignores it and only emits a "Parameters ... are not used" warning.
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42)
}

# Validation accuracy of every model, keyed by model name.
results = {}
# Running record of the best model seen so far. The accuracy tracker starts
# below any attainable score so that the first evaluated model is always
# recorded, even if it scores exactly 0.0.
best_accuracy = float('-inf')
best_model_name = None
best_model = None

# --------------------------
# Train and Evaluate Models
# --------------------------
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Predictions on the validation set
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    print(f"Accuracy for {name}: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(y_val, y_pred))

    results[name] = acc
    # Strict comparison: on a tie the model evaluated first is kept.
    if acc > best_accuracy:
        best_accuracy = acc
        best_model_name = name
        best_model = model

# Without this guard an empty `models` dict would pickle None and still
# report success.
if best_model is None:
    raise RuntimeError("No model was trained; nothing to save.")

print("\n--------------------------------")
print(f"Best Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")
print("--------------------------------")

# Save the best model to disk
joblib.dump(best_model, 'best_model.pkl')
print("Best model saved to 'best_model.pkl'")



Training RandomForest...
Accuracy for RandomForest: 0.9960
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7073
           1       1.00      1.00      1.00      5424

    accuracy                           1.00     12497
   macro avg       1.00      1.00      1.00     12497
weighted avg       1.00      1.00      1.00     12497


Training LogisticRegression...
Accuracy for LogisticRegression: 0.8454
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.94      0.87      7073
           1       0.91      0.72      0.80      5424

    accuracy                           0.85     12497
   macro avg       0.86      0.83      0.84     12497
weighted avg       0.85      0.85      0.84     12497


Training DecisionTree...
Accuracy for DecisionTree: 0.9937
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0

Parameters: { "use_label_encoder" } are not used.



Accuracy for XGBoost: 0.9953
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7073
           1       0.99      1.00      0.99      5424

    accuracy                           1.00     12497
   macro avg       1.00      1.00      1.00     12497
weighted avg       1.00      1.00      1.00     12497


--------------------------------
Best Model: RandomForest with Accuracy: 0.9960
--------------------------------
Best model saved to 'best_model.pkl'


In [6]:
# Show the last five rows of the feature matrix.
X.tail(n=5)

Unnamed: 0,Machine,DebugSize,MajorImageVersion,ExportSize,IatVRA,NumberOfSections,SizeOfStackReserve,DllCharacteristics,ResourceSize,BitcoinAddresses
62480,332,0,1,0,4096,3,1048576,0,23504,0
62481,332,0,7,0,0,7,1048576,0,15704,0
62482,332,0,0,0,404908,11,1048576,0,2364,0
62483,332,0,0,70,4096,4,1048576,0,130296,0
62484,332,0,0,0,4096,6,1048576,0,6912,0
