In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Wrap the train/validation splits in LightGBM Dataset objects
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)
# Tie the validation set to the training set explicitly so it is always
# constructed with the training set's feature binning, which is what LightGBM
# expects of validation data (lgb.train would otherwise set this implicitly).
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [None]:
# Booster configuration handed to lgb.train
params = dict(
    objective='binary',          # two-class classification
    metric='binary_logloss',     # metric evaluated on each entry of valid_sets
    verbosity=-1,
    boosting_type='gbdt',
    seed=42,                     # fixed seed for repeatable runs
)

In [None]:
# Fit the booster for at most 200 rounds. The early-stopping callback halts
# training once the validation metric has gone 20 rounds without improving;
# the training set is also listed in valid_sets so its metric is evaluated too,
# but the callback ignores it when deciding whether to stop.
early_stop = lgb.early_stopping(stopping_rounds=20)
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=200,
    valid_sets=[train_data, val_data],
    callbacks=[early_stop],
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Score the validation split. With the 'binary' objective the booster's
# predict() yields a probability per row, so threshold at 0.5 for hard labels.
y_pred = model.predict(X_val)
y_pred_binary = (y_pred > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_val, y_pred_binary))
# sep="" stops print() from inserting a space after the newline, which would
# push the report's header row one column out of line with the rows below it.
print("Classification Report:\n", classification_report(y_val, y_pred_binary), sep="")

# Draw on an explicit Axes so the labels and title cannot attach to a stale figure.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_val, y_pred_binary), annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

# plot_importance returns the Axes it drew on; title that Axes directly.
ax = lgb.plot_importance(model, max_num_features=20, importance_type='gain', figsize=(10, 6))
ax.set_title("Top 20 Feature Importances")
plt.show()


In [None]:
import pickle

# Serialize the trained booster to disk.
# NOTE: unpickling can execute arbitrary code, so only load this file back
# from a location you trust.
with open('new_malware_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)
