In [2]:
import pandas as pd
import shap
import numpy as np
import joblib

# Step 1: Load dataset and model
dataset = pd.read_csv("final_features.csv")  # Update path if needed
X = dataset.drop(columns=['app_name', 'class'])
y = dataset['class']

# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model = joblib.load("saved_models/XGBoost.pkl")  # Or your model

# Step 2: SHAP values (X is used both as background data and as the rows explained)
explainer = shap.Explainer(model, X)
shap_values = explainer(X)

# Step 3: Mean |SHAP| per feature
# Features live on axis 1. Averaging over every other axis keeps mean_shap
# one-dimensional even if the explainer returns one SHAP matrix per model
# output (3-D values); for ordinary 2-D values this equals mean(axis=0).
abs_shap = np.abs(shap_values.values)
reduce_axes = tuple(axis for axis in range(abs_shap.ndim) if axis != 1)
mean_shap = abs_shap.mean(axis=reduce_axes)
assert mean_shap.shape == (X.shape[1],), "expected one mean |SHAP| value per feature"

# Step 4: Filter out features with SHAP = 0
# The mask is computed once so the kept/dropped lists and the report always agree.
keep_mask = mean_shap != 0
important_features = X.columns[keep_mask]
dropped_features = X.columns[~keep_mask]

# Step 5: Filter dataset and combine with app_name & class
# NOTE(review): selection uses every row of the dataset, including rows that a
# later train/test split will hold out - confirm this is acceptable, since it
# lets held-out rows influence which features are kept.
X_filtered = X[important_features]
filtered_dataset = pd.concat([dataset[['app_name', 'class']], X_filtered], axis=1)

# Step 6: Save filtered dataset to CSV
filtered_dataset.to_csv("filtered_dataset.csv", index=False)

# Step 7: Save kept & dropped features to Excel
feature_df = pd.DataFrame({
    'Feature': X.columns,
    'Mean_SHAP_Value': mean_shap,
    'Kept': keep_mask
})
feature_df.to_excel("shap_feature_importance.xlsx", index=False)

print("✅ Files saved:")
print("• filtered_dataset.csv (cleaned dataset)")
print("• shap_feature_importance.xlsx (with kept/dropped features)")




✅ Files saved:
• filtered_dataset.csv (cleaned dataset)
• shap_feature_importance.xlsx (with kept/dropped features)


In [5]:
# Read filtered_dataset.csv back from disk into `dc`.
dc = pd.read_csv("filtered_dataset.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
dc

Unnamed: 0,app_name,class,P_BLUETOOTH_ADMIN,P_READ_PHONE_STATE,P_SEND_SMS,A_ACCESSIBILITYEVENT,A_ACTIVITYMANAGER,A_ADREQUEST,A_ALERTDIALOG,A_AN,...,A_THREAD$UNCAUGHTEXCEPTIONHANDLER,A_URLENCODER,A_VALUEANIMATOR,A_VIEW,A_VIEWTREEOBSERVER,A_WEAKHASHMAP,A_WEBVIEWCLIENT,A_WRITER,A_XMLREADER,I_BOOT_COMPLETED
0,SEWOO Print Service Plugin,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Via,0,0,0,0,1,1,0,0,1,...,1,1,1,1,1,1,1,1,0,0
2,Multi Launcher,0,0,0,0,1,1,0,0,0,...,0,0,1,1,1,1,0,1,0,1
3,Sharp Shooter,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,0,0
4,RAD Calc,0,0,0,0,0,0,0,1,0,...,1,1,1,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,MX-Player,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1034,Android 系统服务,1,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1035,Install,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1036,BlackList Pro,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.model_selection import train_test_split

# Predictors: every column except the app identifier and the target label.
input_dc = dc.drop(columns=['app_name', 'class'])
# Target: the `class` column.
output_dc = dc['class']

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
# train_test_split returns the pieces in the order (X_train, X_test, y_train, y_test).
train_x, test_x, train_y, test_y = train_test_split(
    input_dc,
    output_dc,
    test_size=0.25,
    random_state=42,
)

In [12]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score

# Train an XGBoost classifier on the training split and report accuracy and
# precision on the held-out split.

# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# emits a "Parameters: { "use_label_encoder" } are not used" warning.
# `logloss` is the binary log-loss metric; the previous `mlogloss` is the
# multiclass variant. NOTE(review): the displayed rows show only class 0/1 -
# confirm the target is binary.
model = XGBClassifier(eval_metric='logloss')
model.fit(train_x, train_y)

# Predict labels for the held-out rows
preds = model.predict(test_x)

# Accuracy, and precision averaged across classes weighted by class support
accuracy = accuracy_score(test_y, preds)
precision = precision_score(test_y, preds, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

Accuracy: 0.9885
Precision: 0.9885


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
