In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Load the datasets using relative paths (Dataset folder at repository root)
from pathlib import Path
dataset_dir = Path('.') / 'Dataset'

In [2]:
df_api_call = pd.read_csv(dataset_dir / 'API_Functions.csv')


In [3]:
# Check the shape and structure of the data
print(f"Dataset shape: {df_api_call.shape}")
print(f"\nFirst few columns: {df_api_call.columns[:10].tolist()}")
print(f"\nData info:")
print(df_api_call.info())

Dataset shape: (29505, 21920)

First few columns: ['SHA256', 'Type', 'getaclinformation', 'getace', 'getsecuritydescriptordacl', 'regqueryvalueexa', 'regopenkeyexa', 'getsecurityinfo', 'isvalidsid', 'regclosekey']

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29505 entries, 0 to 29504
Columns: 21920 entries, SHA256 to setupdigethwprofilefriendlynameexw
dtypes: int64(21919), object(1)
memory usage: 4.8+ GB
None


In [4]:
# Import XGBoost
import xgboost as xgb
import numpy as np

# Separate features and target
# Assuming 'Type' is the target column and 'SHA256' is the identifier
X = df_api_call.drop(['SHA256', 'Type'], axis=1)
y = df_api_call['Type']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts()}")

Features shape: (29505, 21918)
Target shape: (29505,)
Target distribution:
Type
4    5076
1    5022
3    4957
2    4643
5    4231
6    3699
0    1877
Name: count, dtype: int64


In [5]:
# Train XGBoost model for feature importance
print("Training XGBoost model for feature extraction...")

# Create XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',  # For multiclass probability output
    num_class=7,
    eval_metric='mlogloss',
    n_estimators=200,
    max_depth=12,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=1,
    reg_lambda=0.3,
    reg_alpha=0.3,
    random_state=42,
    n_jobs=-1,
    #early_stopping_rounds=50,
    verbosity=1
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
xgb_model.fit(X_train, y_train)

print("Model training completed!")

Training XGBoost model for feature extraction...
Model training completed!


In [6]:
# Extract feature importance
feature_importance = xgb_model.feature_importances_
feature_names = X.columns.tolist()

# Create a dataframe with feature names and their importance scores
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
})

# Sort by importance and get top 500 features
top_500_features = feature_importance_df.sort_values('importance', ascending=False).head(500)

print(f"Top 500 features extracted!")
print(f"\nTop 10 most important features:")
print(top_500_features.head(10))
print(f"\nImportance score range: {top_500_features['importance'].min():.6f} to {top_500_features['importance'].max():.6f}")

Top 500 features extracted!

Top 10 most important features:
                      feature  importance
19319  oleuiobjectpropertiesa    0.062465
17390      pathfindextensiona    0.052514
287               getmessagea    0.045122
1881          getdlgitemtexta    0.023483
18695         eventsinkaddref    0.023332
19288            vbaaryunlock    0.022474
4170                 polyline    0.020392
1995                 isiconic    0.020010
13035    cryptacquirecontexta    0.017255
19286              vbaarylock    0.012920

Importance score range: 0.000221 to 0.062465


In [None]:
# Create a reduced dataset with only the top 500 features
top_500_feature_names = top_500_features['feature'].tolist()

# Include SHA256 and Type columns along with top 500 features
reduced_dataset = df_api_call[['SHA256', 'Type'] + top_500_feature_names]

# Save the reduced dataset
reduced_output_file = dataset_dir / 'top_500_api_functions.csv'
reduced_dataset.to_csv(reduced_output_file, index=False)

print(f"\nReduced dataset saved to: {reduced_output_file}")
print(f"Reduced dataset shape: {reduced_dataset.shape}")
print(f"Original dataset shape: {df_api_call.shape}")
print(f"\nSize reduction: {df_api_call.shape[1]} columns -> {reduced_dataset.shape[1]} columns")

Top 500 features saved to: Dataset\top_500_features_xgboost.csv

Reduced dataset saved to: Dataset\API_Functions_top500_features.csv
Reduced dataset shape: (29505, 502)
Original dataset shape: (29505, 21920)

Size reduction: 21920 columns -> 502 columns


In [None]:
# Visualize the top 20 features
fig = px.bar(
    top_500_features.head(20),
    x='importance',
    y='feature',
    orientation='h',
    title='Top 20 Most Important Features (XGBoost)',
    labels={'importance': 'Feature Importance', 'feature': 'Feature Name'},
    color='importance',
    color_continuous_scale='Viridis'
)

fig.update_layout(
    height=600,
    yaxis={'categoryorder': 'total ascending'},
    showlegend=False
)

fig.show()

print(f"\nFeature extraction completed successfully!")
print(f"Files created:")
print(f"2. {reduced_output_file} - Reduced dataset with only top 500 features")


Feature extraction completed successfully!
Files created:
1. Dataset\top_500_features_xgboost.csv - List of top 500 features with importance scores
2. Dataset\API_Functions_top500_features.csv - Reduced dataset with only top 500 features
