In [3]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [4]:
main_df = pd.read_csv("C:\\Users\\Awerfast\\Documents\\Swinburne\\cos30049\\_testing\\AI_testing\\API_Functions\\API_Functions.csv")


In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
import numpy as np

In [6]:
# Explore the data structure
print("Dataset shape:", main_df.shape)
print("\nColumns:", main_df.columns.tolist())
print("\nData types:")
print(main_df.dtypes.value_counts())
print("\nTarget variable distribution:")
print(main_df['Type'].value_counts())
print("\nFirst few rows:")
print(main_df.head())

# Check for missing values
print(f"\nMissing values per column:")
missing_values = main_df.isnull().sum()
print(f"Total columns with missing values: {(missing_values > 0).sum()}")
print(f"Total missing values: {missing_values.sum()}")

# Show columns with most missing values (top 10)
if missing_values.sum() > 0:
    print("\nTop 10 columns with missing values:")
    print(missing_values[missing_values > 0].sort_values(ascending=False).head(10))

#class distribution percentage plot on bar chart
class_distribution = main_df['Type'].value_counts(normalize=True) * 100
fig = go.Figure(data=[go.Bar(x=class_distribution.index, y=class_distribution.values)])
fig.update_layout(title='Class Distribution (%)', xaxis_title='Class', yaxis_title='Percentage')
fig.show()

# Separate features and target
X = main_df.drop(columns=['SHA256', 'Type'])
y = main_df['Type']

print(f"\nOriginal data shape: {X.shape}")
print(f"Missing values in X: {X.isnull().sum().sum()}")

# Handle missing values - fill with 0 (appropriate for malware features)
X_clean = X.fillna(0)
print(f"Missing values after cleaning: {X_clean.isnull().sum().sum()}")

# Apply SMOTE to the cleaned data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_clean, y)

print(f"\nOriginal class distribution:")
print(pd.Series(y).value_counts())
print(f"\nResampled class distribution:")
print(pd.Series(y_resampled).value_counts())

#plot the new class distribution
resampled_class_distribution = pd.Series(y_resampled).value_counts(normalize=True) * 100
fig = go.Figure(data=[go.Bar(x=resampled_class_distribution.index, y=resampled_class_distribution.values)])
fig.update_layout(title='Resampled Class Distribution (%)', xaxis_title='Class', yaxis_title='Percentage')
fig.show()
warnings.filterwarnings('ignore')

# Use the SMOTE-resampled data from the previous cell
# Convert resampled data back to DataFrame for consistency
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_series = pd.Series(y_resampled)

print("Using SMOTE-resampled balanced dataset:")
print(f"Original dataset shape: {X.shape}")
print(f"Resampled dataset shape: {X_resampled_df.shape}")

# Handle any missing values (though SMOTE shouldn't introduce any)
X_resampled_df = X_resampled_df.fillna(0)


Dataset shape: (29505, 21920)


Data types:
int64     21919
object        1
Name: count, dtype: int64

Target variable distribution:
Type
4    5076
1    5022
3    4957
2    4643
5    4231
6    3699
0    1877
Name: count, dtype: int64

First few rows:
                                              SHA256  Type  getaclinformation  \
0  002ce0d28ec990aadbbc89df457189de37d8adaadc9c08...     0                  1   
1  dacbe8cb72dd746539792a50e84965fefef73feaa07b5d...     0                  0   
2  d3dc7512ce75db33b2c3063fa99245e9ca9fe3b086462f...     0                  0   
3  b350fac81533f02981dc2176ed17163177d92d9405758e...     0                  0   
4  dfee618043a47b7b09305df0ca460559d9f567ee246c7b...     0                  0   

   getace  getsecuritydescriptordacl  regqueryvalueexa  regopenkeyexa  \
0       1                          1                 1              1   
1       0                          0                 0              0   
2       0                          0       


Original data shape: (29505, 21918)
Missing values in X: 0
Missing values after cleaning: 0

Original class distribution:
Type
4    5076
1    5022
3    4957
2    4643
5    4231
6    3699
0    1877
Name: count, dtype: int64

Resampled class distribution:
Type
0    5076
1    5076
2    5076
3    5076
4    5076
5    5076
6    5076
Name: count, dtype: int64


Using SMOTE-resampled balanced dataset:
Original dataset shape: (29505, 21918)
Resampled dataset shape: (35532, 21918)


In [7]:

# Encode the target variable
label_encoder = LabelEncoder()
y_resampled_encoded = label_encoder.fit_transform(y_resampled_series)

print(f"Feature matrix shape: {X_resampled_df.shape}")
print(f"Target classes: {label_encoder.classes_}")
print(f"Number of samples per class (after SMOTE): {np.bincount(y_resampled_encoded)}")

# Split the resampled data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled_df, y_resampled_encoded, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_resampled_encoded
)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
print(f"Training class distribution: {np.bincount(y_train)}")
print(f"Test class distribution: {np.bincount(y_test)}")

Feature matrix shape: (35532, 21918)
Target classes: [0 1 2 3 4 5 6]
Number of samples per class (after SMOTE): [5076 5076 5076 5076 5076 5076 5076]
Training set: (28425, 21918), Test set: (7107, 21918)
Training class distribution: [4061 4061 4061 4060 4061 4061 4060]
Test class distribution: [1015 1015 1015 1016 1015 1015 1016]


In [None]:


# Random Forest for Feature Selection
print("Training Random Forest for feature importance...")

# Train Random Forest with optimal parameters for feature selection
rf_feature_selector = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    max_depth=10,
    class_weight= 'balanced'
)

rf_feature_selector.fit(X_train, y_train)

# Get feature importances
feature_names = X.columns.tolist()
feature_importances = rf_feature_selector.feature_importances_

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

# Visualize Feature Importance (All features ranked in descending order) with Plotly
top_n = len(feature_importance_df)  # Show all features
top_features = feature_importance_df.head(top_n)

# Create horizontal bar chart with Plotly (fixed - single trace with text)
fig = go.Figure()
fig.add_trace(go.Bar(
    x=top_features['importance'],
    y=top_features['feature'],
    orientation='h',
    text=[f'{val:.4f}' for val in top_features['importance']],
    textposition='auto',
    marker=dict(color='skyblue', line=dict(color='darkblue', width=0.5))
))

fig.update_layout(
    title=f'Top {top_n} Feature Importances (Random Forest)',
    xaxis_title='Feature Importance',
    yaxis_title='Features',
    height=max(800, top_n * 20),  # Adjust height based on number of features
    yaxis=dict(autorange="reversed"),  # Show most important features at top
    showlegend=False
)

fig.show()

# Create subplots for distribution and cumulative importance
fig_analysis = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Distribution of Feature Importances', 'Cumulative Feature Importance'),
    horizontal_spacing=0.1
)

# Distribution histogram
fig_analysis.add_trace(
    go.Histogram(
        x=feature_importance_df['importance'],
        nbinsx=50,
        marker=dict(color='skyblue', opacity=0.7),
        name='Distribution'
    ),
    row=1, col=1
)

# Cumulative importance line plot
cumsum_importance = np.cumsum(feature_importance_df['importance'].values)
fig_analysis.add_trace(
    go.Scatter(
        x=list(range(1, len(cumsum_importance) + 1)),
        y=cumsum_importance,
        mode='lines',
        line=dict(color='blue', width=2),
        name='Cumulative'
    ),
    row=1, col=2
)

# Update layout for subplots
fig_analysis.update_xaxes(title_text="Feature Importance", row=1, col=1)
fig_analysis.update_yaxes(title_text="Number of Features", row=1, col=1)
fig_analysis.update_xaxes(title_text="Number of Features (ranked by importance)", row=1, col=2)
fig_analysis.update_yaxes(title_text="Cumulative Importance", row=1, col=2)

fig_analysis.update_layout(
    height=500,
    showlegend=False,
    title_text="Feature Importance Analysis"
)

fig_analysis.show()

# Print statistics about feature importance
print(f"Total number of features: {len(feature_importance_df)}")
print(f"Top 10 features contribute: {feature_importance_df.head(10)['importance'].sum():.3f} ({feature_importance_df.head(10)['importance'].sum()*100:.1f}%)")
print(f"Top 50 features contribute: {feature_importance_df.head(50)['importance'].sum():.3f} ({feature_importance_df.head(50)['importance'].sum()*100:.1f}%)")
print(f"Top 100 features contribute: {feature_importance_df.head(100)['importance'].sum():.3f} ({feature_importance_df.head(100)['importance'].sum()*100:.1f}%)")

Training Random Forest for feature importance...


Total number of features: 21918
Top 10 features contribute: 0.082 (8.2%)
Top 50 features contribute: 0.286 (28.6%)
Top 100 features contribute: 0.445 (44.5%)


In [9]:
for i in range(1, 27):
    print(f"Top {i*100} features contribute: {feature_importance_df.head(i*100)['importance'].sum():.3f} ({feature_importance_df.head(i*100)['importance'].sum()*100:.1f}%)")
    

Top 100 features contribute: 0.445 (44.5%)
Top 200 features contribute: 0.632 (63.2%)
Top 300 features contribute: 0.736 (73.6%)
Top 400 features contribute: 0.803 (80.3%)
Top 500 features contribute: 0.851 (85.1%)
Top 600 features contribute: 0.887 (88.7%)
Top 700 features contribute: 0.913 (91.3%)
Top 800 features contribute: 0.932 (93.2%)
Top 900 features contribute: 0.946 (94.6%)
Top 1000 features contribute: 0.957 (95.7%)
Top 1100 features contribute: 0.966 (96.6%)
Top 1200 features contribute: 0.973 (97.3%)
Top 1300 features contribute: 0.978 (97.8%)
Top 1400 features contribute: 0.982 (98.2%)
Top 1500 features contribute: 0.986 (98.6%)
Top 1600 features contribute: 0.989 (98.9%)
Top 1700 features contribute: 0.991 (99.1%)
Top 1800 features contribute: 0.993 (99.3%)
Top 1900 features contribute: 0.995 (99.5%)
Top 2000 features contribute: 0.996 (99.6%)
Top 2100 features contribute: 0.997 (99.7%)
Top 2200 features contribute: 0.997 (99.7%)
Top 2300 features contribute: 0.998 (99.8

In [10]:
#print list of top 700 features
print(feature_importance_df.head(700)['feature'].tolist())

['adjfdivm64', 'loadicona', 'getprocaddress', 'lcmapstringa', 'cisqrt', 'updatewindow', 'vbafpexception', 'corexemain', 'heapalloc', 'getconsolecp', 'translatemessage', 'unhandledexceptionfilter', 'vbaobjsetaddref', 'getconsolemode', 'ciatan', 'cicos', 'adjfdivm32', 'vbaaryconstruct2', 'widechartomultibyte', 'freeenvironmentstringsa', 'adjfdivrm32', 'polyline', 'eventsinkqueryinterface', 'tlsfree', 'cilog', 'eventsinkaddref', 'vbastrcopy', 'setunhandledexceptionfilter', 'heapfree', 'getstartupinfoa', 'postquitmessage', 'loadlibrarya', 'loadcursora', 'tlsalloc', 'virtualalloc', 'adjfdivm16i', 'getcurrentthreadid', 'amsgexit', 'initializecriticalsectionandspincount', 'getenvironmentstrings', 'entercriticalsection', 'getsystemtimeasfiletime', 'getstringtypea', 'beginpaint', 'interlockedincrement', 'escape', 'showwindow', 'vbaaryunlock', 'vbavarmove', 'getcurrentprocess', 'getstdhandle', 'vbafpi2', 'adjfdivrm16i', 'vbaexcepthandler', 'vbastrcat', 'releasecapture', 'multibytetowidechar', 'a

In [11]:
#select top 700 features from balanced dataset and write to csv
top_700_features = feature_importance_df.head(700)['feature'].tolist()
X_top_700 = X_resampled_df[top_700_features]
balanced_top_700_df = pd.concat([X_top_700, y_resampled_series.reset_index(drop=True)], axis=1)
balanced_top_700_df.to_csv("balanced_top_700_features.csv", index=False)