# Notebook 2: Feature Engineering

## ML-Based Drone Firmware Tampering Detection System

This notebook covers:
- Loading preprocessed data
- Feature extraction and engineering
- Creating derived features
- Encoding categorical variables
- Feature scaling and normalization
- Feature selection
- Preparing data for ML models


In [None]:
# Import necessary libraries
import pickle
import warnings
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler

# Silence every warning category so library notices do not clutter cell output.
warnings.simplefilter('ignore')

# Seed NumPy's global RNG so steps that draw from it repeat across runs.
SEED = 42
np.random.seed(SEED)

print("Libraries imported successfully!")


Libraries imported successfully!


In [None]:
# Load preprocessed data
# Read the combined preprocessed CSV from the data directory and preview it.
DATA_DIR = Path('../data')
dataset_path = DATA_DIR / 'combined_preprocessed_dataset.csv'
df = pd.read_csv(dataset_path)

print(f"Dataset shape: {df.shape}")
print()
print("First few rows:")
display(df.head())


Dataset shape: (2000, 29)

First few rows:


Unnamed: 0,firmware_id,drone_vendor,drone_model,firmware_version,file_name,file_format,file_size_bytes,sha256_hash,is_signed,signature_type,...,string_count,num_executables,num_scripts,hardcoded_ip_count,hardcoded_url_count,crypto_function_count,boot_time_ms,emulated_syscalls,clean_label,source
0,FW0001,DJI,Phantom 4,v2.2.1,dji_phantom_4_1.fw,fw,11228452,364db767ded5545dcf1bf9646d9ecb2e302a48d2093f02...,1,RSA,...,19856,7,1,1,2,22,826,244,1,clean
1,FW0002,Parrot,Bebop 2,v2.5.1,parrot_bebop_2_2.img,img,11323815,a281375635e987be20018dc13251f4b147cc5cd2b4a992...,1,RSA,...,6090,7,0,1,2,28,1587,230,1,clean
2,FW0003,DJI,Phantom 4,v1.3.1,dji_phantom_4_3.fw,fw,9646997,8033be982a6d8027efa5eba6afabc5d332898c662e5415...,1,ECDSA,...,4339,3,1,0,1,29,1905,286,1,clean
3,FW0004,Parrot,Bebop 2,v3.6.4,parrot_bebop_2_4.bin,bin,9685577,31892069ee3c0ec8dd1ea923d15b6b547026f7b1018b91...,1,RSA,...,10679,3,1,1,2,32,2435,142,1,clean
4,FW0005,Parrot,Anafi,v1.1.2,parrot_anafi_5.fw,fw,18559643,4fc38a10767a72dba4f002236b3d263ba8910e71177a69...,1,RSA,...,2376,2,2,1,0,23,2580,90,1,clean


In [None]:
# Separate features and target
# The label plus the identifier/provenance columns listed here are removed from
# the feature frame; the label itself is kept separately as `target`.
NON_FEATURE_COLUMNS = ['clean_label', 'firmware_id', 'sha256_hash', 'file_name', 'source']

target = df['clean_label'].copy()
# errors='ignore' keeps the drop tolerant of any listed column being absent.
df_features = df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

print(f"Features shape: {df_features.shape}")
print(f"Target shape: {target.shape}")
print(f"\nFeature columns: {df_features.columns.tolist()}")


Features shape: (2000, 24)
Target shape: (2000,)

Feature columns: ['drone_vendor', 'drone_model', 'firmware_version', 'file_format', 'file_size_bytes', 'is_signed', 'signature_type', 'encryption_used', 'compression_used', 'cpu_architecture', 'os_type', 'bootloader_present', 'filesystem_detected', 'entropy_score', 'avg_section_entropy', 'max_section_entropy', 'string_count', 'num_executables', 'num_scripts', 'hardcoded_ip_count', 'hardcoded_url_count', 'crypto_function_count', 'boot_time_ms', 'emulated_syscalls']


In [None]:
# Create derived features
print("Creating derived features...")

# Small constant added to every denominator to avoid division by zero.
# NOTE(review): when a denominator is exactly 0 this yields a very large finite
# ratio rather than inf/NaN, so the later inf/NaN clean-up cell will not touch
# it — confirm that is the intended handling.
EPS = 1e-6

# Inputs that feed more than one derived column.
avg_entropy = df_features['avg_section_entropy']
max_entropy = df_features['max_section_entropy']
n_executables = df_features['num_executables']
boot_time_ms = df_features['boot_time_ms']
size_mb = df_features['file_size_bytes'] / (1024 * 1024)

# Security risk score (higher = more suspicious): weighted sum of hard-coded
# endpoints, executable/script counts and a penalty for unsigned firmware.
risk_score = (
    df_features['hardcoded_ip_count'] * 2 +
    df_features['hardcoded_url_count'] * 2 +
    n_executables * 1.5 +
    df_features['num_scripts'] * 1.5 +
    (1 - df_features['is_signed']) * 3
)

# Keyword order below fixes the order in which the new columns are appended.
df_features = df_features.assign(
    # 1. Entropy ratio features ("entropy_variance" is the max-minus-average gap)
    entropy_ratio=max_entropy / (avg_entropy + EPS),
    entropy_variance=max_entropy - avg_entropy,
    # 2. Security risk score
    security_risk_score=risk_score,
    # 3. File size normalized features
    file_size_mb=size_mb,
    strings_per_mb=df_features['string_count'] / (size_mb + EPS),
    executables_per_mb=n_executables / (size_mb + EPS),
    # 4. Boot time efficiency
    boot_efficiency=size_mb / (boot_time_ms + EPS),
    # 5. Crypto density
    crypto_density=df_features['crypto_function_count'] / (n_executables + EPS),
    # 6. Suspicious activity indicators (0/1 threshold flags)
    high_entropy_flag=(df_features['entropy_score'] > 7.5).astype(int),
    long_boot_flag=(boot_time_ms > 5000).astype(int),
    many_syscalls_flag=(df_features['emulated_syscalls'] > 1000).astype(int),
)

print(f"Features after engineering: {df_features.shape[1]}")
new_feature_names = [name for name in df_features.columns if name not in df.columns]
print(f"New feature names: {new_feature_names}")


Creating derived features...
Features after engineering: 35
New feature names: ['entropy_ratio', 'entropy_variance', 'security_risk_score', 'file_size_mb', 'strings_per_mb', 'executables_per_mb', 'boot_efficiency', 'crypto_density', 'high_entropy_flag', 'long_boot_flag', 'many_syscalls_flag']


In [None]:
# Encode categorical variables
print("Encoding categorical variables...")

categorical_cols = list(df_features.select_dtypes(include='object').columns)
print(f"Categorical columns: {categorical_cols}")

# One LabelEncoder per object-dtype column; each fitted encoder is kept so the
# same string -> integer mapping can be reloaded from disk.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # Values are cast to str before fitting, so missing values become a text category.
    as_text = df_features[col].astype(str)
    df_features[col] = encoder.fit_transform(as_text)
    label_encoders[col] = encoder
    print(f"  Encoded {col}: {len(encoder.classes_)} unique values")

# Persist the fitted encoders next to the other model artifacts.
MODELS_DIR = Path('../models')
MODELS_DIR.mkdir(exist_ok=True)
encoders_path = MODELS_DIR / 'label_encoders.pkl'
with open(encoders_path, 'wb') as fh:
    pickle.dump(label_encoders, fh)
print(f"\nLabel encoders saved to {encoders_path}")


Encoding categorical variables...
Categorical columns: ['drone_vendor', 'drone_model', 'firmware_version', 'file_format', 'signature_type', 'cpu_architecture', 'os_type', 'filesystem_detected']
  Encoded drone_vendor: 4 unique values
  Encoded drone_model: 9 unique values
  Encoded firmware_version: 430 unique values
  Encoded file_format: 3 unique values
  Encoded signature_type: 2 unique values
  Encoded cpu_architecture: 2 unique values
  Encoded os_type: 3 unique values
  Encoded filesystem_detected: 2 unique values

Label encoders saved to ..\models\label_encoders.pkl


In [None]:
# Handle infinite and NaN values
print("Handling infinite and NaN values...")

# Convert +/-inf to NaN first so a single median fill covers both cases.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed later in this notebook.
without_inf = df_features.replace([np.inf, -np.inf], np.nan)
column_medians = without_inf.median()
df_features = without_inf.fillna(column_medians)

remaining_nan = df_features.isnull().sum().sum()
remaining_inf = np.isinf(df_features.select_dtypes(include=[np.number])).sum().sum()
print(f"NaN values remaining: {remaining_nan}")
print(f"Infinite values remaining: {remaining_inf}")


Handling infinite and NaN values...
NaN values remaining: 0
Infinite values remaining: 0


In [None]:
# Feature scaling
print("Scaling features...")

# Use RobustScaler (less sensitive to outliers).
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in this notebook, so test rows influence the scaling statistics.
scaler = RobustScaler().fit(df_features)
X_scaled = scaler.transform(df_features)
X_scaled_df = pd.DataFrame(
    X_scaled,
    index=df_features.index,
    columns=df_features.columns,
)

# Persist the fitted scaler alongside the other model artifacts.
scaler_path = MODELS_DIR / 'feature_scaler.pkl'
with open(scaler_path, 'wb') as fh:
    pickle.dump(scaler, fh)
print(f"Scaler saved to {scaler_path}")

print(f"\nScaled features shape: {X_scaled_df.shape}")
print("Scaled features statistics:")
display(X_scaled_df.describe())


Scaling features...
Scaler saved to ..\models\feature_scaler.pkl

Scaled features shape: (2000, 35)
Scaled features statistics:


Unnamed: 0,drone_vendor,drone_model,firmware_version,file_format,file_size_bytes,is_signed,signature_type,encryption_used,compression_used,cpu_architecture,...,entropy_variance,security_risk_score,file_size_mb,strings_per_mb,executables_per_mb,boot_efficiency,crypto_density,high_entropy_flag,long_boot_flag,many_syscalls_flag
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,-0.2395,0.1658,-0.0028,-0.0145,-0.002316,0.0,0.2495,0.481,-0.483,0.0,...,0.046257,0.173808,-0.002316,0.5842465,0.500918,0.339773,0.3097216,0.0,0.3355,0.3065
std,0.563958,0.543229,0.58353,0.403883,0.57046,0.500125,0.432832,0.499764,0.499836,0.500125,...,0.720554,0.540192,0.57046,1.657621,1.562014,0.93641,1.130695,0.500125,0.472283,0.461155
min,-1.0,-0.6,-1.011765,-0.5,-0.986896,-0.5,0.0,0.0,-1.0,-0.5,...,-1.664286,-0.646154,-0.986896,-0.7278539,-0.65815,-0.623402,-0.9484529,-0.5,0.0,0.0
25%,-0.5,-0.4,-0.509412,-0.5,-0.497851,-0.5,0.0,0.0,-1.0,-0.5,...,-0.492857,-0.323077,-0.497851,-0.3338793,-0.333525,-0.307288,-0.3888816,-0.5,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.166576e-17,0.0,0.0,-8.272463000000001e-17,0.0,0.0,0.0
75%,0.5,0.6,0.490588,0.5,0.502149,0.5,0.0,1.0,0.0,0.5,...,0.507143,0.676923,0.502149,0.6661207,0.666475,0.692712,0.6111184,0.5,1.0,1.0
max,0.5,1.0,1.007059,0.5,0.95926,0.5,1.0,1.0,0.0,0.5,...,2.178571,1.384615,0.95926,11.47331,13.45831,4.941761,6.383853,0.5,1.0,1.0


In [None]:
# Feature selection using mutual information
print("Performing feature selection...")

# mutual_info_classif is stochastic (its `random_state` parameter controls the
# random number generation it uses). Pin the seed on the score function itself
# so the selection no longer depends on the global NumPy RNG state, and
# therefore not on which cells ran before this one.
MI_RANDOM_STATE = 42
mi_score_func = partial(mutual_info_classif, random_state=MI_RANDOM_STATE)

# Select top features using mutual information.
# NOTE(review): the selector is fitted on all rows, before the train/test split
# performed later in this notebook.
n_features = min(30, X_scaled_df.shape[1])  # Select top 30 features
selector = SelectKBest(score_func=mi_score_func, k=n_features)
X_selected = selector.fit_transform(X_scaled_df, target)

# Kept in original column order so the names line up with X_selected's columns.
selected_features = X_scaled_df.columns[selector.get_support()].tolist()

print(f"Selected {len(selected_features)} features out of {X_scaled_df.shape[1]}")
print(f"\nTop selected features:")
# Report the selected features ranked by their mutual-information score, so the
# printed numbering reflects importance rather than column position.
mi_scores = pd.Series(selector.scores_, index=X_scaled_df.columns)
ranked_scores = mi_scores[selected_features].sort_values(ascending=False)
for i, (feat, score) in enumerate(ranked_scores.items(), 1):
    print(f"{i}. {feat} (MI={score:.4f})")

# Save selector
selector_path = MODELS_DIR / 'feature_selector.pkl'
with open(selector_path, 'wb') as f:
    pickle.dump(selector, f)
print(f"\nFeature selector saved to {selector_path}")


Performing feature selection...
Selected 30 features out of 35

Top selected features:
1. drone_model
2. is_signed
3. signature_type
4. encryption_used
5. compression_used
6. cpu_architecture
7. os_type
8. bootloader_present
9. filesystem_detected
10. entropy_score
11. avg_section_entropy
12. max_section_entropy
13. num_executables
14. num_scripts
15. hardcoded_ip_count
16. hardcoded_url_count
17. crypto_function_count
18. boot_time_ms
19. emulated_syscalls
20. entropy_ratio
21. entropy_variance
22. security_risk_score
23. file_size_mb
24. strings_per_mb
25. executables_per_mb
26. boot_efficiency
27. crypto_density
28. high_entropy_flag
29. long_boot_flag
30. many_syscalls_flag

Feature selector saved to ..\models\feature_selector.pkl


In [None]:
# Create final feature dataset
# Re-wrap the selected feature matrix as a labelled frame that shares the
# feature frame's index; the target is exposed as a plain array.
X_final = pd.DataFrame(
    data=X_selected,
    index=df_features.index,
    columns=selected_features,
)
y_final = target.values

print(f"Final feature matrix shape: {X_final.shape}")
print(f"Final target shape: {y_final.shape}")
print()
print("Target distribution:")
class_counts = pd.Series(y_final).value_counts()
print(class_counts)


Final feature matrix shape: (2000, 30)
Final target shape: (2000,)

Target distribution:
1    1000
0    1000
Name: count, dtype: int64


In [None]:
# Split data into train and test sets
# train_test_split comes from the notebook's import cell at the top.
# stratify=y_final keeps the class proportions the same in both partitions;
# random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42, stratify=y_final
)

# Sanity checks: every row lands in exactly one partition and features/labels
# stay aligned within each partition.
assert len(X_train) + len(X_test) == len(X_final), "split lost or duplicated rows"
assert len(X_train) == len(y_train), "train features/labels misaligned"
assert len(X_test) == len(y_test), "test features/labels misaligned"

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"\nTraining label distribution:")
print(pd.Series(y_train).value_counts())
print(f"\nTest label distribution:")
print(pd.Series(y_test).value_counts())


Training set: 1600 samples
Test set: 400 samples

Training label distribution:
1    800
0    800
Name: count, dtype: int64

Test label distribution:
0    200
1    200
Name: count, dtype: int64


In [None]:
# Save processed datasets
DATA_DIR = Path('../data')


def _write_csv(data, file_name, **to_csv_kwargs):
    """Write `data` to DATA_DIR/file_name without the index; return the path written."""
    out_path = DATA_DIR / file_name
    data.to_csv(out_path, index=False, **to_csv_kwargs)
    return out_path


# Label arrays are wrapped in a Series and written under a single 'label' column.
label_header = {'header': ['label']}

split_paths = [
    _write_csv(X_train, 'X_train.csv'),
    _write_csv(X_test, 'X_test.csv'),
    _write_csv(pd.Series(y_train), 'y_train.csv', **label_header),
    _write_csv(pd.Series(y_test), 'y_test.csv', **label_header),
]

print("Processed datasets saved:")
for out_path in split_paths:
    print(f"  - {out_path}")

# Also save full dataset for time-series models
full_paths = [
    _write_csv(X_final, 'X_final.csv'),
    _write_csv(pd.Series(y_final), 'y_final.csv', **label_header),
]

print("\nFull dataset saved:")
for out_path in full_paths:
    print(f"  - {out_path}")


Processed datasets saved:
  - ..\data\X_train.csv
  - ..\data\X_test.csv
  - ..\data\y_train.csv
  - ..\data\y_test.csv

Full dataset saved:
  - ..\data\X_final.csv
  - ..\data\y_final.csv
