<a href="https://colab.research.google.com/github/Moneymarrr38/AssignmentGithub/blob/main/ClassifyDroneCommoML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import all necessities
!pip install -q imbalanced-learn xgboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Be sure to mount Drive to Colab and use correct path (Had to open the shared dataset in googlesheet,
# download the googlesheet to computer as .csv file, then upload the file to Google Drive)
data_path = '/content/drive/MyDrive/drone_communication_dataset - drone_communication_dataset.csv'

df = pd.read_csv(data_path)

In [None]:
print("Dataset Shape:", df.shape)
print("\nColumns:\n", df.columns.tolist())
print("\nFirst 5 rows:\n", df.head())
print("\nLabel Distribution:\n", df['Target Label'].value_counts(dropna=False))

# Define Benign/Normal Communications as 0 and Malicious/Anomalous as 1
def map_to_binary(label):
  if pd.isna(label):
    return None
  benign_patterns = ['Normal Communication']
  if str(label).strip() in benign_patterns:
    return 0
  else:
    return 1

df['binary_label'] = df['Target Label'].apply(map_to_binary)

Dataset Shape: (52585, 29)

Columns:
 ['Timestamp', 'Source Drone ID', 'Destination ID', 'Packet Size', 'Transmission Rate', 'Signal Strength', 'Error Rate', 'Encryption Status', 'Protocol Type', 'Response Time', 'Battery Level', 'GPS Coordinates', 'Payload Type', 'Packet Loss Rate', 'Connection Duration', 'Round Trip Time', 'Hop Count', 'Jitter', 'Drone Velocity', 'Signal-to-Noise Ratio', 'Data Throughput', 'Port Number', 'Communication Interval', 'Control Command Frequency', 'Drone Altitude', 'CPU Usage', 'Memory Utilization', 'Distance to Base Station', 'Target Label']

First 5 rows:
             Timestamp Source Drone ID Destination ID  Packet Size  \
0  2018-01-01 0:00:00              D1             D1         2000   
1  2018-01-01 1:00:00              D3             D2          500   
2  2018-01-01 2:00:00              D2    BaseStation          500   
3  2018-01-01 3:00:00              D1             D3         2000   
4  2018-01-01 4:00:00              D3             D2        

In [None]:
# Check the new distribution
print("\nBinary Label Distribution:")
print(df['binary_label'].value_counts(normalize=True))
print(df['binary_label'].value_counts())

y = df['binary_label'].astype(int)


Binary Label Distribution:
binary_label
0    0.704364
1    0.295636
Name: proportion, dtype: float64
binary_label
0    37039
1    15546
Name: count, dtype: int64


In [None]:
# Split GPS Cooridinates column
gps_column = 'GPS Coordinates'
df[['Latitude', 'Longitude']] = df[gps_column].str.split(',', expand=True)
df['Latitude'] = df['Latitude'].str.strip().astype(float, errors='ignore')
df['Longitude'] = df['Longitude'].str.strip().astype(float, errors='ignore')

# New Attack Types
new_attacks = ['Replay Attack', 'GeoSpoofing', 'Jamming', 'Man-in-the-Middle']

# Sample anomalous rows to base new data on
anom_df = df[df['binary_label']==1]
num_new_per_type = len(anom_df) // len(new_attacks) # Balance addition

expanded_dfs = [df.copy()]
for attack in new_attacks:
  new_df = anom_df.sample(num_new_per_type, replace=True).copy()
  new_df['Target Label'] = attack
  new_df['binary_label'] = 1

  # Perturb features for realism
  if attack == 'Replay Attack':
    new_df['Round Trip Time'] *= 1.5  # Increased Delay
    new_df['Packet Loss Rate'] += np.random.uniform(0.1, 0.3, len(new_df))
  elif attack == 'GeoSpoofing':
    if 'Latitude' in new_df.columns:
      new_df['Latitude'] += np.random.uniform(-0.5, 0.5, len(new_df))
      new_df['Longitude'] += np.random.uniform(-0.5, 0.5, len(new_df))
    new_df['Distance to Base Station'] *= 1.2 if 'Distance to Base Station' in new_df else 0 # Increased Distance
  elif attack == 'Jamming':
    new_df['Signal Strength'] -= np.random.uniform(10, 20, len(new_df)) # Weaker Signal
    new_df['Packet Loss Rate'] += np.random.uniform(0.4, 0.6, len(new_df))
  elif attack == 'Man-in-the-Middle':
    new_df['Encryption Status'] = 0 if 'Encryption Status' in new_df else 0 # Assume unencrypted
    new_df['Hop Count'] += 1 if 'Hop Count'in new_df else 0

  expanded_dfs.append(new_df)

df_exp = pd.concat(expanded_dfs, ignore_index=True)
print("\nExpanded Shape:", df_exp.shape)
print("New Target Label Distribution:\n", df_exp['Target Label'].value_counts())

# Save expanded
df_exp.to_csv(data_path + 'expanded_drone_data.csv', index=False)


Expanded Shape: (68129, 32)
New Target Label Distribution:
 Target Label
Normal Communication        37039
DDoS Attack                  7843
Malware Infection            5145
Replay Attack                3886
Jamming                      3886
GeoSpoofing                  3886
Man-in-the-Middle            3886
Anomaly/Unusual Behavior     2558
Name: count, dtype: int64


In [None]:
# Create Alert Type (map from Target Label)
alert_map = {
    'Normal Communication': 'Normal',
    'DDoS Attack' : 'DDos',
    'Malware Infection' : 'Malware',
    'Replay Attack': 'Replay',
    'Jamming': 'Jamming',
    'GeoSpoofing': 'GeoSpoofing',
    'Man-in-the-Middle': 'MITM',
    'Anomaly/Unusual Behavior': 'Anomaly'

}
df_exp['Alert Type'] = df_exp['Target Label'].map(alert_map)

# Create Severity
severity_map = {
    'Normal': 'None',
    'DDos': 'Critical',
    'Malware': 'High',
    'Replay': 'Medium',
    'Jamming': 'High',
    'GeoSpoofing': 'High',
    'MITM': 'Critical',
    'Anomaly': 'Medium'
}
df_exp['Severity'] = df_exp['Alert Type'].map(severity_map)

# Logs Table
logs_columns = [
    'Timestamp',
    'Source Drone ID',
    'Destination ID',
    'Alert Type',
    'Severity',
    'Response Time',
    'Error Rate',
    'Packet Loss Rate',
    'Signal Strength',
    'Battery Level',
    'Protocol Type',
    'Port Number'
]
logs_df = df_exp[logs_columns]
print("\nLogs Table Sample:\n", logs_df.head())

# Temporal Table
temporal_columns = [
    'Timestamp',
    'Alert Type',
    'Connection Duration'
]
temporal_df = df_exp[temporal_columns]
print("\nTemporal Table Sample", temporal_df.head())

# Attack Features Table (lookup)
attack_features = pd.DataFrame({
    'Alert Type': list(severity_map.keys())[1:],
    'Severity': list(severity_map.values())[1:],
    'Description': [
        'Flooding attack overwhelming network with traffic.',
        'Malicious software infecting drone systems.',
        'Captured packets replayed to deceive systems.',
        'Interference disrupting signals.',
        'Falsified GPS signals to mislead position.',
        'Interceptor eavesdrops/alters communications',
        'Unusual patters not fitting known attacks.'
    ],
    'Indicators': [
        'High Packet Loss, increased Round Trip Time.',
        'Unusual CPU/Memory usage, anomalous payloads.',
        'Duplicated timestamps, delayed responses.',
        'Low Signal Strength, high Error Rate.',
        'Abnormal Latitude/Longitude shifts.',
        'Unexpected Hop Count, unecrypted data.',
        'Deviations in features like Jitter, Hop Count.'
    ]
})

print("\nAttack Features Table:\n", attack_features)

# Save tables
logs_df.to_csv(data_path + 'logs_table.csv', index=False)
temporal_df.to_csv(data_path + 'temporal_table.csv', index=False)
attack_features.to_csv(data_path + 'attack_features.csv', index=False)


Logs Table Sample:
             Timestamp Source Drone ID Destination ID Alert Type  Severity  \
0  2018-01-01 0:00:00              D1             D1     Normal      None   
1  2018-01-01 1:00:00              D3             D2     Normal      None   
2  2018-01-01 2:00:00              D2    BaseStation       DDos  Critical   
3  2018-01-01 3:00:00              D1             D3     Normal      None   
4  2018-01-01 4:00:00              D3             D2    Anomaly    Medium   

   Response Time  Error Rate  Packet Loss Rate  Signal Strength  \
0            100        0.05              0.00       -48.674464   
1             10        0.01              0.00       -51.596394   
2           1000        0.01              0.00       -44.884766   
3             10        0.01              0.00       -58.707466   
4            100        0.01              0.05       -44.503895   

   Battery Level Protocol Type  Port Number  
0      85.157405           UDP          443  
1      66.445555     

In [None]:
# Features from Logs
numeric_feats = [
    'Packet Size',  # bytes
    'Transmission Rate',  # packets/second
    'Response Time',  # ms
    'Battery Level',  # %
    'Latitude', # geographical location
    'Longitude',  # geographical location
    'Error Rate', # ratio of corrupted packets
    'Encryption Status',  # encrypted=1, unencrypted=0
    'Packet Loss Rate', # %
    'Connection Duration',  # seconds
    'Round Trip Time',  # ms
    'Hop Count',  # number of hops
    'Jitter', # variability in packet arrival time
    'Drone Velocity', # m/s
    'Signal-to-Noise Ratio',  # ratio of signal strength to background noise
    'Data Throughput',  # kbps
    'Port Number',  # port number used for communication
    'Signal Strength',  # dBm
    'Communication Interval', # interval between consecutive data packets
    'Control Command Frequency',  # Hz
    'Drone Altitude', # m
    'CPU Usage',  # %
    'Memory Utilization', # %
    'Distance to Base Station'  # m
]
categorical_feats = [
    'Protocol Type',
    'Payload Type'
] # One-hot

# Encode categorical
if categorical_feats:
  ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
  cat_encode = ohe.fit_transform(df_exp[categorical_feats])
  cat_df = pd.DataFrame(cat_encode, columns=ohe.get_feature_names_out(), index=df_exp.index)
  X = pd.concat([df_exp[numeric_feats], cat_df], axis=1)
else:
  X = df_exp[numeric_feats]

# Multi-class target: Alert Type (encode for XGBoost/SVM)
le = LabelEncoder()
y_multi = le.fit_transform(df_exp['Alert Type'])
y_binary = df_exp['binary_label']

# Impute/Scale
X = X.fillna(X.median())
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split/SMOTE
X_train, X_test, y_train_bin, y_test_bin = train_test_split(X_scaled, y_binary, test_size=0.2, random_state=42, stratify=y_binary)
X_train_m, X_test_m, y_train_multi, y_test_multi = train_test_split(X_scaled, y_multi, test_size=0.2, random_state=42, stratify=y_multi)
smote = SMOTE(random_state=42)
X_train_bin_res, y_train_bin_res = smote.fit_resample(X_train, y_train_bin)
X_train_multi_res, y_train_multi_res = smote.fit_resample(X_train_m, y_train_multi)

# Models
models = {
    'SVM': SVC(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss')
}

# Binary
binary_results = {}
for name, model in models.items():
  if name == 'XGBoost':
    model = XGBClassifier(random_state=42, eval_metric='logloss')
  model.fit(X_train_bin_res, y_train_bin_res)
  y_pred_bin = model.predict(X_test)
  binary_results[name] = {
      'model' : model,
      'report': classification_report(y_test_bin, y_pred_bin, output_dict=True),
      'roc_auc': roc_auc_score(y_test_bin, y_pred_bin) if len(set(y_test_bin)) else None
  }
  print(f'\n{name} Binary Report:\n', classification_report(y_test_bin, y_pred_bin, target_names=['Benign', 'Malicious']))

# Multi-Class
multi_results = {}
for name, model in models.items():
  if name == 'XGBoost':
    model = XGBClassifier(random_state=42, eval_metric='mlogloss')
  model.fit(X_train_multi_res, y_train_multi_res)
  y_pred_multi = model.predict(X_test_m)
  multi_results[name] = {
      'model' : model,
      'report': classification_report(y_test_multi, y_pred_multi, output_dict=True)
  }
  print(f'\n{name} Multi-Class Report:\n', classification_report(y_test_multi, y_pred_multi, target_names=le.classes_))


SVM Binary Report:
               precision    recall  f1-score   support

      Benign       0.66      0.83      0.74      7408
   Malicious       0.71      0.50      0.59      6218

    accuracy                           0.68     13626
   macro avg       0.69      0.67      0.66     13626
weighted avg       0.69      0.68      0.67     13626


Random Forest Binary Report:
               precision    recall  f1-score   support

      Benign       0.72      0.91      0.81      7408
   Malicious       0.85      0.58      0.69      6218

    accuracy                           0.76     13626
   macro avg       0.79      0.75      0.75     13626
weighted avg       0.78      0.76      0.75     13626


Logistic Regression Binary Report:
               precision    recall  f1-score   support

      Benign       0.64      0.79      0.71      7408
   Malicious       0.65      0.47      0.55      6218

    accuracy                           0.65     13626
   macro avg       0.65      0.63      