In [None]:
# SECTION 1: INSTALLATIONS & IMPORTS
# ---------------------------------
from google.colab import drive
import pandas as pd

In [None]:
# MOUNT GOOGLE COLAB
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# File path to your dataset
FILE_PATH = "/content/drive/My Drive/cybersecurity_attacks.csv"

In [None]:
print("✅ Dataset loaded successfully!")
print(f"📊 Total records: {len(df)}")

# Check if running in Jupyter to use display(), else use print()
try:
    from IPython.display import display
    display(df.head(10))  # Works in Jupyter Notebook
except ImportError:
    print(df.head(10).to_string())  # Works in standard Python scripts

✅ Dataset loaded successfully!
📊 Total records: 40000


Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,...,Malware_Probability,Alert_Pred,Alert_Prediction,Alert_Probability,Attack_Encoded,Predicted_Attack,Action_Encoded,Recommended_Action,Severity_Encoded,Predicted_Severity
0,30-05-2023 06:33,103.216.15.12,84.9.164.252,31225,17616,ICMP,503,Data,HTTP,Qui natus odio asperiores nam. Optio nobis ius...,...,0.74,0,0,0.17,2,Malware,2,Logged,1,Low
1,26-08-2020 07:08,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,...,0.84,0,0,0.11,2,Malware,0,Blocked,1,Low
2,13-11-2022 08:23,63.79.210.48,198.219.82.17,16811,53600,UDP,306,Control,HTTP,Perferendis sapiente vitae soluta. Hic delectu...,...,0.87,1,1,0.75,0,DDoS,1,Ignored,1,Low
3,02-07-2023 10:38,163.42.196.10,101.228.192.255,20018,32534,UDP,385,Data,HTTP,Totam maxime beatae expedita explicabo porro l...,...,0.1,1,1,0.8,2,Malware,0,Blocked,2,Medium
4,16-07-2023 13:11,71.166.185.76,189.243.174.238,6131,26646,TCP,1462,Data,DNS,Odit nesciunt dolorem nisi iste iusto. Animi v...,...,0.15,1,1,0.81,0,DDoS,0,Blocked,1,Low
5,28-10-2022 13:14,198.102.5.160,147.190.155.133,17430,52805,UDP,1423,Data,HTTP,Repellat quas illum harum fugit incidunt exerc...,...,0.09,0,0,0.1,2,Malware,2,Logged,2,Medium
6,16-05-2022 17:55,97.253.103.59,77.16.101.53,26562,17416,TCP,379,Data,DNS,Qui numquam inventore repellat ratione fugit o...,...,0.2,0,0,0.19,0,DDoS,1,Ignored,0,High
7,12-02-2023 07:13,11.48.99.245,178.157.14.116,34489,20396,ICMP,1022,Data,DNS,Amet libero optio quidem praesentium libero. E...,...,0.87,1,1,0.7,1,Intrusion,2,Logged,0,High
8,27-06-2023 11:02,49.32.208.167,72.202.237.9,56296,20857,TCP,1281,Control,FTP,Veritatis nihil amet atque molestias aperiam m...,...,0.74,1,1,0.81,1,Intrusion,0,Blocked,0,High
9,15-08-2021 22:29,114.109.149.113,160.88.194.172,37918,50039,UDP,224,Data,HTTP,Consequatur ipsum autem reprehenderit quae. Do...,...,0.17,1,1,0.82,2,Malware,0,Blocked,2,Medium


In [None]:
# Top 10 Suspicious IP Addresses
df = df.dropna(subset=['Anomaly Scores'])

# Identify IPs with high anomaly scores, malware, and alerts
suspicious_ips = df[
    (df['Anomaly Scores'] > df['Anomaly Scores'].quantile(0.95)) |
    (df['Malware Indicators'].notna()) |
    (df['Alerts/Warnings'].notna())
]['Source IP Address'].value_counts().reset_index()

# Rename columns
suspicious_ips.columns = ['IP Address', 'Threat Count']

# Sort by threat count
suspicious_ips = suspicious_ips.sort_values('Threat Count', ascending=False, ignore_index=True)

# Display top 10 suspicious IPs
print("🔍 Top 10 Suspicious IP Addresses:")
print(suspicious_ips.head(10))  # Use display() if in Jupyter Notebook

🔍 Top 10 Suspicious IP Addresses:
       IP Address  Threat Count
0   177.21.83.200             1
1   14.102.21.108             1
2   103.216.15.12             1
3  78.199.217.198             1
4    63.79.210.48             1
5   163.42.196.10             1
6   71.166.185.76             1
7    11.48.99.245             1
8   24.150.253.66             1
9    138.156.5.40             1


In [None]:
# Malware Prediction Results
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Encode categorical data for Malware presence
df['Malware_Pred'] = df['Malware Indicators'].notna().astype(int)

# Select features and target
X = df[['Source Port', 'Destination Port', 'Packet Length', 'Anomaly Scores']]
y = df['Malware_Pred']

# Handle missing values in features
X = X.fillna(0)  # Replace NaN with 0 or use mean/median if needed
X = X.astype(float)  # Ensure numeric types

# Train Random Forest model
model_malware = RandomForestClassifier()
model_malware.fit(X, y)

# Predict malware presence and probability
df['Malware_Prediction'] = model_malware.predict(X)
df['Malware_Probability'] = model_malware.predict_proba(X)[:, 1]

# Display results
print("🦠 Malware Prediction Results:")
print(df[['Source IP Address', 'Malware_Probability', 'Malware Indicators']].head(10))

🦠 Malware Prediction Results:
  Source IP Address  Malware_Probability Malware Indicators
0     103.216.15.12                 0.79       IoC Detected
1    78.199.217.198                 0.73       IoC Detected
2      63.79.210.48                 0.82       IoC Detected
3     163.42.196.10                 0.21                NaN
4     71.166.185.76                 0.23                NaN
5     198.102.5.160                 0.15                NaN
6     97.253.103.59                 0.19                NaN
7      11.48.99.245                 0.87       IoC Detected
8     49.32.208.167                 0.75       IoC Detected
9   114.109.149.113                 0.14                NaN


In [None]:
# Alert/Warning Prediction Results
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Encode Alerts/Warnings as a binary feature
df['Alert_Pred'] = df['Alerts/Warnings'].notna().astype(int)

# Define feature set (ensure relevant features for alerts)
X_alert = df[['Source Port', 'Destination Port', 'Packet Length', 'Anomaly Scores']]

# Handle missing values in features
X_alert = X_alert.fillna(0)  # Replace NaN with 0 or use a more suitable method
X_alert = X_alert.astype(float)  # Ensure numeric data

# Train Random Forest model for alert prediction
model_alert = RandomForestClassifier()
model_alert.fit(X_alert, df['Alert_Pred'])

# Predict alerts/warnings
df['Alert_Prediction'] = model_alert.predict(X_alert)
df['Alert_Probability'] = model_alert.predict_proba(X_alert)[:, 1]

# Display results
print("🚨 Alert/Warning Prediction Results:")
print(df[['Source IP Address', 'Alert_Probability', 'Alerts/Warnings']].head(10))

0     103.216.15.12               0.14              NaN
1    78.199.217.198               0.14              NaN
2      63.79.210.48               0.75  Alert Triggered
3     163.42.196.10               0.76  Alert Triggered
4     71.166.185.76               0.83  Alert Triggered
5     198.102.5.160               0.18              NaN
6     97.253.103.59               0.12              NaN
7      11.48.99.245               0.76  Alert Triggered
8     49.32.208.167               0.88  Alert Triggered
9   114.109.149.113               0.79  Alert Triggered


In [None]:
# Predicted Attack Types
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Encode 'Attack Type' with missing values handled
le_attack = LabelEncoder()
df['Attack_Encoded'] = le_attack.fit_transform(df['Attack Type'].fillna('Normal'))

# Define relevant features for attack prediction
X_attack = df[['Source Port', 'Destination Port', 'Packet Length', 'Anomaly Scores']]

# Handle missing values
X_attack = X_attack.fillna(0)  # Replace NaN with 0 or an appropriate value
X_attack = X_attack.astype(float)  # Ensure numeric data

# Train Random Forest model for attack type prediction
model_attack = RandomForestClassifier()
model_attack.fit(X_attack, df['Attack_Encoded'])

# Predict attack types
df['Predicted_Attack'] = le_attack.inverse_transform(model_attack.predict(X_attack))

# Display results
print("⚔️ Predicted Attack Types:")
print(df[['Source IP Address', 'Predicted_Attack', 'Attack Type']].head(10))

⚔️ Predicted Attack Types:
  Source IP Address Predicted_Attack Attack Type
0     103.216.15.12          Malware     Malware
1    78.199.217.198          Malware     Malware
2      63.79.210.48             DDoS        DDoS
3     163.42.196.10          Malware     Malware
4     71.166.185.76             DDoS        DDoS
5     198.102.5.160          Malware     Malware
6     97.253.103.59             DDoS        DDoS
7      11.48.99.245        Intrusion   Intrusion
8     49.32.208.167        Intrusion   Intrusion
9   114.109.149.113          Malware     Malware


In [None]:
# Recommended Actions
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Encode 'Action Taken' with missing values handled
le_action = LabelEncoder()
df['Action_Encoded'] = le_action.fit_transform(df['Action Taken'].fillna('None'))

# Define relevant features for action prediction
X_action = df[['Source Port', 'Destination Port', 'Packet Length', 'Anomaly Scores']]

# Handle missing values
X_action = X_action.fillna(0)  # Replace NaN with 0 or an appropriate value
X_action = X_action.astype(float)  # Ensure numeric data

# Train Random Forest model for action prediction
model_action = RandomForestClassifier()
model_action.fit(X_action, df['Action_Encoded'])

# Predict recommended actions
df['Recommended_Action'] = le_action.inverse_transform(model_action.predict(X_action))

# Display results
print("🛡️ Recommended Actions:")
print(df[['Source IP Address', 'Recommended_Action', 'Action Taken']].head(10))

🛡️ Recommended Actions:
  Source IP Address Recommended_Action Action Taken
0     103.216.15.12             Logged       Logged
1    78.199.217.198            Blocked      Blocked
2      63.79.210.48            Ignored      Ignored
3     163.42.196.10            Blocked      Blocked
4     71.166.185.76            Blocked      Blocked
5     198.102.5.160             Logged       Logged
6     97.253.103.59            Ignored      Ignored
7      11.48.99.245             Logged       Logged
8     49.32.208.167            Blocked      Blocked
9   114.109.149.113            Blocked      Blocked


In [None]:
#Predicted Severity Levels
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Encode 'Severity Level' with missing values handled
le_severity = LabelEncoder()
df['Severity_Encoded'] = le_severity.fit_transform(df['Severity Level'].fillna('Low'))

# Define relevant features for severity prediction
X_severity = df[['Source Port', 'Destination Port', 'Packet Length', 'Anomaly Scores']]

# Handle missing values
X_severity = X_severity.fillna(0)  # Replace NaN with 0 or an appropriate value
X_severity = X_severity.astype(float)  # Ensure numeric data

# Train Random Forest model for severity prediction
model_severity = RandomForestClassifier()
model_severity.fit(X_severity, df['Severity_Encoded'])

# Predict severity levels
df['Predicted_Severity'] = le_severity.inverse_transform(model_severity.predict(X_severity))

# Display results
print("⚠️ Predicted Severity Levels:")
print(df[['Source IP Address', 'Predicted_Severity', 'Severity Level']].head(10))

⚠️ Predicted Severity Levels:
  Source IP Address Predicted_Severity Severity Level
0     103.216.15.12                Low            Low
1    78.199.217.198                Low            Low
2      63.79.210.48                Low            Low
3     163.42.196.10             Medium         Medium
4     71.166.185.76                Low            Low
5     198.102.5.160             Medium         Medium
6     97.253.103.59               High           High
7      11.48.99.245               High           High
8     49.32.208.167               High           High
9   114.109.149.113             Medium         Medium


In [None]:
# FINAL PREDICTION REPORT
# Create Final Prediction Report
final_report = df[[
    'Source IP Address',
    'Malware_Probability',
    'Alert_Probability',
    'Predicted_Attack',
    'Recommended_Action',
    'Predicted_Severity'
]].copy()  # Ensures modifications don't affect the original dataframe

# Handle missing values
final_report = final_report.fillna("Unknown")  # Replace NaN with "Unknown" for categorical or 0 for numeric

# Sort by Malware Probability (Highest Threat First)
final_report = final_report.sort_values('Malware_Probability', ascending=False)

# Display Report
print("📜 FINAL PREDICTION REPORT:")
print(final_report.head(20).to_string())  # Ensures compatibility outside Jupyter

📜 FINAL PREDICTION REPORT:
      Source IP Address  Malware_Probability  Alert_Probability Predicted_Attack Recommended_Action Predicted_Severity
22982      79.232.53.71                 0.99               0.80        Intrusion            Ignored               High
15333      213.4.85.216                 0.99               0.81             DDoS            Ignored                Low
3881    212.103.246.100                 0.98               0.15          Malware            Blocked               High
38246    211.236.66.150                 0.98               0.13             DDoS             Logged                Low
34448     89.29.184.242                 0.97               0.72        Intrusion            Blocked                Low
22632   116.153.183.228                 0.97               0.16             DDoS            Ignored               High
10453    136.29.185.239                 0.97               0.77             DDoS             Logged                Low
32331     67.15.206.1