# Preliminary Analysis
This notebook is dedicated to answering questions applied to various filtrations of the focused dataset. It's less focused on narrative organization and more geared towards providing efficient analysis.

In [107]:
import os
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plot
import seaborn as sbn
from pprint import pprint
from typing import Dict, Tuple

# Import utility functions
src_path = os.path.abspath(os.path.join('..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from utils import *
from classes import Plotter

df = pd.read_parquet(path='../data/processed/composite/dataset_focused.parquet')

## Means & Deviations

In [102]:
df[df['date_public'].isnull()]

Unnamed: 0,cve_id,date_public,origin,cvss,cvss_severity,cvss_src,poc_code,verified,exploit_count,days_to_poc_exploit,...,percentile_0,exploitation_date_30,epss_30,percentile_30,exploitation_date_60,epss_60,percentile_60,change_0_to_30,change_30_to_60,change_0_to_60
2178,CVE-2004-6768,NaT,poc,,UNKNOWN,,,,1.0,,...,,2024-08-08 13:01:41+00:00,,,2024-09-07 13:01:41+00:00,,,,,
11244,CVE-2012-5664,NaT,poc,,UNKNOWN,,,,1.0,,...,,2013-02-02 10:07:27+00:00,,,2013-03-04 10:07:27+00:00,,,,,
12745,CVE-2014-0291,NaT,poc,,UNKNOWN,,,,1.0,,...,,2015-04-22 10:18:08+00:00,,,2015-05-22 10:18:08+00:00,,,,,
14186,CVE-2014-8729,NaT,poc,,UNKNOWN,,,,1.0,,...,,2015-04-19 22:41:03+00:00,,,2015-05-19 22:41:03+00:00,,,,,
14240,CVE-2014-91371,NaT,poc,,UNKNOWN,,,,1.0,,...,,2024-11-10 15:29:01+00:00,,,2024-12-10 15:29:01+00:00,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55462,CVE-2024-50961,NaT,poc,,UNKNOWN,,,,1.0,,...,,2024-12-13 07:43:06+00:00,,,2025-01-12 07:43:06+00:00,,,,,
55463,CVE-2024-50962,NaT,poc,,UNKNOWN,,,,1.0,,...,,2024-12-13 07:47:38+00:00,,,2025-01-12 07:47:38+00:00,,,,,
55464,CVE-2024-50964,NaT,poc,,UNKNOWN,,,,1.0,,...,,2024-12-13 07:49:09+00:00,,,2025-01-12 07:49:09+00:00,,,,,
55518,CVE-2024-51435,NaT,poc,,UNKNOWN,,,,1.0,,...,,2024-11-24 23:28:09+00:00,,,2024-12-24 23:28:09+00:00,,,,,


In [105]:
df[df['verified'] == True]

Unnamed: 0,cve_id,date_public,origin,cvss,cvss_severity,cvss_src,poc_code,verified,exploit_count,days_to_poc_exploit,...,percentile_0,exploitation_date_30,epss_30,percentile_30,exploitation_date_60,epss_60,percentile_60,change_0_to_30,change_30_to_60,change_0_to_60
2,CVE-1999-0025,1997-07-16 04:00:00+00:00,xdb,7.2,HIGH,V2,True,True,1.0,-54.0,...,,1997-06-23 00:00:00+00:00,,,1997-07-23 00:00:00+00:00,,,,,
4,CVE-1999-0034,1997-05-29 04:00:00+00:00,xdb,7.2,HIGH,V2,True,True,4.0,-363.0,...,,1996-07-01 00:00:00+00:00,,,1996-07-31 00:00:00+00:00,,,,,
6,CVE-1999-0041,1997-02-13 05:00:00+00:00,xdb,7.5,HIGH,V2,True,True,2.0,-1.0,...,,1997-03-15 00:00:00+00:00,,,1997-04-14 00:00:00+00:00,,,,,
10,CVE-1999-0068,1997-10-19 04:00:00+00:00,xdb,7.5,HIGH,V2,True,True,1.0,-1.0,...,,1997-11-18 00:00:00+00:00,,,1997-12-18 00:00:00+00:00,,,,,
12,CVE-1999-0109,1997-02-10 05:00:00+00:00,xdb,7.2,HIGH,V2,True,True,1.0,-1.0,...,,1997-03-12 00:00:00+00:00,,,1997-04-11 00:00:00+00:00,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44216,CVE-2023-23162,2023-02-10 00:00:00+00:00,xdb,9.8,CRITICAL,V3.1,True,True,1.0,52.0,...,0.25281,2023-05-03 00:00:00+00:00,0.00118,0.44394,2023-06-02 00:00:00+00:00,0.00103,0.40778,87.301587,-12.711864,63.492063
44258,CVE-2023-23488,2023-01-20 00:00:00+00:00,poc_xdb,9.8,CRITICAL,V3.1,True,True,3.0,13.0,...,0.99114,2023-03-04 05:28:09+00:00,0.82659,0.99578,2023-04-03 05:28:09+00:00,0.77751,0.97679,19.762674,-5.937647,12.651589
45059,CVE-2023-2779,2023-06-19 10:52:39.566000+00:00,xdb,6.1,MEDIUM,V3.1,True,True,1.0,0.0,...,,2023-07-20 00:00:00+00:00,0.0008,0.33249,2023-08-19 00:00:00+00:00,0.0008,0.33304,,0.0,
45475,CVE-2023-29849,2023-04-24 00:00:00+00:00,xdb,8.8,HIGH,V3.1,True,True,1.0,-4.0,...,,2023-05-20 00:00:00+00:00,0.00238,0.60421,2023-06-19 00:00:00+00:00,0.00238,0.60612,,0.0,


In [95]:
# df = df[df['days_to_poc_exploit'] < 0] # Before CVE publication
# df = df[df['days_to_poc_exploit'] <= 90] # Sooner than 90 days
# df = df[df['days_to_poc_exploit'] >= 0] # After CVE publication
# df = df[(df['days_to_poc_exploit'] >= 0) & (df['days_to_poc_exploit'] <= 90)] # Within 90 days
# df = df[(df['days_to_poc_exploit'] < -3000) & (df['days_to_poc_exploit'] >= -4457)]
# df = df[(df['days_to_poc_exploit'] < -2000) & (df['days_to_poc_exploit'] >= -3000)]
# df = df[(df['days_to_poc_exploit'] < -1000) & (df['days_to_poc_exploit'] >= -2000)]
# df = df[(df['days_to_poc_exploit'] < 0) & (df['days_to_poc_exploit'] >= -1000)]
# df = df[(df['days_to_poc_exploit'] < 0) & (df['days_to_poc_exploit'] >= -500)]
# df = df[(df['days_to_poc_exploit'] < 0) & (df['days_to_poc_exploit'] >= -250)]

# Compute Q1, Q3, and IQR
# Q1 = df['days_to_poc_exploit'].quantile(0.25)
# Q3 = df['days_to_poc_exploit'].quantile(0.75)
# IQR = Q3 - Q1

# # Define bounds for non-extreme values
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# # # Filter the DataFrame to exclude extreme outliers
# df = df[(df['days_to_poc_exploit'] >= lower_bound) & (df['days_to_poc_exploit'] <= upper_bound)]

main_df_results = {
    'cve_count': len(df),
    'avg_time_to_exploit': df['days_to_poc_exploit'].mean(),
    'cvss_mean': df['cvss'].mean(),
    'cvss_std': df['cvss'].std(),
    'cvss_geq_7_mean': df[df['cvss'] >= 7.0]['cvss'].mean(),
    'cvss_geq_7_std': df[df['cvss'] >= 7.0]['cvss'].std(),
    'cvss_geq_7_%_of_total': ((len(df[df['cvss'] >= 7.0]) / len(df)) * 100) if len(df) > 0 else 'No CVEs',
    'epss_0_mean': df['epss_0'].mean(),
    'epss_30_mean': df['epss_30'].mean(),
    'epss_60_mean': df['epss_60'].mean(),
    'epss_0_geq_50_%': ((len(df[df['epss_0'] >= 0.5]) / len(df)) * 100) if len(df) > 0 else 'No CVEs',
    'epss_60_geq_50_%': ((len(df[df['epss_30'] >= 0.5]) / len(df)) * 100) if len(df) > 0 else 'No CVEs',
    'epss_30_geq_50_%': ((len(df[df['epss_60'] >= 0.5]) / len(df)) * 100) if len(df) > 0 else 'No CVEs',
    'change_0_to_30_mean': df['change_0_to_30'].mean(),
    'change_30_to_60_mean': df['change_30_to_60'].mean(),
    'change_0_to_60_mean': df['change_0_to_60'].mean(),
    'change_0_to_30_std': df['change_0_to_30'].std(),
    'change_30_to_60_std': df['change_30_to_60'].std(),
    'change_0_to_60_std': df['change_0_to_60'].std()
}

pprint(main_df_results, sort_dicts=False)

{'cve_count': 56725,
 'avg_time_to_exploit': 230.40419554223638,
 'cvss_mean': 6.8755807562655615,
 'cvss_std': 1.8386021992792931,
 'cvss_geq_7_mean': 8.361715242642342,
 'cvss_geq_7_std': 0.9674306563071299,
 'cvss_geq_7_%_of_total': 49.47730277655355,
 'epss_0_mean': 0.07234982674982647,
 'epss_30_mean': 0.09939219924337961,
 'epss_60_mean': 0.10414406187624754,
 'epss_0_geq_50_%': 0.2838254737769943,
 'epss_60_geq_50_%': 0.5376817981489643,
 'epss_30_geq_50_%': 0.5817540766857646,
 'change_0_to_30_mean': 2647.9290977546016,
 'change_30_to_60_mean': 245.17023420605074,
 'change_0_to_60_mean': 3208.654115592077,
 'change_0_to_30_std': 17799.523217359983,
 'change_30_to_60_std': 5627.749377809434,
 'change_0_to_60_std': 20031.027919935084}


## Correlations

In [108]:
# Validify dataset variables
df = df[['cvss', 'epss_60', 'days_to_poc_exploit', 'exploit_count']].dropna().copy()

# Grab variables to reduce code duplication
cvss = df['cvss']
epss = df['epss_60']
days_to_exploit = df['days_to_poc_exploit']
exploit_count = df['exploit_count']

non_parametric_corr(cvss, epss, 'CVSS', 'EPSS')
non_parametric_corr(cvss, days_to_exploit, 'CVSS', 'first exploit code publication date')
non_parametric_corr(epss, days_to_exploit, 'EPSS', 'first exploit code publication date')
non_parametric_corr(cvss, exploit_count, 'CVSS', 'exploit count')
non_parametric_corr(epss, exploit_count, 'EPSS', 'exploit count')

Spearman's correlation between CVSS and EPSS: [32;1m0.26[0m | p-value: [32;1m0.00[0m
Kendall's Tau correlation between CVSS and EPSS: [32;1m0.18[0m | p-value: [32;1m0.00[0m

Spearman's correlation between CVSS and first exploit code publication date: [32;1m0.07[0m | p-value: [32;1m0.00[0m
Kendall's Tau correlation between CVSS and first exploit code publication date: [32;1m0.05[0m | p-value: [32;1m0.00[0m

Spearman's correlation between EPSS and first exploit code publication date: [32;1m0.31[0m | p-value: [32;1m0.00[0m
Kendall's Tau correlation between EPSS and first exploit code publication date: [32;1m0.21[0m | p-value: [32;1m0.00[0m

Spearman's correlation between CVSS and exploit count: [32;1m0.15[0m | p-value: [32;1m0.00[0m
Kendall's Tau correlation between CVSS and exploit count: [32;1m0.12[0m | p-value: [32;1m0.00[0m

Spearman's correlation between EPSS and exploit count: [32;1m0.16[0m | p-value: [32;1m0.00[0m
Kendall's Tau correlation betwee

# Synthetic Data

In [132]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta, date

def generate_synthetic_scada_dataset(num_rows=500):
    '''Generates a synthetic dataset for SCADA vulnerability analysis.'''

    data = {
        'CVE_ID': [f'CVE-{random.randint(2020, 2023)}-{random.randint(1000, 9999)}' for _ in range(num_rows)],
        'Publication_Date': [],
        'CVSS_Score': [round(random.uniform(0.0, 10.0), 1) for _ in range(num_rows)],
        'EPSS_Score': [round(random.uniform(0.0, 1.0), 3) for _ in range(num_rows)],
        'Exploit_Code_Date': [],
        'Time_Lag_Days': [],
        'High_CVSS': [],
        'Exploit_Within_30': [],
        'Exploit_Within_60': [],
        'Exploit_Within_90': [],
        'SCADA_Relevance': [random.choice([0, 1]) for _ in range(num_rows)],
        'IIoT_Relevance': [random.choice([0, 1]) for _ in range(num_rows)],
        'Zero_Day': [],
        'Threat_Actor': [random.choice([0, 1]) for _ in range(num_rows)],
        'Affected_Device': [],
        'Attack_Success': [],
        'Data_Source': [],
        'Notes': []
    }

    start_date = date(2020, 1, 1)
    end_date = date(2023, 12, 31)

    for i in range(num_rows):
        pub_date = start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        data['Publication_Date'].append(pub_date)

        exploit_date = None
        time_lag = None
        exploit_within_30 = 0
        exploit_within_60 = 0
        exploit_within_90 = 0
        zero_day = 0

        if random.random() < 0.7:  # 70% chance of exploit
            lag = random.randint(-60, 180)  # Range of possible time lags
            exploit_date = pub_date + timedelta(days=lag)
            time_lag = lag

            if lag <= 30 and lag >= 0:
                exploit_within_30 = 1
            if lag <= 60 and lag >= 0:
                exploit_within_60 = 1
            if lag <= 90 and lag >= 0:
                exploit_within_90 = 1
            if lag < 0:
                zero_day = 1
        else:
            exploit_date = "Unknown"

        data['Exploit_Code_Date'].append(exploit_date)
        data['Time_Lag_Days'].append(time_lag)
        data['Exploit_Within_30'].append(exploit_within_30)
        data['Exploit_Within_60'].append(exploit_within_60)
        data['Exploit_Within_90'].append(exploit_within_90)
        data['Zero_Day'].append(zero_day)

        data['High_CVSS'].append(1 if data['CVSS_Score'][i] >= 7.0 else 0)

        device_list = ["PLC-S7-1200", "RTU-Modbus", "HMI-Panel", "Sensor-IoT", "Actuator-Wireless"]
        data['Affected_Device'].append(random.choice(device_list))

        #Attack Success Logic
        success_prob = 0.2 #base probablity
        if data['High_CVSS'][i] == 1:
            success_prob += 0.2
        if data['Exploit_Within_90'][i] == 1:
            success_prob += 0.2
        if data['SCADA_Relevance'][i] == 1:
            success_prob += 0.15
        if data['IIoT_Relevance'][i] == 1:
            success_prob += 0.15
        if data['Threat_Actor'][i] == 1:
            success_prob += 0.1
        success_prob = min(success_prob, 0.95) #cap probability at 95%
        data['Attack_Success'].append(1 if random.random() < success_prob else 0)

        data['Data_Source'].append("Synthetic Data")
        data['Notes'].append("Generated Synthetic Data")

    df = pd.DataFrame(data)
    return df

# Generate and save the dataset
df = generate_synthetic_scada_dataset(5000)

In [133]:
main_df_results = {
    'cve_count': len(df),
    'time_lag_mean': df['Time_Lag_Days'].mean(),
    'cvss_mean': df['CVSS_Score'].mean(),
    'cvss_std': df['CVSS_Score'].std(),
    'cvss_greater_than_or_equal_to_7_mean': df[df['CVSS_Score'] >= 7.0]['CVSS_Score'].mean(),
    'cvss_greater_than_or_equal_to_7_std': df[df['CVSS_Score'] >= 7.0]['CVSS_Score'].std(),
    'cvss_greater_than_or_equal_to_7_%_of_total': ((len(df[df['CVSS_Score'] >= 7.0]) / len(df)) * 100) if len(df) > 0 else 'No CVEs',
    'epss_0_mean': df['EPSS_Score'].mean(),
    'epss_greater_than_or_equal_to_50_%': ((len(df[df['EPSS_Score'] >= 0.5]) / len(df)) * 100) if len(df) > 0 else 'No CVEs',
    'exploit_within_30': (len(df[df['Exploit_Within_30'] == 1]) / len(df)) * 100,
    'exploit_within_60': (len(df[df['Exploit_Within_60'] == 1]) / len(df)) * 100,
    'exploit_within_90': (len(df[df['Exploit_Within_90'] == 1]) / len(df)) * 100,
    'scada_rel': (len(df[df['SCADA_Relevance'] == 1]) / len(df)) * 100,
    'iiot_rel': (len(df[df['IIoT_Relevance'] == 1]) / len(df)) * 100,
    'zero_day': (len(df[df['Zero_Day'] == 1]) / len(df)) * 100,
    'attack_success': (len(df[df['Attack_Success'] == 1]) / len(df)) * 100,
}

pprint(main_df_results, sort_dicts=False)

{'cve_count': 5000,
 'time_lag_mean': 59.61577898034748,
 'cvss_mean': 4.988180000000001,
 'cvss_std': 2.889204005873846,
 'cvss_greater_than_or_equal_to_7_mean': 8.463383020090733,
 'cvss_greater_than_or_equal_to_7_std': 0.8773712138229096,
 'cvss_greater_than_or_equal_to_7_%_of_total': 30.86,
 'epss_0_mean': 0.4940938,
 'epss_greater_than_or_equal_to_50_%': 49.059999999999995,
 'exploit_within_30': 9.42,
 'exploit_within_60': 18.58,
 'exploit_within_90': 27.48,
 'scada_rel': 50.22,
 'iiot_rel': 48.9,
 'zero_day': 17.080000000000002,
 'attack_success': 49.64}


In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CVE_ID             5000 non-null   object 
 1   Publication_Date   5000 non-null   object 
 2   CVSS_Score         5000 non-null   float64
 3   EPSS_Score         5000 non-null   float64
 4   Exploit_Code_Date  5000 non-null   object 
 5   Time_Lag_Days      3484 non-null   float64
 6   High_CVSS          5000 non-null   int64  
 7   Exploit_Within_30  5000 non-null   int64  
 8   Exploit_Within_60  5000 non-null   int64  
 9   Exploit_Within_90  5000 non-null   int64  
 10  SCADA_Relevance    5000 non-null   int64  
 11  IIoT_Relevance     5000 non-null   int64  
 12  Zero_Day           5000 non-null   int64  
 13  Threat_Actor       5000 non-null   int64  
 14  Affected_Device    5000 non-null   object 
 15  Attack_Success     5000 non-null   int64  
 16  Data_Source        5000 

In [130]:
# Validify dataset variables
df = df.dropna().copy()

# Grab variables to reduce code duplication
cvss = df['CVSS_Score']
epss = df['EPSS_Score']
days_to_exploit = df['Time_Lag_Days']
exploit_date = df['Exploit_Code_Date']
within_30 = df['Exploit_Within_30']
within_60 = df['Exploit_Within_60']
within_90 = df['Exploit_Within_90']
high_cvss = df['High_CVSS']
scada = df['SCADA_Relevance']
iiot = df['IIoT_Relevance']
zero = df['Zero_Day']
success = df['Attack_Success']


non_parametric_corr(cvss, epss, 'CVSS', 'EPSS')
non_parametric_corr(cvss, days_to_exploit, 'CVSS', 'first exploit code publication date')
non_parametric_corr(epss, days_to_exploit, 'EPSS', 'first exploit code publication date')
non_parametric_corr(cvss, exploit_date, 'CVSS', 'exploit date')
non_parametric_corr(epss, exploit_date, 'EPSS', 'exploit date')
non_parametric_corr(success, within_30, 'attack success', 'within 30 days of publication')
non_parametric_corr(success, within_60, 'attack success', 'within 60 days of publication')
non_parametric_corr(success, within_90, 'attack success', 'within 90 days of publication')
non_parametric_corr(epss, within_30, 'EPSS', 'within 30 days of publication')
non_parametric_corr(epss, within_60, 'CVSS', 'within 60 days of publication')
non_parametric_corr(epss, within_90, 'EPSS', 'within 90 days of publication')
non_parametric_corr(success, scada, 'attack success', 'SCADA relevance')
non_parametric_corr(success, iiot, 'attack success', 'IIoT relevance')

Spearman's correlation between CVSS and EPSS: [32;1m-0.03[0m | p-value: [32;1m0.07[0m
Kendall's Tau correlation between CVSS and EPSS: [32;1m-0.02[0m | p-value: [32;1m0.07[0m

Spearman's correlation between CVSS and first exploit code publication date: [32;1m-0.01[0m | p-value: [32;1m0.69[0m
Kendall's Tau correlation between CVSS and first exploit code publication date: [32;1m-0.00[0m | p-value: [32;1m0.69[0m

Spearman's correlation between EPSS and first exploit code publication date: [32;1m-0.00[0m | p-value: [32;1m0.94[0m
Kendall's Tau correlation between EPSS and first exploit code publication date: [32;1m-0.00[0m | p-value: [32;1m0.94[0m

Spearman's correlation between CVSS and exploit date: [32;1m-0.02[0m | p-value: [32;1m0.33[0m
Kendall's Tau correlation between CVSS and exploit date: [32;1m-0.01[0m | p-value: [32;1m0.33[0m

Spearman's correlation between EPSS and exploit date: [32;1m0.02[0m | p-value: [32;1m0.15[0m
Kendall's Tau correlation b