# Preliminary Analysis
This notebook is dedicated to answering questions applied to various filtrations of the focused dataset.

In [223]:
import os
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plot
import seaborn as sbn
from pprint import pprint
from typing import Dict, Tuple

# Import utility functions
src_path = os.path.abspath(os.path.join('..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from utils import *
from classes import Plotter

df = pd.read_parquet(path='../data/processed/composite/dataset_focused.parquet')

## Means & Deviations

In [224]:
# df = df[df['days_to_poc_exploit'] < 0] # Before CVE publication
# df = df[df['days_to_poc_exploit'] <= 90] # Sooner than 90 days
# df = df[df['days_to_poc_exploit'] >= 0] # After CVE publication
# df = df[(df['days_to_poc_exploit'] >= 0) & (df['days_to_poc_exploit'] <= 90)] # Within 90 days
# df = df[(df['days_to_poc_exploit'] < -3000) & (df['days_to_poc_exploit'] >= -4457)] # Within 90 days
# df = df[(df['days_to_poc_exploit'] < -2000) & (df['days_to_poc_exploit'] >= -3000)] # Within 90 days
# df = df[(df['days_to_poc_exploit'] < -1000) & (df['days_to_poc_exploit'] >= -2000)] # Within 90 days
# df = df[(df['days_to_poc_exploit'] < 0) & (df['days_to_poc_exploit'] >= -1000)] # Within 90 days
# df = df[(df['days_to_poc_exploit'] < 0) & (df['days_to_poc_exploit'] >= -500)] # Within 90 days
# df = df[(df['days_to_poc_exploit'] < 0) & (df['days_to_poc_exploit'] >= -250)] # Within 90 days

# # Compute Q1, Q3, and IQR
# Q1 = df['days_to_poc_exploit'].quantile(0.25)
# Q3 = df['days_to_poc_exploit'].quantile(0.75)
# IQR = Q3 - Q1

# # Define bounds for non-extreme values
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# # Filter the DataFrame to exclude extreme outliers
# df = df[(df['days_to_poc_exploit'] >= lower_bound) & (df['days_to_poc_exploit'] <= upper_bound)]

main_df_results = {
    'avg_time_to_exploit': df['days_to_poc_exploit'].mean(),
    'cvss_mean': df['cvss'].mean(),
    'cvss_std': df['cvss'].std(),
    'cvss_geq_7_mean': df[df['cvss'] >= 7.0]['cvss'].mean(),
    'cvss_geq_7_std': df[df['cvss'] >= 7.0]['cvss'].std(),
    'cvss_geq_7_%_of_total': ((len(df[df['cvss'] >= 7.0]) / len(df)) * 100) if len(df) > 0 else 'No CVEs',
    'epss_0_mean': df['epss_0'].mean(),
    'epss_30_mean': df['epss_30'].mean(),
    'epss_60_mean': df['epss_60'].mean(),
    'epss_0_geq_50_%': ((len(df[df['epss_0'] >= 0.5]) / len(df)) * 100) if len(df) > 0 else 'No CVEs',
    'epss_60_geq_50_%': ((len(df[df['epss_30'] >= 0.5]) / len(df)) * 100) if len(df) > 0 else 'No CVEs',
    'epss_30_geq_50_%': ((len(df[df['epss_60'] >= 0.5]) / len(df)) * 100) if len(df) > 0 else 'No CVEs'
}

pprint(main_df_results, sort_dicts=False)

{'avg_time_to_exploit': 230.40419554223638,
 'cvss_mean': 7.36043731778426,
 'cvss_std': 1.7935989475278031,
 'cvss_geq_7_mean': 8.435699373695197,
 'cvss_geq_7_std': 1.0071936314216556,
 'cvss_geq_7_%_of_total': 61.573776512716925,
 'epss_0_mean': 0.07234982674982679,
 'epss_30_mean': 0.09939219924337969,
 'epss_60_mean': 0.10414406187624757,
 'epss_0_geq_50_%': 1.4782848223303646,
 'epss_60_geq_50_%': 2.8004774584519327,
 'epss_30_geq_50_%': 3.030024791111927}


## Correlations

In [194]:
# Validify dataset variables
df = df[['cvss', 'epss_30', 'days_to_poc_exploit', 'exploit_count']].dropna().copy()

# Grab variables to reduce code duplication
cvss = df['cvss']
epss = df['epss_30']
days_to_exploit = df['days_to_poc_exploit']
exploit_count = df['exploit_count']

non_parametric_corr(cvss, epss, 'CVSS', 'EPSS')
non_parametric_corr(cvss, days_to_exploit, 'CVSS', 'first exploit code publication date')
non_parametric_corr(epss, days_to_exploit, 'EPSS', 'first exploit code publication date')
non_parametric_corr(cvss, exploit_count, 'CVSS', 'exploit count')
non_parametric_corr(epss, exploit_count, 'EPSS', 'exploit count')

Spearman's correlation between CVSS and EPSS: [32;1m0.19[0m | p-value: [32;1m0.00[0m
Kendall's Tau correlation between CVSS and EPSS: [32;1m0.13[0m | p-value: [32;1m0.00[0m

Spearman's correlation between CVSS and first exploit code publication date: [32;1m0.07[0m | p-value: [32;1m0.06[0m
Kendall's Tau correlation between CVSS and first exploit code publication date: [32;1m0.05[0m | p-value: [32;1m0.05[0m

Spearman's correlation between EPSS and first exploit code publication date: [32;1m0.17[0m | p-value: [32;1m0.00[0m
Kendall's Tau correlation between EPSS and first exploit code publication date: [32;1m0.12[0m | p-value: [32;1m0.00[0m

Spearman's correlation between CVSS and exploit count: [32;1m0.27[0m | p-value: [32;1m0.00[0m
Kendall's Tau correlation between CVSS and exploit count: [32;1m0.22[0m | p-value: [32;1m0.00[0m

Spearman's correlation between EPSS and exploit count: [32;1m0.28[0m | p-value: [32;1m0.00[0m
Kendall's Tau correlation betwee