In [1]:
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from copy import copy

In [2]:
# Text file that lists the collected sources, one entry per line.
sources_with_data_text = os.path.join('data', 'sources_with_data.txt')

with open(sources_with_data_text, mode='r') as f:
    # Trim surrounding whitespace from every line and keep only the
    # entries that mention a CVE; everything else is dropped.
    lines = [entry for entry in map(str.strip, f.readlines()) if 'CVE' in entry]

# the context manager must have released the file handle by now
assert f.closed

# collapse repeated entries
unique_cve = set(lines)

print("Found {} unique CVEs in {}".format(len(unique_cve), sources_with_data_text))

Found 152 unique CVEs in data/sources_with_data.txt


In [3]:
def load_obj(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(path, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded


In [4]:
# Gather the 'CVE' entries from every pickled dict under broadcom_dicts/.
broadcom_arr = []
for pkl_path in tqdm(glob.glob('broadcom_dicts/*.pkl')):
    cve_entries = load_obj(pkl_path)['CVE']
    # nothing to add when the 'CVE' entry is empty
    if not cve_entries:
        continue
    broadcom_arr.extend(cve_entries)

# collapse repeated identifiers
broadcom_cve = set(broadcom_arr)

print("Found {} unique CVEs in {}".format(len(broadcom_cve), 'broadcom dicts'))

100%|██████████| 6609/6609 [00:00<00:00, 13552.40it/s]

Found 1406 unique CVEs in broadcom dicts





In [57]:
# Union of the CVEs found in either source. The `|` operator builds a new
# set, so broadcom_cve and unique_cve are left untouched.
cve_in_wild = broadcom_cve | unique_cve
print("Found {} unique CVEs overall".format(len(cve_in_wild)))

Found 1479 unique CVEs overll


In [58]:
#fix some inconsistencies in data collection
# The fixes are (old, new) substring replacements applied in order; the
# order is the same as in the original hand-written sequence.

# Stray fragments removed from every identifier BEFORE the length filter.
_PRE_FILTER_FIXES = [
    ('1)', ''),
    ('service', ''),
    ('3)', ''),
    ('_3', ''),
]

# Entries shorter than this after the fragment removal are discarded.
_MIN_CVE_LENGTH = 11

# Replacements applied to the entries that survive the length filter.
_POST_FILTER_FIXES = [
    ('(', ''),
    (')', ''),
    # manual fixes to corrupted identifiers
    ('CVE2019-7278', 'CVE-2019-7278'),
    ('2CVE-2006-3643', 'CVE-2006-3643'),
    ('CVE2019-7279', 'CVE-2019-7279'),
    ('CVE-2018_16858', 'CVE-2018-16858'),
    ('CVE 2014-6278', 'CVE-2014-6278'),
    ('CVE-209-18935', 'CVE-2019-18935'),
    ('CVE_2009-3729', 'CVE-2009-3729'),
    ('CVE-20190-11539', 'CVE-2019-11539'),
    ('CVE-2190-11539', 'CVE-2019-11539'),
]


def _clean_cve(raw):
    """Apply the manual fixes to one identifier string.

    Returns the cleaned string, or ``None`` when the entry is shorter than
    ``_MIN_CVE_LENGTH`` characters after the pre-filter replacements.
    """
    for old, new in _PRE_FILTER_FIXES:
        raw = raw.replace(old, new)
    if len(raw) < _MIN_CVE_LENGTH:
        return None
    for old, new in _POST_FILTER_FIXES:
        raw = raw.replace(old, new)
    return raw


_cleaned = (_clean_cve(cve) for cve in cve_in_wild)
# Several corrupted spellings map to the same identifier (e.g. two rules
# both produce 'CVE-2019-11539'), so the cleaned values can repeat.
# dict.fromkeys drops the repeats while keeping first-seen order; the result
# stays a list, as before.
cve_in_wild = list(dict.fromkeys(c for c in _cleaned if c is not None))





In [59]:
# Second dash-separated field of every identifier, de-duplicated.
dates = {cve_id.split('-')[1] for cve_id in cve_in_wild}

In [60]:
# Sanity check: print every identifier that still contains '2190'
# (no output means none is left).
for cve_id in cve_in_wild:
    if '2190' not in cve_id:
        continue
    print(cve_id)

In [61]:
# Show the distinct values of the second dash-separated field (the year part).
dates

{'1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020'}

In [62]:
# `dates` holds strings, so min() compares them lexicographically; that
# matches numeric order as long as every entry is a 4-digit year.
earliest_year = min(dates)
print("First exploit was recorded in {}".format(earliest_year))

First exploit was recorded in 1999


In [63]:
# `dates` holds strings, so max() compares them lexicographically; that
# matches numeric order as long as every entry is a 4-digit year.
latest_year = max(dates)
print("Last exploit was recorded in {}".format(latest_year))

Last exploit was recorded in 2020


In [12]:
df_nvd = pd.read_csv(os.path.join('data', 'nvdcve_combined.csv'))

# cve_in_wild may be a list at this point; testing membership against a list
# is a linear scan per NVD row. A set gives constant-time lookups on average
# and yields exactly the same True/False answers.
cve_in_wild_lookup = set(cve_in_wild)

# Map every NVD ID to 1 when it was seen in the wild and 0 otherwise.
# Using a dict keeps one entry per distinct ID, in first-seen order.
target_cve_dict = {
    cve: int(cve in cve_in_wild_lookup)
    for cve in df_nvd['ID']
}

# Two-column frame (ID, in_the_wild) with a fresh 0..n-1 index.
df_target = pd.DataFrame(
    list(target_cve_dict.items()),
    columns=['ID', 'in_the_wild'],
)

In [13]:
# Preview the first rows of the label table.
df_target.head()

Unnamed: 0,ID,in_the_wild
0,CVE-1999-0001,0
1,CVE-1999-0002,0
2,CVE-1999-0003,0
3,CVE-1999-0004,0
4,CVE-1999-0005,0


In [14]:
# in_the_wild is a 0/1 flag, so its mean is the share of IDs flagged 1.
df_target['in_the_wild'].mean()

0.009605271916394861

In [15]:
dates = []