# Wrangle

In [1]:
#imports
import pandas as pd
import numpy as np

# system
import sys
# sys.path.append("../data")
sys.path.append("util_")
import wrangle_

In [2]:
# lacd data
df = pd.read_csv("data/known_exploited_vulnerabilities.csv")
df.head()

Unnamed: 0,cveID,vendorProject,product,vulnerabilityName,dateAdded,shortDescription,requiredAction,dueDate,knownRansomwareCampaignUse,notes,cwes
0,CVE-2021-27104,Accellion,FTA,Accellion FTA OS Command Injection Vulnerability,2021-11-03,Accellion FTA contains an OS command injection...,Apply updates per vendor instructions.,2021-11-17,Known,,
1,CVE-2021-27102,Accellion,FTA,Accellion FTA OS Command Injection Vulnerability,2021-11-03,Accellion FTA contains an OS command injection...,Apply updates per vendor instructions.,2021-11-17,Known,,
2,CVE-2021-27101,Accellion,FTA,Accellion FTA SQL Injection Vulnerability,2021-11-03,Accellion FTA contains a SQL injection vulnera...,Apply updates per vendor instructions.,2021-11-17,Known,,
3,CVE-2021-27103,Accellion,FTA,Accellion FTA Server-Side Request Forgery (SSR...,2021-11-03,Accellion FTA contains a server-side request f...,Apply updates per vendor instructions.,2021-11-17,Known,,
4,CVE-2021-21017,Adobe,Acrobat and Reader,Adobe Acrobat and Reader Heap-based Buffer Ove...,2021-11-03,Acrobat Acrobat and Reader contain a heap-base...,Apply updates per vendor instructions.,2021-11-17,Unknown,,


In [3]:
# look at the shape
df.shape

(1140, 11)

In [4]:
#look at the columns
df.columns

Index(['cveID', 'vendorProject', 'product', 'vulnerabilityName', 'dateAdded',
       'shortDescription', 'requiredAction', 'dueDate',
       'knownRansomwareCampaignUse', 'notes', 'cwes'],
      dtype='object')

In [5]:
# check the full info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1140 entries, 0 to 1139
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   cveID                       1140 non-null   object 
 1   vendorProject               1140 non-null   object 
 2   product                     1140 non-null   object 
 3   vulnerabilityName           1140 non-null   object 
 4   dateAdded                   1140 non-null   object 
 5   shortDescription            1140 non-null   object 
 6   requiredAction              1140 non-null   object 
 7   dueDate                     1140 non-null   object 
 8   knownRansomwareCampaignUse  1140 non-null   object 
 9   notes                       373 non-null    object 
 10  cwes                        0 non-null      float64
dtypes: float64(1), object(10)
memory usage: 98.1+ KB


In [6]:
# select numeric
len(df.select_dtypes("number").columns) # count of numeric columns

1

In [7]:
#select object columns
len(df.select_dtypes("object").columns) #count of object columns

10

In [8]:
df.describe()

Unnamed: 0,cwes
count,0.0
mean,
std,
min,
25%,
50%,
75%,
max,


**What I see:**


- I have 1140 rows and 11columns
- 1 of the 11 columns are numric while 10 of them are string object colums
- the numeric column `cwes` is null
- the `notes` column has 373 non null rows
- descriptive statistics says nothing on the `cwes` empty column

In [9]:
# remove the non useul unique id and the emplt column
df = df.drop(columns=["cveID", "dateAdded","cwes"])
df.columns

Index(['vendorProject', 'product', 'vulnerabilityName', 'shortDescription',
       'requiredAction', 'dueDate', 'knownRansomwareCampaignUse', 'notes'],
      dtype='object')

In [10]:
# nanipulate the column names to a clear name
spaced_cols = []
col_renamed = ""
for name in df.columns:
    for ele in name:
        if ele == ele.upper():
            col_renamed += " " + ele
        else:
            col_renamed += ele
    spaced_cols.append(col_renamed)
    col_renamed = ""

df.columns = spaced_cols
spaced_cols

['vendor Project',
 'product',
 'vulnerability Name',
 'short Description',
 'required Action',
 'due Date',
 'known Ransomware Campaign Use',
 'notes']

In [11]:
# remove spaces from the ends
# replace " " with _
# convert from lower case
new_cols = df.columns.str.strip().str.replace(" ", "_").str.lower()
df.columns = new_cols #replace the original columns with the new columns
df.columns

Index(['vendor_project', 'product', 'vulnerability_name', 'short_description',
       'required_action', 'due_date', 'known_ransomware_campaign_use',
       'notes'],
      dtype='object')

In [12]:
# remove duplicated rows
print("original data size: ", df.shape)
df = df.drop_duplicates(keep="first")
print("new data size: ", df.shape)

original data size:  (1140, 8)
new data size:  (1099, 8)


In [13]:
# Split the data into 3 sets
train, validate, test = wrangle_.split_data_(df=df, 
                     test_size=0.2, 
                     validate_size= 0.2, 
                     stratify_col="known_ransomware_campaign_use", 
                     random_state=10)
train.shape, validate.shape, test.shape

((659, 8), (220, 8), (220, 8))

In [14]:
wrangle_.save_original_data(df=train, folder="./data", file_name="temp_train_data_to_be_removed")

'File temp_train_data_to_be_removed saved'

In [15]:
# reset index
train = train.reset_index(drop=True)

In [16]:
# count values for each feature
for col in train.columns:
    print(train[col].value_counts)
    print("unique value count size:",train[col].value_counts().shape)
    print("\n\n")

<bound method IndexOpsMixin.value_counts of 0          Zimbra
1          Google
2      SolarWinds
3         Mozilla
4           Rails
          ...    
654         Adobe
655     Hikvision
656     Microsoft
657         Cisco
658        Google
Name: vendor_project, Length: 659, dtype: object>
unique value count size: (121,)



<bound method IndexOpsMixin.value_counts of 0                         Collaboration (ZCS)
1                                 Chromium V8
2                                      Serv-U
3                     Firefox and Thunderbird
4                               Ruby on Rails
                        ...                  
654                                ColdFusion
655               Security cameras web server
656                         Internet Explorer
657    IOS Software and Cisco IOS XE Software
658                               Chromium V8
Name: product, Length: 659, dtype: object>
unique value count size: (308,)



<bound method IndexOpsMixin.value_counts of 0

# Tasks from EDA

* The source of memory corruption is from 4 vendors, but it looks like it's happening on different products of these vendors, so I will go back inthe wrangle file and separate the product from the volunerability and create a additional column for specific vulnerabilities groupd by wording. (cell 7 - cell 15)

In [17]:
train_temp = train.copy()

In [18]:
# # Privilege Escalation Vulnerability  
# search_terms = ["Privilege", "Escalation"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [19]:
# # Memory Corruption Vulnerability
# search_terms = ["Memory", "Corruption"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [20]:
# # Arbitrary File Upload Vulnerability
# search_terms = ["File", "Upload"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [21]:
# # Remote/Arbitrary Code Execution Vulnerability 
# search_terms = ["Execution", "Remote", "Arbitrary"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [22]:
# # Improper Access Control Vulnerability
# search_terms = ["Access", "Improper", "Control"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [23]:
# # Command Injection Vulnerability
# search_terms = ["Injection", "Command", "SQL"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [24]:
# # Use-After-Free Vulnerability 
# search_terms = ["Use-After", "Use-After-Free"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [25]:
# # Deserialization of Untrusted Data Vulnerability
# search_terms = ["Deserialization", "Untrusted"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [26]:
# # Type Confusion Vulnerability  
# search_terms = ["Type Confusion"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [27]:
# # Path Traversal Vulnerability
# search_terms = ["Traversal", "path"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [28]:
# # Buffer Overflow Vulnerability
# search_terms = ["Buffer", "Overflow", "Heap"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [29]:
# # Denial of Service Vulnerability 
# search_terms = ["Denial", "Service"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [30]:
# # Information Disclosure Vulnerability
# search_terms = ["Information", "Disclosure"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [31]:
# # Out-of-Bounds Write Vulnerability
# search_terms = ["Out-of-Bounds", "Write", "Bounds"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [32]:
# # Sandbox Bypass Vulnerability 
# search_terms = ["Sandbox", "Bypass"]
# pattern = '|'.join(search_terms)

# len(train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)])

In [33]:
# # Arbitrary File Upload Vulnerability
# search_terms = ["Unspecified"]
# pattern = '|'.join(search_terms)

# train.vulnerability_name[train['vulnerability_name'].str.contains(pattern, case=False, na=False)]

#### I cam combine all of the obove conditions into one as I engineer a new feature for vulnerabilities

In [34]:
# Create a function that handles feature engineering for vulnerability names
def feature_eng_vul_names(data_frame):
    # Define the conditions
    conditions = [
        data_frame['vulnerability_name'].str.contains(r'Privilege|Escalation', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Memory|Corruption', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'File|Upload', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Remote Code', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Access|Improper|Control', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Injection|Command|SQL', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Use-After|Use-After-Free', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Deserialization|Untrusted', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Type Confusion', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Traversal|path', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Buffer|Overflow|Heap', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Denial|Service', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Information|Disclosure', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Out-of-Bounds|Write|Bounds', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Sandbox|Bypass', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Unspecified', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Spoofing', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Input|Validation', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Hard-Coded|Credentials', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Forgery|SSRF|Request|Server', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Scripting|XSS|Cross|Site', case=False, na=False),
        data_frame['vulnerability_name'].str.contains(r'Arbitrary Code', case=False, na=False)
    ]

    # Define the corresponding values for the new feature
    choices = [
        "Privilege Escalation Vulnerability",
        "Memory Corruption Vulnerability",
        "File Upload Vulnerability",
        "Remote Code Execution Vulnerability",
        "Improper Access Control Vulnerability",
        "Command Injection Vulnerability",
        "Use-After-Free Vulnerability",
        "Deserialization of Untrusted Data Vulnerability",
        "Type Confusion Vulnerability",
        "Path Traversal Vulnerability",
        "Buffer Overflow Vulnerability",
        "Denial of Service Vulnerability",
        "Information Disclosure Vulnerability",
        "Out-of-Bounds Write Vulnerability",
        "Sandbox Bypass Vulnerability",
        "Arbitrary File Upload Vulnerability",
        "Spoofing Vulnerabilities",
        "Input Validation Vulnerabilities",
        "Use of Hard-Coded Credentials Vulnerabilities",
        "Server-Side Request Forgery (SSRF) Vulnerabilities",
        "Cross-Site Scripting (XSS) Vulnerabilities",
        "Arbitrary Code Execution Vulnerability",
    ]

    # Use np.select to create the new feature
    data_frame['vulnerability_type'] = np.select(conditions, choices, default="Other Vulnerability")
    return data_frame


#### The newly engineered feature values

In [35]:
feature_eng_vul_names(train).vulnerability_type.value_counts()

vulnerability_type
Privilege Escalation Vulnerability                    93
Remote Code Execution Vulnerability                   90
Command Injection Vulnerability                       59
Improper Access Control Vulnerability                 46
Sandbox Bypass Vulnerability                          43
Memory Corruption Vulnerability                       39
Buffer Overflow Vulnerability                         39
Use-After-Free Vulnerability                          35
Other Vulnerability                                   34
Path Traversal Vulnerability                          24
Information Disclosure Vulnerability                  23
File Upload Vulnerability                             20
Out-of-Bounds Write Vulnerability                     18
Denial of Service Vulnerability                       18
Deserialization of Untrusted Data Vulnerability       16
Type Confusion Vulnerability                          15
Arbitrary File Upload Vulnerability                   12
Cross-Site S

**Let know look how we can engineer the `required_action` column**

In [36]:
train.required_action.value_counts()

required_action
Apply updates per vendor instructions.                                                                                                                                                                                                                                                                                                                                               505
Apply mitigations per vendor instructions or discontinue use of the product if mitigations are unavailable.                                                                                                                                                                                                                                                                           93
The impacted product is end-of-life and should be disconnected if still in use.                                                                                                                                                       

I see that there is only 15 options that represent the entire training dataset, howerver they are too wordy and I don;t like that, so I will engineer a new column withshort and specific identifiers for each required action .

In [37]:
# Create a function that handles feature engineering for vulnerability names
def feature_eng_req_actions(data_frame):
#     Create an array to contain the required actions
    actions = np.array(data_frame.required_action.value_counts().keys())
    # Define the conditions
    conditions = [
        data_frame['required_action'] == actions[0],
        data_frame['required_action'] == actions[1],
        data_frame['required_action'] == actions[2],
        data_frame['required_action'] == actions[3],
        data_frame['required_action'] == actions[4],
        data_frame['required_action'] == actions[5],
        data_frame['required_action'] == actions[6],
        data_frame['required_action'] == actions[7],
        data_frame['required_action'] == actions[8],
        data_frame['required_action'] == actions[9],
        data_frame['required_action'] == actions[10],
        data_frame['required_action'] == actions[11],
        data_frame['required_action'] == actions[12],
        data_frame['required_action'] == actions[13],
        data_frame['required_action'] == actions[14]
    ]

    # Define the corresponding values for the new feature
    choices = [
        "Apply Updates", 
        "Apply Mitigations or Discontinue", 
        "End of Life Disconnect",  
        "Apply Updates or Discontinue",  
        "Apply Remediations or Discontinue",  
        "Apply Updates or Discontinue",  
        "Legacy Disconnect",  
        "Verify Cisco Compliance",  
        "Mitigate GroupLock or Discontinue",  
        "End of Life Disconnect",  
        "Mitigate or Disable SLP",  
        "Log4j Remediation",  
        "Internet Block and Update",  
        "Enable Threat Prevention",  
        "DLink Update or Disconnect",
    ]

    # Use np.select to create the new feature
    data_frame['required_action_new'] = np.select(conditions, choices)
    return data_frame


In [38]:
feature_eng_req_actions(train).required_action_new.value_counts()

required_action_new
Apply Updates                        505
Apply Mitigations or Discontinue      93
End of Life Disconnect                32
Apply Updates or Discontinue          12
Apply Remediations or Discontinue      6
Legacy Disconnect                      3
Verify Cisco Compliance                2
Mitigate GroupLock or Discontinue      1
Mitigate or Disable SLP                1
Log4j Remediation                      1
Internet Block and Update              1
Enable Threat Prevention               1
DLink Update or Disconnect             1
Name: count, dtype: int64

In [39]:
train.required_action.value_counts().values

array([505,  93,  31,   8,   6,   4,   3,   2,   1,   1,   1,   1,   1,
         1,   1])

**Now lets look into values contained in products**

In [67]:
train["product"].value_counts()

product
Windows                              73
Multiple Products                    33
Flash Player                         21
Internet Explorer                    19
Chromium V8                          15
                                     ..
DirectX Graphics Kernel (DXGKRNL)     1
UCM6200                               1
HTTP File Server (HFS)                1
Multiple Vigor Routers                1
Security cameras web server           1
Name: count, Length: 308, dtype: int64

In [71]:
product_groups = {
    "Operating Systems": [
        "Windows",
        "OS X",
        "macOS",
        "Android Kernel",
        "Solaris",
        "IOS Software",
        "IOS and IOS XE",
        "IOS, IOS XR, and IOS XE",
        "IOS, XR, and XE Software",
        "iOS",
        "iOS, iPadOS, and watchOS",
        "iOS, iPadOS, and macOS",
        "iOS and macOS",
        "iOS and iPadOS",
        "XP",
        "Ubuntu",
        "Debian-specific Redis Servers"
    ],
    "Microsoft Products": [
        "Office",
        "Excel",
        "PowerPoint",
        "Windows COM+ Event System Service",
        ".NET Framework",
        ".NET Core and Visual Studio",
        "MSHTML",
        "Active Directory",
        "Hyper-V RemoteFX",
        "Exchange Server",
        "Internet Explorer",
        "Windows Kernel",
        "Skype for Business",
        "Defender",
        "Task Scheduler",
        "Windows Server",
        "Silverlight",
        "Internet Information Services (IIS)",
        "Azure",
        "Microsoft Visual Studio",
        "Active Management Technology (AMT), Small Business Technology (SBT), and Standard Manageability"
    ],
    "Browsers": [
        "Chromium",
        "Firefox",
        "Firefox and Thunderbird",
        "Chromium V8",
        "Chromium Network Service",
        "Chromium WebP",
        "Chromium Visuals",
        "Chromium Intents",
        "Chromium Portals",
        "Chromium WebRTC",
        "Chrome FreeType",
        "Chrome WebAudio",
        "Edge",
        "Edge and Internet Explorer",
        "Internet Explorer Scripting Engine",
        "Internet Explorer"
    ],
    "Adobe Products": [
        "Flash Player",
        "Acrobat and Reader",
        "ColdFusion",
        "Flash Player and AIR",
        "Reader and Acrobat"
    ],
    "Networking": [
        "Adaptive Security Appliance (ASA)",
        "Adaptive Security Appliance (ASA) and Firepower Threat Defense (FTD)",
        "NetScaler ADC and NetScaler Gateway",
        "AnyConnect Secure",
        "IOS and IOS XE Software",
        "IOS XR",
        "FortiOS",
        "FortiOS and FortiADC",
        "FortiOS and FortiProxy",
        "SSLVPN SMA100",
        "VPN Routers",
        "RouterOS",
        "NX-OS",
        "SD-WAN Edge",
        "SD-WAN and NetScaler",
        "WebRTC",
        "Windows SMBv1",
        "SMBv1 server",
        "SonicWall Email Security",
        "Secure Remote Access (SRA)",
        "SSLVPN SMA100",
        "SSLVPN SMA200",
        "Firebox and XTM Appliances",
        "WebSphere Application Server and Server Hypervisor Edition",
        "ProCurve Manager (PCM), PCM+, Identity Driven Manager (IDM), and Application Lifecycle Management"
    ],
    "Cisco Products": [
        "Adaptive Security Appliance (ASA)",
        "Adaptive Security Appliance (ASA) and Firepower Threat Defense (FTD)",
        "IOS and IOS XE Software",
        "IOS XR",
        "NetScaler ADC and NetScaler Gateway",
        "AnyConnect Secure",
        "VPN Routers",
        "RouterOS",
        "NX-OS",
        "SD-WAN Edge",
        "SD-WAN and NetScaler",
        "WebRTC",
        "Secure Remote Access (SRA)"
    ],
    "Security": [
        "FortiOS",
        "FortiOS and FortiADC",
        "FortiOS and FortiProxy",
        "SSLVPN SMA100",
        "SSLVPN SMA200",
        "Windows SMBv1",
        "SMBv1 server",
        "SonicWall Email Security",
        "Secure Remote Access (SRA)",
        "Firebox and XTM Appliances"
    ],
    "Development and Frameworks": [
        "Java SE",
        "DotNetNuke (DNN)",
        "Apache Struts",
        "Struts 1",
        "ActiveMQ",
        "PHP",
        "PHP FastCGI Process Manager (FPM)",
        "Ruby on Rails",
        "Log4j2",
        "Telerik UI for ASP.NET AJAX",
        ".NET Framework, SharePoint, Visual Studio",
        "MSHTML",
        "Graphics Component",
        "CouchDB",
        "Spring Cloud Gateway",
        "Spring Framework"
    ],
    "Web Servers and Middleware": [
        "vCenter Server",
        "JBoss",
        "Tomcat",
        "Nagios XI",
        "Log4j2",
        "Spring Cloud",
        "WebSphere Application Server and Server Hypervisor Edition",
        "ProCurve Manager (PCM), PCM+, Identity Driven Manager (IDM), and Application Lifecycle Management",
        "Fusion Middleware"
    ],
    "Virtualization and Cloud": [
        "VMware ESXi",
        "vCenter Server",
        "HyperFlex HX",
        "BIG-IP and BIG-IQ Centralized Management",
        "Aria Operations for Networks",
        "Workspace ONE Access and Identity Manager",
        "Virtualization Manager",
        "Application Delivery Controller (ADC), Gateway, and SD-WAN WANOP Appliance",
        "Nexus Repository Manager"
    ],
    "Database": [
        "SQL Server",
        "Exim",
        "MySQL",
        "PostgreSQL",
        "Oracle Database",
        "MSSQL Server",
        "MongoDB",
        "Elasticsearch",
        "Redis",
        "CouchDB",
        "Solr"
    ],
    "Storage": [
        "Network Attached Storage (NAS)",
        "Multiple NAS Devices",
        "QNAP Network-Attached Storage (NAS)",
        "Network Attached Storage (NAS)"
    ],
    "Miscellaneous": [
        "Multiple Products",
        "Multiple Devices",
        "Multiple Chipsets",
        "Multiple Firewalls",
        "Streaming Service",
        "Roundcube Webmail",
        "Apex Central",
        "Symantec Messaging Gateway",
        "EyesOfNetwork",
        "Exim Internet Mailer",
        "Junos OS",
        "Web Appliance",
        "Tools",
        "Viewer",
        "Cobalt Strike",
        "Sense",
        "Sudo",
        "dotCMS",
        "SysAid Server",
        "ManageEngine ServiceDesk Plus (SDP)",
        "MinIO",
        "Security cameras web server",
        "UCM6200",
        "Ancillary Function Driver (afd.sys)"
    ]
}

In [74]:
def map_product_groups(product):
    for key, value in product_groups.items():
        if product in value:
            return key
    return "Other"

train["product"].apply(map_product_groups)

0      Other
1      Other
2      Other
3      Other
4      Other
       ...  
654    Other
655    Other
656    Other
657    Other
658    Other
Name: product, Length: 659, dtype: object

In [73]:
train["product"].apply(map_product_groups).value_counts()


product
Other    659
Name: count, dtype: int64

In [43]:
# Create a reverse mapping for quick lookup
reverse_mapping = {product: category for category, products in product_groups.items() for product in train["product"]}
reverse_mapping
# # Function to map a product to its category
# def map_to_category(product):
#     return reverse_mapping.get(product, "Unknown")

# # Apply the function to create a new column 'Category'
# train['product_group'] = train['product'].apply(map_to_category)

# train

{'Collaboration (ZCS)': 'Miscellaneous',
 'Chromium V8': 'Miscellaneous',
 'Serv-U': 'Miscellaneous',
 'Firefox and Thunderbird': 'Miscellaneous',
 'Ruby on Rails': 'Miscellaneous',
 'Windows': 'Miscellaneous',
 'Data Risk Manager': 'Miscellaneous',
 'Adaptive Security Appliance (ASA) and Firepower Threat Defense (FTD)': 'Miscellaneous',
 'iOS and macOS': 'Miscellaneous',
 'Content Collaboration': 'Miscellaneous',
 'Chromium WebGL': 'Miscellaneous',
 'Customer Relationship Management (CRM)': 'Miscellaneous',
 'vCenter Server': 'Miscellaneous',
 'Word': 'Miscellaneous',
 'MSHTML': 'Miscellaneous',
 'SonicWall Email Security': 'Miscellaneous',
 'IOS and IOS XE Software': 'Miscellaneous',
 'PowerPoint': 'Miscellaneous',
 'Acrobat and Reader': 'Miscellaneous',
 'HTTP.sys': 'Miscellaneous',
 'Firefox': 'Miscellaneous',
 'Script Security Plugin': 'Miscellaneous',
 'Virtualization Manager': 'Miscellaneous',
 'EyesOfNetwork': 'Miscellaneous',
 'Webmin': 'Miscellaneous',
 'vCenter Server and Cl

In [115]:
train.product_group.value_counts()

product_group
Miscellaneous    659
Name: count, dtype: int64

**Apply new changes to the entire dataframe**

In [40]:
df = feature_eng_vul_names(df)
df = feature_eng_req_actions(df)
df.head(3)

Unnamed: 0,vendor_project,product,vulnerability_name,short_description,required_action,due_date,known_ransomware_campaign_use,notes,vulnerability_type,required_action_new
0,Accellion,FTA,Accellion FTA OS Command Injection Vulnerability,Accellion FTA contains an OS command injection...,Apply updates per vendor instructions.,2021-11-17,Known,,Command Injection Vulnerability,Apply Updates
1,Accellion,FTA,Accellion FTA OS Command Injection Vulnerability,Accellion FTA contains an OS command injection...,Apply updates per vendor instructions.,2021-11-17,Known,,Command Injection Vulnerability,Apply Updates
2,Accellion,FTA,Accellion FTA SQL Injection Vulnerability,Accellion FTA contains a SQL injection vulnera...,Apply updates per vendor instructions.,2021-11-17,Known,,Command Injection Vulnerability,Apply Updates


In [41]:
# remove columns not needed
df = df.drop(columns=["notes", "short_description", "vulnerability_name", "due_date", "required_action"])
df.head(3)

Unnamed: 0,vendor_project,product,known_ransomware_campaign_use,vulnerability_type,required_action_new
0,Accellion,FTA,Known,Command Injection Vulnerability,Apply Updates
1,Accellion,FTA,Known,Command Injection Vulnerability,Apply Updates
2,Accellion,FTA,Known,Command Injection Vulnerability,Apply Updates


### Dummies

Create dummy variables for modeling

In [42]:
# First create a copy of the original dataframe
df_dummy = df.copy()

In [43]:
# get all the columns to upply dummies to
dummy_col = df_dummy.columns[df_dummy.columns != "known_ransomware_campaign_use"]

df_dummy[dummy_col].head(3)

Unnamed: 0,vendor_project,product,vulnerability_type,required_action_new
0,Accellion,FTA,Command Injection Vulnerability,Apply Updates
1,Accellion,FTA,Command Injection Vulnerability,Apply Updates
2,Accellion,FTA,Command Injection Vulnerability,Apply Updates


In [44]:
df_dummy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1099 entries, 0 to 1139
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   vendor_project                 1099 non-null   object
 1   product                        1099 non-null   object
 2   known_ransomware_campaign_use  1099 non-null   object
 3   vulnerability_type             1099 non-null   object
 4   required_action_new            1099 non-null   object
dtypes: object(5)
memory usage: 51.5+ KB


In [45]:
# Generate dummy variables for modeling
df_dummy = pd.get_dummies(df_dummy[dummy_col], drop_first=True).astype(int)
df_dummy.head(3)

Unnamed: 0,vendor_project_Acronis,vendor_project_Adobe,vendor_project_Alcatel,vendor_project_Amcrest,vendor_project_Android,vendor_project_Apache,vendor_project_Apple,vendor_project_Arcadyan,vendor_project_Arcserve,vendor_project_Arm,...,required_action_new_Apply Updates or Discontinue,required_action_new_DLink Update or Disconnect,required_action_new_Enable Threat Prevention,required_action_new_End of Life Disconnect,required_action_new_Internet Block and Update,required_action_new_Legacy Disconnect,required_action_new_Log4j Remediation,required_action_new_Mitigate GroupLock or Discontinue,required_action_new_Mitigate or Disable SLP,required_action_new_Verify Cisco Compliance
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


I have generated 685 new features for modeling.

**Make the targer binary**

* All `known` ransomware are labled as `1`, and all `unknown` copains are labeled with `0`.

In [46]:
df_dummy["known_ransomware_campaign_use"] = np.where(df.known_ransomware_campaign_use == "Known", 1, 0)
df_dummy.head(3)

Unnamed: 0,vendor_project_Acronis,vendor_project_Adobe,vendor_project_Alcatel,vendor_project_Amcrest,vendor_project_Android,vendor_project_Apache,vendor_project_Apple,vendor_project_Arcadyan,vendor_project_Arcserve,vendor_project_Arm,...,required_action_new_DLink Update or Disconnect,required_action_new_Enable Threat Prevention,required_action_new_End of Life Disconnect,required_action_new_Internet Block and Update,required_action_new_Legacy Disconnect,required_action_new_Log4j Remediation,required_action_new_Mitigate GroupLock or Discontinue,required_action_new_Mitigate or Disable SLP,required_action_new_Verify Cisco Compliance,known_ransomware_campaign_use
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Split and Save all the data sets

In [47]:
# Split the data into 3 sets
train, validate, test = wrangle_.split_data_(df=df, 
                     test_size=0.2, 
                     validate_size= 0.2, 
                     stratify_col="known_ransomware_campaign_use", 
                     random_state=10)
train.shape, validate.shape, test.shape

((659, 5), (220, 5), (220, 5))

In [48]:
df.head(3)

Unnamed: 0,vendor_project,product,known_ransomware_campaign_use,vulnerability_type,required_action_new
0,Accellion,FTA,Known,Command Injection Vulnerability,Apply Updates
1,Accellion,FTA,Known,Command Injection Vulnerability,Apply Updates
2,Accellion,FTA,Known,Command Injection Vulnerability,Apply Updates


In [49]:
# Split the data into 3 sets
train_dummy, validate_dummy, test_dummy = wrangle_.split_data_(df=df_dummy, 
                     test_size=0.2, 
                     validate_size= 0.2, 
                     stratify_col="known_ransomware_campaign_use", 
                     random_state=10)
train_dummy.shape, validate_dummy.shape, test_dummy.shape

((659, 686), (220, 686), (220, 686))

In [50]:
# Save data sets
wrangle_.save_split_data_(original_df=df,
                         encoded_scaled_df= df_dummy,
                         train = train_dummy,
                          validate = validate_dummy,
                          test= test_dummy,
                          folder_path = "./data",
                          test_size= 0.2,
                          stratify_col = "known_ransomware_campaign_use",
                          random_state= 10
                         )

'SIX data sets saved as .csv'