In [None]:
"""
Vendor Articles from CVEs
"""

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

from tqdm import tqdm
tqdm.pandas(desc="my bar!")
import numpy as np

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Sentence-embedding model, loaded once at module level and reused by
# find_text_similarity() and main() further down in this notebook.
# model = SentenceTransformer('stsb-roberta-large') # deprecated
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')


In [None]:
#test = pd.DataFrame(np.random.randint(0, 100, (100000, 6)))

#test.progress_apply(lambda x: x.sum(), axis=1).shape[0] == test.shape[0]


In [None]:

# Matches any run of whitespace (spaces, newlines, tabs, carriage returns).
# remove_newlines() uses it to collapse such runs into a single space.
# Written as a raw string so the backslash-s is handed to the regex engine
# verbatim instead of being an invalid string escape (which newer Python
# versions warn about).
spaces_pattern = re.compile(r'\s+')

# Example advisory page; not referenced by the functions in this notebook.
URL = "https://www.jenkins.io/security/advisory/2023-06-14/"

# Browser-like User-Agent header sent by beautiful_request() with every request.
headers = {
   "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
}



def request_articles(url_list):
    """Fetch every URL and return the usable text lines of each page.

    Args:
        url_list: iterable of URL strings to download.

    Returns:
        A list with one entry per page that could be fetched. Each entry is
        the list of that page's text lines that contain more than three
        words. URLs for which ``beautiful_request`` returns ``None`` (or an
        empty result) are skipped.
    """
    articles = []
    for url in url_list:
        # beautiful_request() returns the page already split into text lines
        # (or None on failure), not a BeautifulSoup object, so the lines are
        # filtered here directly rather than through text_prep(), which
        # expects an object with a ``.text`` attribute.
        page_lines = beautiful_request(url)
        if page_lines:
            # word_count_filter() rejects empty and blank lines as well.
            articles.append([line for line in page_lines if word_count_filter(line)])
    # Return the collected pages (the previous code returned the function
    # object itself by mistake).
    return articles


def beautiful_request(url):
    """Download ``url`` and return the page's text split into lines.

    URLs containing ``github.com`` get ``?raw=true`` appended and are fetched
    with redirects enabled; every other URL is fetched without following
    redirects. All requests use the module-level ``headers`` and a
    (connect, read) timeout of (4, 5) seconds.

    Args:
        url: address of the page to download.

    Returns:
        A list of text lines extracted from the response body, or ``None``
        when the request raises or the status code is outside 200-299.
    """
    try:
        # e.g. url = 'https://github.com/syz913/CVE-reports/blob/main/CVE-2023-31821.md'
        is_github = 'github.com' in url
        if is_github:
            url += '?raw=true'
        response = requests.get(url=url, timeout=(4, 5), allow_redirects=is_github, headers=headers)
    except Exception:
        # Best effort: connection errors, timeouts, malformed URLs, etc. just
        # mean "no article". Catching Exception rather than using a bare
        # except keeps KeyboardInterrupt/SystemExit working, so a long
        # scraping loop can still be interrupted.
        return None

    if not 200 <= response.status_code < 300:
        return None

    # Before handing the body to BeautifulSoup, replace a newline that
    # directly follows a period with '...' so that such sentence breaks do
    # not become separate lines in the split below.
    htmltext = response.text.replace('.\n', '...')
    soup = BeautifulSoup(htmltext, 'lxml')
    # Doing the sentence clean-up after parsing would need more thought;
    # the pre-parse replacement is the quick approach for now.
    return soup.text.split('\n')


def spaces_filter(text_list):
    """Report whether ``text_list`` has a non-zero length.

    Used as a ``filter()`` predicate to drop empty entries (for example the
    empty strings produced when splitting on a separator). An entry made up
    only of whitespace still has a length and therefore passes.
    """
    size = len(text_list)
    return size != 0


def word_count(text):
    """Count the whitespace-separated words in ``text``.

    ``str.split()`` without an argument splits on any run of whitespace
    (spaces, tabs, newlines, ...) and never yields empty items, so repeated
    or leading/trailing whitespace does not inflate the count. Splitting on
    the space character only, as before, counted tab-separated words as one
    word and a tabs-only string as a word.

    Args:
        text: the string to measure.

    Returns:
        The number of words as an ``int``; ``0`` for empty or blank input.
    """
    return len(text.split())


def word_count_filter(text):
    """Predicate for ``filter()``: True only when ``text`` has more than three words."""
    threshold = 3
    return word_count(text) > threshold


def text_prep(soup):
    """Extract the text lines worth keeping from a parsed page.

    Args:
        soup: object exposing the page text through a ``.text`` string
            attribute (e.g. a BeautifulSoup document).

    Returns:
        The newline-separated lines of ``soup.text`` that are non-empty and
        contain more than three words, in their original order.
    """
    kept_lines = []
    for line in soup.text.split('\n'):
        # Same two checks, in the same order: discard empty lines first,
        # then lines with three words or fewer.
        if spaces_filter(line) and word_count_filter(line):
            kept_lines.append(line)
    return kept_lines


def remove_newlines(text):
    """Return ``text`` with every match of ``spaces_pattern`` replaced by one space.

    With the module's whitespace pattern this collapses newlines, tabs and
    repeated spaces into single spaces.
    """
    return spaces_pattern.sub(' ', text)


def find_text_similarity(text_list, embedding1=None, nvd_description=None):
    """Score every line of ``text_list`` against a reference embedding.

    All lines are encoded in a single batched ``model.encode`` call instead
    of one call per line, which lets the model process them in batches.

    Args:
        text_list: iterable of text lines to score.
        embedding1: precomputed embedding of the reference text. Replaced by
            the encoding of ``nvd_description`` when that argument is given.
        nvd_description: reference text; when non-empty it is encoded with
            the module-level ``model`` and used as the reference embedding.

    Returns:
        A list of floats holding the cosine similarity between the reference
        embedding and each line, in the order of ``text_list``. An empty
        ``text_list`` yields an empty list.

    Raises:
        ValueError: if there are lines to score but neither ``embedding1``
            nor a non-empty ``nvd_description`` was supplied.
    """
    if nvd_description:
        embedding1 = model.encode(nvd_description, convert_to_tensor=True)

    lines = list(text_list)
    if not lines:
        return []
    if embedding1 is None:
        raise ValueError("either embedding1 or nvd_description must be provided")

    # One row of embeddings per line.
    line_embeddings = model.encode(lines, convert_to_tensor=True)
    # Similarity matrix with a single row (the reference) and one column per line.
    cosine_scores = util.pytorch_cos_sim(embedding1, line_embeddings)
    return cosine_scores[0].tolist()


def fetch_relevant_text(scores, text_list):
    """Return the lines around the highest-scoring line, plus that line's index.

    Args:
        scores: one score per entry of ``text_list``.
        text_list: the lines the scores belong to.

    Returns:
        A tuple ``(window, best_idx)`` where ``best_idx`` is the position of
        the maximum score and ``window`` is the slice of ``text_list`` from
        two lines before it to two lines after it, clipped to the list bounds.
    """
    best_idx = pd.Series(scores).idxmax()
    # Up to two lines of context on each side of the best match.
    window_start = max(0, best_idx - 2)
    window_end = min(len(text_list), best_idx + 3)
    return text_list[window_start:window_end], best_idx

In [None]:
def main(row):
    """Build one text blob of the most relevant snippets from a CVE's reference pages.

    Args:
        row: mapping-like record providing ``row['description']`` (the text
            to compare against) and ``row['references_url_list']`` (an
            iterable of URLs to fetch).

    Returns:
        A single string: for each fetched page with fewer than 400 lines, the
        lines around its best match to the description joined with ``"..."``;
        the per-page snippets are joined with ``" --- "`` and whitespace runs
        are collapsed by ``remove_newlines``.
    """
    description_embedding = model.encode(row['description'], convert_to_tensor=True)

    # Download every reference and drop the ones that could not be fetched.
    fetched_pages = [beautiful_request(url) for url in row['references_url_list']]
    pages = [page for page in fetched_pages if page is not None]

    snippets = []
    for page_lines in pages:
        if len(page_lines) >= 400:
            # Pages with 400 or more lines are skipped rather than scored.
            print("too fucking long...")
            continue
        line_scores = find_text_similarity(page_lines, embedding1=description_embedding)
        best_lines, _ = fetch_relevant_text(line_scores, page_lines)
        # One snippet per web page.
        snippets.append("...".join(best_lines))

    combined = " --- ".join(snippets)
    # Collapse \t, \r, \n and repeated spaces.
    return remove_newlines(combined)


In [None]:
# Load the pipe-delimited NVD CVE metrics file (first column used as the
# index) and keep a random sample of 400 rows; random_state is fixed so the
# same rows are drawn on every run.
df = pd.read_csv("data/nvd_cve_metrics.txt", sep="|", index_col=0).sample(400, random_state=456)
df.head()

In [None]:
df['references_url_list'].isnull().sum()

In [None]:
import ast

# Turn each cell of the column into a Python list (missing values become []).
# Presumably the file stores every URL list as its Python-literal text --
# verify against the data. ast.literal_eval only parses literals, whereas
# eval() would execute any expression found in the data file.
# Values that are no longer strings (this cell was already run) pass through
# unchanged, so the cell is safe to re-run.
df['references_url_list'] = df['references_url_list'].fillna('[]').apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Order the sample by publishedDate, latest first.
df = df.sort_values(by='publishedDate', ascending=False)

In [None]:
import datetime as dt

In [None]:
# Run the scraping pipeline on the first 50 rows of df and collect each
# row's combined article text in `results`.
# `output` and `row` keep their last values after the loop; the following
# cells display them.
results = []
startTime = dt.datetime.now()

for i, row in df.head(50).iterrows():
    output = main(row)
    results.append(output)
    # Progress log: the row's index label and the time elapsed since the start.
    print(i, dt.datetime.now() - startTime)
    print(output)


In [None]:
output

In [None]:
row

In [None]:
row['references_url_list']

In [None]:
#requests.get('http://www.securityfocus.com/bid/49500')
#response = requests.get('http://www.redhat.com/support/errata/RHSA-2011-1249.html', allow_redirects=False)

In [None]:
response = requests.get('https://www.unisoc.com/en_us/secy/announcementDetail/1676902764208259073', allow_redirects=False)
response.text

In [None]:
headers = {
   "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
}

In [None]:
response = requests.get(url = 'https://www.unisoc.com/en_us/secy/announcementDetail/1676902764208259073', timeout=(4, 5), allow_redirects=False, headers=headers)
response.text

In [None]:
response.json()

In [None]:
# Save the response body to disk for offline inspection.
# write() emits the string in one call (writelines() would iterate over it
# character by character), and the explicit UTF-8 encoding avoids
# platform-dependent encoding errors on non-ASCII pages.
with open('stuff.html', 'w', encoding='utf-8') as mf:
    mf.write(response.text)

In [None]:
soup = BeautifulSoup(response.text, 'lxml')
soup.text

In [None]:
htmltext = beautiful_request('https://www.unisoc.com/en_us/secy/announcementDetail/1676902764208259073')


In [None]:
soup = BeautifulSoup(response.text, 'html.parser')
soup

In [None]:
len(soup.text)

In [None]:
test = '   **Vulnerability name:**  Exposure of secret in ALBIS\r'
remove_newlines(test)

In [None]:
out = soup.prettify()

In [None]:
htmltext = out.split('\n')
htmltext

In [None]:
# Print the index and content of the first entry of htmltext that mentions
# CVE-2023-30932, then stop. `line` stays bound afterwards and is displayed
# by the next cell (if nothing matches, it holds the last entry instead).
for i, line in enumerate(htmltext):
    if 'CVE-2023-30932' in line:
        print(i, line)
        break

In [None]:
line

In [None]:
# syz913/CVE-reports/blob/main/CVE-2023-31821.md
url = 'https://raw.githubusercontent.com/syz913/CVE-reports/main/CVE-2023-31821.md?raw=true'
response = requests.get(url = url, timeout=(4, 5), allow_redirects=False)
response.text
# https://github.com/syz913/CVE-reports/blob/main/CVE-2023-31821.md?raw=true

In [None]:
url = 'https://github.com/syz913/CVE-reports/blob/main/CVE-2023-31821.md'
if 'github.com' in url:
    url += '?raw=true'
    response = requests.get(url = url, timeout=(4, 5), allow_redirects=True)
    print(response.text)