In [1]:
import os
import csv
import pandas as pd


In [2]:
exploit_db_raw_text = os.path.join('data', 'exploitdb.txt')

In [3]:
#read lines encoded as binary
with open(exploit_db_raw_text, 'rb') as f:
    lines = f.readlines()
    
#remove \n at end of lines
lines = [l.strip() for l in lines]

In [4]:
#see sample
lines[:10]

[b'(Bitcoin / Dogecoin) PHP Cloud Mining Script - Authenticat | https://www.exploit-db.com/exploits/42531',
 b"(Gabriel's FTP Server) Open & Compact FTP Server 1.2 - 'PO | https://www.exploit-db.com/exploits/12698",
 b"(Gabriel's FTP Server) Open & Compact FTP Server 1.2 - Aut | https://www.exploit-db.com/exploits/27401",
 b"(Gabriel's FTP Server) Open & Compact FTP Server 1.2 - Ful | https://www.exploit-db.com/exploits/13932",
 b"(Gabriel's FTP Server) Open & Compact FTP Server 1.2 - Uni | https://www.exploit-db.com/exploits/12741",
 b"(Gabriel's FTP Server) Open & Compact FTPd 1.2 - Buffer Ov | https://www.exploit-db.com/exploits/11742",
 b"(Gabriel's FTP Server) Open & Compact FTPd 1.2 - Crash (Po | https://www.exploit-db.com/exploits/11391",
 b"(Gabriel's FTP Server) Open & Compact FTPd 1.2 - Remote Ov | https://www.exploit-db.com/exploits/11420",
 b'(GREEZLE) Global Real Estate Agent Login - Multiple SQL In | https://www.exploit-db.com/exploits/34111',
 b"(Multiple Products) - 'ba

In [5]:
#check length
len(lines)

44094

In [6]:
#read as a csv
df = pd.read_csv(exploit_db_raw_text,
                 sep='|', 
                 header=None, 
                 encoding='latin-1',
                 error_bad_lines=False,  #skip bad lines
                 warn_bad_lines=True) #keep track of badf lines

df.columns = ['title', 'url']
df.head()

b'Skipping line 19437: expected 2 fields, saw 3\nSkipping line 43087: expected 2 fields, saw 3\nSkipping line 43385: expected 2 fields, saw 3\n'


Unnamed: 0,title,url
0,(Bitcoin / Dogecoin) PHP Cloud Mining Script -...,https://www.exploit-db.com/exploits/42531
1,(Gabriel's FTP Server) Open & Compact FTP Serv...,https://www.exploit-db.com/exploits/12698
2,(Gabriel's FTP Server) Open & Compact FTP Serv...,https://www.exploit-db.com/exploits/27401
3,(Gabriel's FTP Server) Open & Compact FTP Serv...,https://www.exploit-db.com/exploits/13932
4,(Gabriel's FTP Server) Open & Compact FTP Serv...,https://www.exploit-db.com/exploits/12741


In [7]:
#inspect what went wrong
lines[19437]

b"Linux Kernel 3.0.0 - 'perf_count_sw_cpu_clock' event Denia | https://www.exploit-db.com/exploits/17769"

In [8]:
#same with this line
lines[43091]

b'BSD/x86 - execve(/bin/sh) Shellcode (27 bytes)             | https://www.exploit-db.com/shellcodes/13246'

In [9]:
#make dict with bad lines
bad_lines_dict = {"Linux Kernel 3.0.0 - 'perf_count_sw_cpu_clock' event Denia": 'https://www.exploit-db.com/exploits/17769%22',
            'BSD/x86 - execve(/bin/sh) + Encoded Shellcode (57 bytes)': 'https://www.exploit-db.com/shellcodes/13252'}
#create separate dataframe
df_bad_lines = pd.DataFrame.from_dict(bad_lines_dict, orient='index')
# reset index
df_bad_lines = df_bad_lines.reset_index()
#add columns
df_bad_lines.columns = ['title', 'url']
df_bad_lines.head()

Unnamed: 0,title,url
0,Linux Kernel 3.0.0 - 'perf_count_sw_cpu_clock'...,https://www.exploit-db.com/exploits/17769%22
1,BSD/x86 - execve(/bin/sh) + Encoded Shellcode ...,https://www.exploit-db.com/shellcodes/13252


In [10]:
#combine the two dataframes
df_final = pd.concat([df, df_bad_lines])

#reset index
df_final = df_final.reset_index(drop=True)
#sort
df_final.sort_index(inplace=True, ascending=True)

df_final.shape

(44093, 2)

In [11]:
#note that titles might apppear duplicate but they are not
df_final.head(10)

Unnamed: 0,title,url
0,(Bitcoin / Dogecoin) PHP Cloud Mining Script -...,https://www.exploit-db.com/exploits/42531
1,(Gabriel's FTP Server) Open & Compact FTP Serv...,https://www.exploit-db.com/exploits/12698
2,(Gabriel's FTP Server) Open & Compact FTP Serv...,https://www.exploit-db.com/exploits/27401
3,(Gabriel's FTP Server) Open & Compact FTP Serv...,https://www.exploit-db.com/exploits/13932
4,(Gabriel's FTP Server) Open & Compact FTP Serv...,https://www.exploit-db.com/exploits/12741
5,(Gabriel's FTP Server) Open & Compact FTPd 1.2...,https://www.exploit-db.com/exploits/11742
6,(Gabriel's FTP Server) Open & Compact FTPd 1.2...,https://www.exploit-db.com/exploits/11391
7,(Gabriel's FTP Server) Open & Compact FTPd 1.2...,https://www.exploit-db.com/exploits/11420
8,(GREEZLE) Global Real Estate Agent Login - Mul...,https://www.exploit-db.com/exploits/34111
9,(Multiple Products) - 'banner.swf' Cross-Site ...,https://www.exploit-db.com/exploits/33760


In [12]:
df_final.to_csv(os.path.join('data', 'exploitdb.csv'))