In [1]:
# Silence every Python warning for the rest of the session so they do not clutter cell output.
import warnings
warnings.filterwarnings('ignore')

import os

from dotenv import load_dotenv, find_dotenv
# Read the local .env file. The return value is bound to `_` so the cell displays nothing.
# NOTE(review): presumably this is where the OpenAI / W&B API keys come from — confirm.
_ = load_dotenv(find_dotenv()) # read local .env file

In [2]:
import os
import random
import re
import wandb
from rich.markdown import Markdown

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import GitLoader
from langchain.text_splitter import MarkdownTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from langchain.prompts import PromptTemplate

In [4]:
def _is_markdown(file_path):
    """Return True when ``file_path`` ends with ".md"."""
    return file_path.endswith(".md")


# Load the payload-list repository (branch "master") through the local path
# ./github_data/sql_repo/, keeping only the files accepted by _is_markdown.
loader = GitLoader(
    clone_url="https://github.com/payloadbox/sql-injection-payload-list",
    repo_path="./github_data/sql_repo/",
    branch="master",
    file_filter=_is_markdown,
)

# `data` holds the loaded documents; later cells index it and pass it to the splitter.
data = loader.load()

In [5]:
# Peek at the raw text of the first loaded document (shown as this cell's output).
data[0].page_content

'### SQL Injection Payload List\r\n\r\n<p align="center">\r\n  <img src="https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg"> <img src="https://img.shields.io/github/stars/payloadbox/sql-injection-payload-list?style=social"> <img src="https://img.shields.io/github/forks/payloadbox/sql-injection-payload-list?style=social"> <img src="https://img.shields.io/github/repo-size/payloadbox/sql-injection-payload-list"> <img src="https://img.shields.io/github/license/payloadbox/sql-injection-payload-list"> <img src="https://img.shields.io/github/issues/detail/author/payloadbox/command-injection-payload-list/1">\r\n</p>\r\n\r\n#### SQL Injection\r\n\r\nIn this section, we\'ll explain what SQL injection is, describe some common examples, explain how to find and exploit various kinds of SQL injection vulnerabilities, and summarize how to prevent SQL injection. \r\n\r\n#### What is SQL injection (SQLi)?\r\n\r\nSQL injection is a web security vulnerab

In [6]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each span stops at the first ``>`` that
    follows its ``<``. ``.`` does not match a newline with this pattern, so a
    ``<`` whose closing ``>`` is on a later line is left in place.

    Args:
        text: The string to clean.

    Returns:
        A new string with the matched spans replaced by the empty string.
    """
    return re.compile('<.*?>').sub('', text)

# Strip HTML tags from every loaded document rather than only data[0]:
# split_documents(data) chunks all of them further down, so any additional
# Markdown file would otherwise be embedded with its raw tags. Looping also
# avoids an IndexError when the loader returns no documents.
for doc in data:
    doc.page_content = remove_html_tags(doc.page_content)

In [13]:
# Weights & Biases configuration, set through environment variables:
#   LANGCHAIN_WANDB_TRACING - the single switch needed to start tracing langchain with W&B
#   WANDB_PROJECT           - the wandb project name
os.environ.update({
    "LANGCHAIN_WANDB_TRACING": "true",
    "WANDB_PROJECT": "sql-injection-tools",
})

## Parsing

In [8]:
# Build a Markdown-aware splitter and cut the loaded documents into overlapping sections.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in characters — confirm.
md_text_splitter = MarkdownTextSplitter(
    chunk_overlap=500,
    chunk_size=4000,
)
document_sections = md_text_splitter.split_documents(data)

# Show how many sections the split produced.
len(document_sections)

18

In [9]:
# Render the third section as rich Markdown to eyeball what the splitter produced.
Markdown(document_sections[2].page_content)

## Embeddings

In [10]:
# We will use the OpenAIEmbeddings to embed the text, and Qdrant to store the vectors
embeddings = OpenAIEmbeddings()
db = Qdrant.from_documents(
    document_sections,
    embeddings,
    # On-disk location of the local Qdrant database.
    path="/tmp/local_qdrant_db",
    # Keyword fixed: it was misspelled `collectio_name`, so the intended name
    # never reached the `collection_name` parameter.
    collection_name="sql_injection_tools",
)

In [11]:
# Wrap the vector store in a retriever with default settings; the bare name on the
# last line displays its repr so the search type and kwargs can be inspected.
db_retriever = db.as_retriever()
db_retriever

VectorStoreRetriever(vectorstore=<langchain.vectorstores.qdrant.Qdrant object at 0x000001CB45F2B090>, search_type='similarity', search_kwargs={})

In [16]:
# First test question: fetch the documents the retriever considers relevant to it.
# `query` and `docs` are reused by the prompt-building cell below.
query = "List some Generic Union Select Payloads"
docs = db_retriever.get_relevant_documents(query)

Error in WandbTracer.on_retriever_start callback: retriever
Error in WandbTracer.on_retriever_end callback: No retriever Run found to be traced


In [17]:
# Prompt text with two placeholders: {context} (retrieved text) and {question}.
prompt_template = """
Use the following context to answer the question at the end.
If you don't know the answer, just say that you don't know, try not to make up an answer.

{context}

Question: {question}
Answer:
"""

# Reusable template object; later cells call PROMPT.format(...) with a new query.
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Only the first retrieved document is used as context.
# NOTE(review): docs[0] raises IndexError if the retriever returned nothing.
context = docs[0].page_content
prompt = PROMPT.format(context=context, question=query)

In [19]:
# Create the chat model with temperature 0.0, send it the formatted prompt, and
# render the reply as Markdown. `llm` is reused by the later question cells.
llm = ChatOpenAI(temperature=0.0)
response = llm.predict(prompt)
Markdown(response)

In [20]:
# Same question as the first retrieval cell but with "UNION SELECT" capitalised.
# Steps: retrieve -> take the first document as context -> format PROMPT -> ask the LLM.
# NOTE(review): this sequence is repeated verbatim in the next cell; a small helper
# taking `query` would remove the duplication.
query = "List some Generic UNION SELECT Payloads"
docs = db_retriever.get_relevant_documents(query)
context = docs[0].page_content
prompt = PROMPT.format(context=context, question=query)

response = llm.predict(prompt)
Markdown(response)

Error in WandbTracer.on_retriever_start callback: retriever
Error in WandbTracer.on_retriever_end callback: No retriever Run found to be traced


In [21]:
# Ask about time-based payloads using the same retrieve -> format -> predict steps:
# the first retrieved document becomes the context for PROMPT, and the reply is
# rendered as Markdown.
query = "List some Generic Time Based SQL Injection Payloads"
docs = db_retriever.get_relevant_documents(query)
context = docs[0].page_content
prompt = PROMPT.format(context=context, question=query)

response = llm.predict(prompt)
Markdown(response)

Error in WandbTracer.on_retriever_start callback: retriever
Error in WandbTracer.on_retriever_end callback: No retriever Run found to be traced
