In [75]:
from langchain.document_loaders import DirectoryLoader

# Build a loader over the ./demo directory; the "**/*.java" glob selects Java
# source files at any depth below it.
loader = DirectoryLoader('./demo', glob="**/*.java")
# Read every matching file into a document object. `docs` is consumed by the
# splitting/embedding cell further down.
docs = loader.load()
# Last expression of the cell, so the notebook displays the number of loaded
# documents.
len(docs)

6

In [90]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Do not keep the API key in the notebook: reuse the key already present in the
# environment and only ask for it (without echoing) when it is missing. This
# also avoids overwriting a valid key with a placeholder value.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Split the loaded documents into chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    # Upper bound for one chunk, measured with `length_function` (characters).
    chunk_size=1000,
    # Number of characters shared between neighbouring chunks.
    chunk_overlap=20,
    length_function=len,
    # Separators are treated as literal strings, not regular expressions.
    is_separator_regex=False,
)
documents = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI embeddings and store the result in a Chroma
# collection; `db` is queried by the violation-check cell.
db = Chroma.from_documents(documents, OpenAIEmbeddings())
# NOTE(review): no persist_directory is passed above, so where (and whether)
# this writes to disk depends on the installed Chroma defaults - confirm.
db.persist()
# Last expression of the cell: display the stored ids, metadata and documents.
db.get()

{'ids': ['82391e32-a864-11ee-86e4-9e15c6918cf2',
  '82391e33-a864-11ee-95ed-9e15c6918cf2',
  '82391e34-a864-11ee-9f82-9e15c6918cf2',
  '82391e35-a864-11ee-84da-9e15c6918cf2',
  '82391e36-a864-11ee-b469-9e15c6918cf2',
  '82391e37-a864-11ee-849a-9e15c6918cf2'],
 'embeddings': None,
 'metadatas': [{'source': 'demo\\src\\main\\java\\com\\example\\demo\\DemoApplication.java'},
  {'source': 'demo\\src\\main\\java\\com\\example\\demo\\client\\ExampleClient.java'},
  {'source': 'demo\\src\\main\\java\\com\\example\\demo\\controller\\ExampleController.java'},
  {'source': 'demo\\src\\main\\java\\com\\example\\demo\\service\\ReadFileSvc.java'},
  {'source': 'demo\\src\\main\\java\\com\\example\\demo\\service\\WriteFileSvc.java'},
  {'source': 'demo\\src\\test\\java\\com\\example\\demo\\DemoApplicationTests.java'}],
 'documents': ['package com.example.demo;\n\nimport org.springframework.boot.SpringApplication;\n\nimport org.springframework.boot.autoconfigure.SpringBootApplication;\n\n@SpringBootA

In [81]:
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain.output_parsers import PydanticOutputParser

# Structured form of one violation-check answer: a verdict plus its explanation.
# NOTE(review): nothing in the visible cells references this model (the check
# loop parses the answer with StrOutputParser) - presumably meant to be used
# with PydanticOutputParser; confirm or remove.
class ConfirmResult(BaseModel):
    # Verdict as free text; the description asks for "yes" or "no", but no
    # validation of the value is performed here.
    match: str = Field(description="answer is yes or no")
    # Explanation of the verdict together with a suggestion.
    reason: str = Field(description="reason and suggestion")

In [91]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema import StrOutputParser
from langchain_core.prompts import format_document
from string import Template
import asyncio

querys = (
    ("find occurance of something like 1.1.1.1", "hard coded ip address"),
    ("use servelet session", "use servelet session"),
    ("find file system usage", "write something to local file system"),
)
for query in querys:
    found_docs = db.similarity_search(query[0], k=3)
    for i, doc in enumerate(found_docs):
        doc_prompt = PromptTemplate.from_template("{page_content}")
        confirmTemplate = Template("""
            Does following content has $violation? The answer format should start with Yes or No and follow with the reason:
            {content}
        """)    

        chain = (
            {
                "content": lambda doc: format_document(doc, doc_prompt),
            }
            | PromptTemplate.from_template(confirmTemplate.substitute(violation=query[1]))
            | ChatOpenAI()
            | StrOutputParser()
        )
        result = chain.invoke(doc)
        await asyncio.sleep(5)
        if (result.startswith("No")):
            break
        print(result, "\n")
        print(doc.metadata, "\n")

Yes, the following content has a hard-coded IP address. 
The reason is that the URL "http://10.112.01.10" is directly specified in the code without any dynamic or variable component. This means that the code is specifically programmed to access a resource at the IP address "10.112.01.10". 

{'source': 'demo\\src\\main\\java\\com\\example\\demo\\client\\ExampleClient.java'} 

Yes, the following content uses Servlet session.

Reason: The HttpSession session parameter is included in the method signature of the getIpAddressAndSession() method. This indicates that the method requires access to the session object. Additionally, the session object is used to set an attribute called "clientIpAddress" using the session.setAttribute() method. 

{'source': 'demo\\src\\main\\java\\com\\example\\demo\\controller\\ExampleController.java'} 

Yes, the following content writes something to the local file system.

Reason: The code uses the FileWriter class to write the provided data to a file. The FileW

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-rVVCSroW43WKcfaVt5n3w4kN on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}