In [1]:
from git import Repo
import os

from dotenv import load_dotenv

from langchain.document_loaders.generic import GenericLoader
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.document_loaders.parsers import LanguageParser
from langchain.vectorstores import Chroma
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

import warnings
warnings.filterwarnings("ignore")

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read the Gemini API key from the environment instead of embedding it in the
# notebook: a key stored in source is exposed to everyone who can read the file.
# NOTE(review): the key that used to be hardcoded here should be revoked and rotated.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; add it to your .env file or export it "
        "in the environment before running this notebook."
    )

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Confirm the key is available without echoing the secret into the saved output.
os.getenv("GEMINI_API_KEY") is not None

True

In [3]:
import google.generativeai as genai

# Authenticate the Google Generative AI SDK with the key defined earlier.
genai.configure(api_key=GEMINI_API_KEY)

# Print the name of each model returned by genai.list_models(), one per line.
for available_model in genai.list_models():
    print(available_model.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision
models/embedding-001
models/text-embedding-004
models/aqa


Cloning a GIT Repo

In [5]:
# Clone the repository under analysis into a local working folder.
# The clone is guarded so that Restart & Run All does not fail when the folder
# already holds a checkout (the previous `!mkdir` + unconditional clone raised
# on every run after the first).
clone_url = "https://github.com/Manav446/Sport_Classification_Using_Images_Project.git"
clone_dir = "test_repo/"

if os.path.isdir(os.path.join(clone_dir, ".git")):
    # A checkout is already present: open it instead of cloning again.
    cloned_repo = Repo(clone_dir)
else:
    # Portable replacement for `!mkdir`; a no-op when the folder already exists.
    os.makedirs(clone_dir, exist_ok=True)
    cloned_repo = Repo.clone_from(clone_url, to_path=clone_dir)

# Last expression of the cell: display the Repo object.
cloned_repo

<git.repo.base.Repo 'd:\\Manav\\Personal_Projects\\Source-Code_Analysis_Using_Gen_AI\\research\\test_repo\\.git'>

In [6]:
# Root folder of the cloned repository; the trailing slash is relied on when
# the package sub-path is appended below.
repo_path = "test_repo/"

# Parser configuration applied to every file picked up by this loader.
package_parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)

# Loader limited to the .py files under src/cnnClassifier.
loader = GenericLoader.from_filesystem(
    f"{repo_path}src/cnnClassifier",
    glob="**/*",
    suffixes=[".py"],
    parser=package_parser,
)

In [7]:
# Same loader settings as the package-only loader, but rooted at the whole
# repository so every .py file is picked up.
loader2_options = dict(
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)
loader2 = GenericLoader.from_filesystem(repo_path, **loader2_options)

In [8]:
# Run both loaders: first the package-only one, then the whole-repository one.
document_loader1, document_loader2 = loader.load(), loader2.load()

Document Chunking

In [9]:
# Chunking parameters, grouped so they are easy to tune in one place.
splitter_settings = {"chunk_size": 2000, "chunk_overlap": 400}

# Recursive splitter configured for Python source.
document_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    **splitter_settings,
)

In [10]:
# Split the package-only documents and the whole-repository documents, in that order.
document_chunks, document_chunks2 = (
    document_splitter.split_documents(loaded_docs)
    for loaded_docs in (document_loader1, document_loader2)
)

In [15]:
# Preview only the first chunk; displaying the whole list floods the notebook output.
document_chunks[:1]

[Document(page_content='import os\nimport zipfile\nimport gdown\nfrom src.cnnClassifier.utils.common import get_size\nfrom src.cnnClassifier.entity.config_entity import (DataIngestionConfig)\nfrom src.cnnClassifier.constants import constants\n\nfrom src.logger import logging\n\nlogger = logging.getLogger("DataIngestion")\n\nclass DataIngestion:\n    def __init__(self, config: DataIngestionConfig):\n        self.config = config\n\n    \n    def download_file(self)-> str:\n        \'\'\'\n        Fetch data from the url\n        \'\'\'\n\n        try: \n            dataset_url = self.config.source_URL\n            zip_download_dir = self.config.local_data_file\n            os.makedirs(self.config.unzip_dir, exist_ok=True)\n            logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")\n\n            file_id = dataset_url.split("/")[-2]\n            prefix = constants.GOOGLE_DRIVE_DOWNLOAD_PREFIX_URL\n            gdown.download(prefix+file_id,zip_download_dir

In [13]:
# Preview only the first chunk; displaying the whole list floods the notebook output.
document_chunks2[:1]

[Document(page_content='from flask import Flask, request, jsonify, render_template\nimport os\nfrom flask_cors import CORS, cross_origin\n\nfrom src.cnnClassifier.utils.common import decodeImage\nfrom src.cnnClassifier.pipeline.stage_04_prediction import PredictionPipeline\n\nfrom src.logger import logging\nfrom src.exception import CustomException\n\nlogger = logging.getLogger("App")\n\nos.putenv(\'LANG\', \'en_US.UTF-8\')\nos.putenv(\'LC_ALL\', \'en_US.UTF-8\')\n\napp = Flask(__name__)\nCORS(app)\n\nclass ClientApp:\n    def __init__(self):\n        self.fileName = "inputImage.jpg"\n        self.classifier = PredictionPipeline(self.fileName)\n\n\n@app.route("/", methods=[\'GET\'])\n@cross_origin()\ndef home():\n    return render_template(\'index.html\')\n\n\n@app.route("/train", methods=[\'GET\',\'POST\'])\n@cross_origin()\ndef trainRoute():\n    os.system("python main.py")\n    # os.system("dvc repro")\n    return "Training done successfully!"\n\n\n@app.route("/predict", methods=[\'

In [14]:
# Chunk counts: (package-only load, whole-repository load).
tuple(len(chunks) for chunks in (document_chunks, document_chunks2))

(20, 26)

# Loading Embedding model

In [16]:
# Embedding client handed to the vector store to embed the document chunks.
gemini_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GEMINI_API_KEY,
)

# Loading Chroma Vector DB for saving document embeddings

In [17]:
# Directory where Chroma keeps the persisted collection.
persist_directory = "./data"

if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # A persisted store already exists: reopen it. Calling from_documents again
    # on the same directory would re-embed every chunk and add the documents a
    # second time, so retrieval would start returning duplicates.
    # Delete the directory to force a rebuild after the source repo changes.
    chroma_db = Chroma(
        persist_directory=persist_directory,
        embedding_function=gemini_embeddings,
    )
else:
    # First run: embed the whole-repository chunks and write them to disk.
    chroma_db = Chroma.from_documents(
        document_chunks2,
        embedding=gemini_embeddings,
        persist_directory=persist_directory,
    )
    chroma_db.persist()

# Loading Google GEMINI LLM MODEL Wrapper

In [18]:
# Chat model settings, listed one per line so they are easy to scan and tune.
llm_settings = dict(
    model="models/gemini-1.5-pro",
    temperature=0.7,
    top_p=0.8,
    google_api_key=GEMINI_API_KEY,
    convert_system_message_to_human=True,
)
llm_model = ChatGoogleGenerativeAI(**llm_settings)

Loading memory object for LLM chatbot

In [19]:
# Summary-style conversation memory driven by the same LLM; the history is
# exposed under the "chat_history" key as message objects.
# (The variable name is spelled as-is because the chain cell refers to it.)
momory_obj = ConversationSummaryMemory(
    llm=llm_model,
    memory_key="chat_history",
    return_messages=True,
)

In [20]:
# Retriever over the vector store using MMR search with k=3.
mmr_retriever = chroma_db.as_retriever(search_type="mmr", search_kwargs={"k": 3})

# Conversational retrieval chain wiring together the LLM, retriever and memory.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm_model,
    retriever=mmr_retriever,
    memory=momory_obj,
)

# Question and Answering

In [21]:
# Ask the conversational chain a question about the indexed code.
user_query = "what is a DataIngestion class?"
response = conversation_chain.invoke(user_query)

In [22]:
# Display the result returned by the conversational chain for user_query.
response

{'question': 'what is a DataIngestion class?',
 'chat_history': [SystemMessage(content='')],
 'answer': "This code doesn't contain a `DataIngestion` class. It defines a `DataIngestionConfig` dataclass, which holds configuration information for data ingestion, but there's no class named `DataIngestion` present. \n"}

In [23]:
from langchain.prompts import PromptTemplate

llm_prompt_template = """You are an powerful assistantfor question-answering tasks You have to tell the Usr.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Use five sentences maximum and keep the answer concise.\n
Question: {question} \nContext: {context} \nAnswer:
"""

llm_prompt = PromptTemplate.from_template(llm_prompt_template)

print(llm_prompt)

input_variables=['context', 'question'] template="You are an powerful assistantfor question-answering tasks You have to tell the Usr.\nUse the following context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse five sentences maximum and keep the answer concise.\n\nQuestion: {question} \nContext: {context} \nAnswer:\n"


In [28]:
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain.schema.runnable import RunnablePassthrough

def format_docs(docs):
    """Concatenate the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined by a blank line, in input order.
        An empty iterable yields an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [29]:
# Retrieve with MMR (k=3) and flatten the retrieved documents into one context string.
context_pipeline = (
    chroma_db.as_retriever(search_type="mmr", search_kwargs={"k": 3}) | format_docs
)

# The incoming question feeds the retriever and is also passed through unchanged.
prompt_inputs = {"context": context_pipeline, "question": RunnablePassthrough()}

# Prompt -> LLM -> plain-string output.
rag_chain = prompt_inputs | llm_prompt | llm_model | StrOutputParser()

In [38]:
# Run the RAG chain on a single question; the chain ends in StrOutputParser,
# so the result is a plain string.
config_entity_question = "What does a config_entity file contains?"
response_2 = rag_chain.invoke(config_entity_question)

In [39]:
# Display the answer string produced by rag_chain.
response_2

'A config_entity file seems to define classes related to configurations, such as `DataIngestionConfig`, `PrepareBaseModelConfig`, and `TrainingConfig`. These classes likely hold parameters and settings for different stages of a machine learning pipeline.  The provided code snippet shows how instances of these classes are created and populated with data from YAML configuration files.  However, the exact contents of a config_entity file are not shown in the provided code. \n'