In [3]:
from git import Repo
import os

from dotenv import load_dotenv

from langchain.document_loaders.generic import GenericLoader
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.document_loaders.parsers import LanguageParser
from langchain.vectorstores import Chroma
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

import warnings
warnings.filterwarnings("ignore")

load_dotenv()

GEMINI_API_KEY = "AIzaSyDlHdr-HiAALsowWrprOm4ofc3QYFZZ7-8"

In [4]:
os.getenv("GEMINI_API_KEY")

'AIzaSyDlHdr-HiAALsowWrprOm4ofc3QYFZZ7-8'

In [5]:
import google.generativeai as genai

genai.configure(api_key=GEMINI_API_KEY)

for model in genai.list_models():
    print(model.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision
models/embedding-001
models/text-embedding-004
models/aqa


Cloning a GIT Repo

In [6]:
# Making a test folder for saving git repo for testing
!mkdir test_repo

# Cloning a repo 
Repo.clone_from("https://github.com/Manav446/Sport_Classification_Using_Images_Project.git", to_path="test_repo/")

A subdirectory or file test_repo already exists.


GitCommandError: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/Manav446/Sport_Classification_Using_Images_Project.git test_repo/
  stderr: 'fatal: destination path 'test_repo' already exists and is not an empty directory.
'

In [9]:
repo_path = "test_repo/"
loader = GenericLoader.from_filesystem(
    repo_path + "src/cnnClassifier", 
    glob="**/*", suffixes=[".py"], 
    parser=LanguageParser(
        language=Language.PYTHON, 
        parser_threshold=500
    )
)

In [10]:
document_loader = loader.load()

Document Chunking

In [11]:
document_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size = 2000,
    chunk_overlap = 400
)

In [12]:
document_chunks = document_splitter.split_documents(document_loader)

# Loading Embedding model

In [13]:
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GEMINI_API_KEY)

# Loading Chroma Vector DB for saving document embeddings

In [14]:
chroma_db = Chroma.from_documents(
    document_chunks, 
    embedding=gemini_embeddings, 
    persist_directory="./data"
)

chroma_db.persist()

# Loading Google GEMINI LLM MODEL Wrapper

In [15]:
llm_model = ChatGoogleGenerativeAI(model = "models/gemini-1.5-pro", temperature=0.7, top_p=0.8, google_api_key=GEMINI_API_KEY, convert_system_message_to_human=True)

Loading memory object for LLM chatbot

In [16]:
momory_obj = ConversationSummaryMemory(llm=llm_model, 
                                       memory_key="chat_history", 
                                       return_messages=True)

In [17]:
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm_model, 
    retriever=chroma_db.as_retriever(
        search_type="mmr", search_kwargs={"k": 3}
        ), 
    memory=momory_obj
)

# Question and Anwering

In [21]:
user_query = "what is a common class?"

response = conversation_chain.invoke(user_query)

In [22]:
response

{'question': 'what is a common class?',
 'chat_history': [SystemMessage(content='New summary:\nThe human asked about a "DataIngestion" class. The AI clarified that the provided context only defines a "DataIngestionConfig" dataclass and a "ConfigurationManager" class, not a "DataIngestion" class. \n')],
 'answer': 'This code doesn\'t provide a direct answer to what a "common class" is in this specific context. It\'s possible that "common" refers to:\n\n* **A module named "common":**  The code imports `from src.cnnClassifier.utils.common import *`. This suggests there\'s a Python module (`common.py`) containing utility functions or classes used throughout the project.  Without seeing the contents of `common.py`, we can\'t be sure what\'s inside.\n* **A base class (not shown):** It\'s possible there\'s a base class (e.g., `CommonConfig`) that these data classes inherit from, but it\'s not defined in the provided code snippet.\n\n**To get a definitive answer, you\'d need to examine the con

In [45]:
from langchain.prompts import PromptTemplate

llm_prompt_template = """You are an powerful assistantfor question-answering tasks You have to tell the Usr.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Use five sentences maximum and keep the answer concise.\n
Question: {question} \nContext: {context} \nAnswer:
"""

llm_prompt = PromptTemplate.from_template(llm_prompt_template)

print(llm_prompt)

input_variables=['context', 'question'] template="You are an powerful assistantfor question-answering tasks.\nUse the following context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse five sentences maximum and keep the answer concise.\n\nQuestion: {question} \nContext: {context} \nAnswer:\n"


In [46]:
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain.schema.runnable import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in document_chunks)

In [47]:
rag_chain = (
    {"context": chroma_db.as_retriever(
        search_type="mmr", search_kwargs={"k": 3}
    ) | format_docs, "question": RunnablePassthrough() }
    | llm_prompt 
    | llm_model
    | StrOutputParser()
)

In [48]:
response_2 = rag_chain.invoke(user_query)

In [49]:
response_2

'The DataIngestion class is a Python class designed for downloading and preparing a dataset. It fetches data from a specified URL, typically a zip file.  After downloading, it extracts the contents of the zip file into a designated directory. Finally, it removes the original downloaded zip file to keep only the extracted data. In essence, it automates the process of obtaining and readying a dataset from a remote source. \n'

In [20]:
def checking():
    try:
        a = 1+2
    except Exception as e:
        raise Exception(e)
    return a

checking()

3