In [1]:
import getpass
import os

In [2]:
#To import Github repository 
from git import Repo 
from langchain.text_splitter import Language #To understand the code base Languade
from langchain.document_loaders.generic import GenericLoader # To load the code base
from langchain.document_loaders.parsers import LanguageParser  # Parses the Code Language from the Github repository
from langchain.text_splitter import RecursiveCharacterTextSplitter 
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma
# from langchain.chat_models import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain
from langchain import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser

  from .autonotebook import tqdm as notebook_tqdm


### Clone the GitHub repository

In [3]:
# Show the kernel's current working directory; the relative paths used in the
# following cells ("test_repo/", "./data") are resolved against it.
%pwd

'c:\\Users\\Dell\\Desktop\\Gen AI\\Source-Code-Analysis-Using-GenAI\\research'

In [28]:
# Create the clone target directory in pure Python. Unlike the shell `mkdir`,
# this works the same on every platform and does not fail when the cell is
# re-run and the directory already exists.
os.makedirs("test_repo", exist_ok=True)

In [31]:
repo_path = "test_repo/"
# Clone the repository into the test directory, but only once: cloning into a
# directory that already holds a checkout fails, so on a re-run the existing
# working copy is opened instead of being cloned again.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(
        "https://github.com/HarshalGidh/Student-Performance-Prediction",
        to_path=repo_path,
    )
# Leave the Repo object as the cell's last expression so it is displayed.
repo

<git.repo.base.Repo 'c:\\Users\\Dell\\Desktop\\Gen AI\\Source-Code-Analysis-Using-GenAI\\research\\test_repo\\.git'>

In [33]:
# Build a loader over the cloned repository's `src` directory.
repo_path = "test_repo/"
# Walk the directory recursively, keep only `.py` files, and hand them to the
# Python-aware language parser.
loader = GenericLoader.from_filesystem(
    f"{repo_path}src",
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)

In [34]:
# Read and parse every matching source file into a list of Document objects.
documents = loader.load()

In [35]:
# Preview only the first two documents; displaying the whole list writes the
# full text of every loaded source file into the notebook output.
documents[:2]

[Document(page_content='import sys\nfrom src.logger import logging\n\ndef error_message_detail(error,error_detail:sys):\n    _,_,exc_tb = error_detail.exc_info()\n    file_name=exc_tb.tb_frame.f_code.co_filename\n    error_message="Error Occured in python script name [{0}] line number [{1}] error message [{2}] ".format(\n        file_name,exc_tb.tb_lineno,str(error)\n    )\n    return error_message\n\nclass CustomException(Exception):\n    def __init__(self,error_message,error_detail:sys):\n        super().__init__(error_message)\n        self.error_message=error_message_detail(error_message,error_detail=error_detail)\n\n    def __str__(self):\n        return self.error_message \n    \n', metadata={'source': 'test_repo\\src\\exception.py', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='import logging\nimport os\nfrom datetime import datetime\n\nLOG_FILE = f"{datetime.now().strftime(\'%m_%d_%Y_%H_%M_%S\')}.log"\nlogs_path= os.path.join(os.getcwd(),"logs",LOG_FILE)\no

### Chunking

In [36]:
# Context-aware splitting: the splitter is configured for Python source, with
# chunks of up to 2000 characters that overlap by 200.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=2000,
    chunk_overlap=200,
)

In [37]:
# Break the loaded documents into chunks with the language-aware splitter.
texts = documents_splitter.split_documents(documents)

In [38]:
# Number of chunks produced by the split.
len(texts)

15

### Embedding model

In [39]:
# Never store the API key in the notebook. Use the GOOGLE_API_KEY already
# present in the environment; if it is missing, prompt for it without echoing
# the value so that it never ends up in the saved file or its outputs.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

In [43]:
# Take the key from the environment instead of embedding a secret in the
# notebook. The value is passed explicitly to the Gemini embedding and chat
# models created in later cells; it is None when the variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [41]:
# Gemini embedding model used to vectorise the code chunks. Only the model
# name and API key are passed: OpenAI-specific options such as
# `disallowed_special` are not settings of this class.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY,
)

### Knowledge base (vector DB)

In [42]:
# Embed every chunk and store the vectors in a Chroma collection on disk.
vectordb = Chroma.from_documents(
    texts,                       # chunks to index
    embedding=embeddings,        # embedding model
    persist_directory="./data",  # directory to store data
)
# Write the collection to the persist directory.
vectordb.persist()

In [66]:
# Create a retriever over the vector store, using its default search settings.
retriever = vectordb.as_retriever() 
# Smoke test: print how many documents come back for a sample query. A count
# greater than zero means the retriever is returning results.
print(len(retriever.get_relevant_documents("data ingestion")))

### LLM Wrapper

In [44]:
# Gemini chat model used to generate the answers. Sampling is controlled by
# the temperature and top_p values below.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0.7,
    top_p=0.85,
    google_api_key=GOOGLE_API_KEY,
)

In [45]:
# Conversation memory backed by the same LLM; its contents are exposed to the
# chain under the "chat_history" key as message objects.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

In [46]:
# Conversational retrieval chain: answers with the LLM, retrieving with MMR
# search (k=3) over the vector store and carrying the memory configured above.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 3}),
    memory=memory,
)

### Q&A

In [47]:
# Sample question about the indexed code base; it is the input passed to the
# RAG chain in the final cell.
question = "what is DataIngestion class?"

In [49]:
# Prompt template used to query Gemini. It has two placeholders, {question}
# and {context}, which are filled in when the prompt is formatted.
llm_prompt_template = """You are an assistant for question-answering tasks.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Use five sentences maximum and keep the answer concise.\n
Question: {question} \nContext: {context} \nAnswer:"""

# Build the PromptTemplate; its input variables are taken from the
# placeholders in the template string.
llm_prompt = PromptTemplate.from_template(llm_prompt_template)

# Print the template object to check the detected input variables and text.
print(llm_prompt)

input_variables=['context', 'question'] template="You are an assistant for question-answering tasks.\nUse the following context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse five sentences maximum and keep the answer concise.\n\nQuestion: {question} \nContext: {context} \nAnswer:"


In [67]:

# Turn retrieved documents into one readable block of text.
def format_docs(docs):
    """Join the `page_content` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string with the page contents in order, joined by two
        newlines; an empty string when `docs` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# Retrieval-augmented generation pipeline. The steps run left to right, each
# one receiving the previous step's output.
rag_chain = (
    # Build the prompt inputs from the incoming question: "context" is the
    # retriever's documents joined by format_docs, "question" is the input
    # passed through unchanged.
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    # Fill the {question} and {context} placeholders of the prompt template.
    | llm_prompt
    # Send the formatted prompt to the chat model.
    | llm
    # Convert the model's response into a plain string.
    | StrOutputParser()
)

In [70]:
# Run the RAG chain on the sample question; the result is the answer string.
rag_chain.invoke(question)

'The DataIngestion class is responsible for reading and splitting the raw data into training and testing sets. It also saves the training and testing sets to the specified file paths.\n\nThe class has an __init__ method that initializes the ingestion configuration and an initiate_data_ingestion method that reads the raw data, splits it into training and testing sets, and saves the sets to the specified file paths.\n\nThe initiate_data_ingestion method first reads the raw data into a DataFrame. It then creates the directories for the training and testing data if they do not already exist. The DataFrame is then saved to the raw data file path.\n\nThe DataFrame is then split into training and testing sets using the train_test_split function from the sklearn.model_selection module. The training set is saved to the training data file path and the testing set is saved to the testing data file path.\n\nThe initiate_data_ingestion method returns the file paths of the training and testing data 