In [1]:
# installing dependencies
!pip install langchain-google-genai langchain_community autopep8 gitpython

Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.11-py3-none-any.whl.metadata (3.6 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting autopep8
  Downloading autopep8-2.3.2-py2.py3-none-any.whl.metadata (16 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.16 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.16-py3-none-any.whl.metadata (5.7 kB)
Collecting langchain-core<0.4.0,>=0.3.37 (from langchain-google-genai)
  Downloading langchain_core-0.3.43-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.20 (from langchain_community)
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl

In [2]:
# Import necessary modules with error handling
try:
    from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
    from langchain_community.document_loaders import WebBaseLoader
    from langchain.chains import StuffDocumentsChain
    from langchain.chains.llm import LLMChain
    from langchain.prompts import PromptTemplate
    import google.generativeai as genai
    import os
    from google.colab import userdata
    import git
    import glob
    import subprocess
    import autopep8  # For Python code formatting
    import shutil
except ModuleNotFoundError as e:
    print(f"Module not found: {e}")
    # Install missing modules if necessary
    !pip install langchain-google-genai langchain-community

# Configure API key and initialize model
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = userdata.get('api_key')

genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Function to initialize model
def initialize_model():
    try:
        llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")
        return llm
    except ValueError as e:
        print(f"Error: {e}")
        print("Switching to alternative model.")
        # Check if alternative model is available, otherwise handle accordingly
        try:
            from langchain_alternative_module import ChatAlternativeModel  # Placeholder name
            llm = ChatAlternativeModel(model="alternative-model")  # Specify the correct model name if needed
            return llm
        except ModuleNotFoundError:
            print("Alternative model module not found.")
            return None

# Initialize the model
llm = initialize_model()



In [3]:
# Clone the repository (or use a local directory)
REPO_URL = "https://github.com/Hossain-Shah/Robi_Datathon_0100_pandas.git"
LOCAL_REPO_PATH = "/content/drive/MyDrive/Colab_Notebook/Robi_Datathon_0100_pandas"

def clone_repo(repo_url, local_path):
    """Clones a GitHub repository if not already cloned."""
    if not os.path.exists(local_path):
        print(f"Cloning repository from {repo_url}...")
        git.Repo.clone_from(repo_url, local_path)
    else:
        print("Repository already cloned.")

def read_code_files(repo_path, extensions=[".py", ".ipynb", ".js", ".java", ".cpp", ".ts"]):
    """Reads all code files from the repository."""
    files = []
    for ext in extensions:
        files.extend(glob.glob(f"{repo_path}/**/*{ext}", recursive=True))

    code_contents = {}
    for file in files:
        with open(file, "r", encoding="utf-8", errors="ignore") as f:
            code_contents[file] = f.read()
    return code_contents

def summarize_code(file_path, code):
    """Summarizes code using Google-gemini model."""
    prompt_template = PromptTemplate.from_template("Summarize the following code from {file_path}:\n\n{code}\n\nSummary:")
    prompt = prompt_template.format(file_path=file_path, code=code)
    llm_chain = LLMChain(llm=llm, prompt=prompt_template)
    stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="code")
    response = llm_chain.run({"file_path": file_path, "code": code})
    return response

def analyze_repository():
    """Main function to analyze a repository."""
    clone_repo(REPO_URL, LOCAL_REPO_PATH)
    code_files = read_code_files(LOCAL_REPO_PATH)

    repo_summary = []
    for file, code in code_files.items():
        print(f"Analyzing {file}...")
        summary = summarize_code(file, code[:2000])  # Limit input size
        repo_summary.append(f"📄 **{file}**:\n{summary}\n")

    # Generate final repository summary
    final_summary_template = PromptTemplate.from_template("Provide an overview of the repository structure based on these file summaries:\n\n{repo_summaries}")
    final_prompt = final_summary_template.format(repo_summaries="\n".join(repo_summary))
    llm_chain = LLMChain(llm=llm, prompt=final_summary_template)
    final_summary = llm_chain.run({"repo_summaries": "\n".join(repo_summary)})

    # Print and save the summary
    print("\n📝 Repository Summary:\n", final_summary)
    with open("/content/drive/MyDrive/Colab_Notebooks/repo_summary.txt", "w") as f:
        f.write(final_summary)

# Run the analysis
if __name__ == "__main__":
    analyze_repository()

Repository already cloned.
Analyzing /content/drive/MyDrive/Colab_Notebook/Robi_Datathon_0100_pandas/shahnawaz/utils/Robi_Datathon_problems_solution.ipynb...


  llm_chain = LLMChain(llm=llm, prompt=prompt_template)
  stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="code")
  response = llm_chain.run({"file_path": file_path, "code": code})



📝 Repository Summary:
 The repository structure is relatively simple, containing a Jupyter Notebook within a nested directory structure reflecting its location on Google Drive.  All the data files used by the notebook are assumed to be in the same directory as the notebook itself.

```
Colab_Notebook/
└── Robi_Datathon_0100_pandas/
    └── shahnawaz/
        └── utils/
            └── Robi_Datathon_problems_solution.ipynb
            └── purchase.csv  (Assumed location)
            └── boxes.csv      (Assumed location)
            └── problem1.csv   (Assumed location)
            └── submission_1.csv (Output file generated by the notebook)
            └── ... (Other data files for Problem 2 - names unknown)

```

The notebook `Robi_Datathon_problems_solution.ipynb` is the core component, containing the Python code for solving the data manipulation and prediction tasks.  It resides within a directory structure likely created through Google Colab and the user's Google Drive organization

In [None]:
# Q&A System for Code Repository

def answer_question_about_code(question, code_files):
    """Answers questions based on the code repository."""
    relevant_code = "\n".join([f"File: {file}\n{code[:2000]}" for file, code in code_files.items()])  # Limit input size
    prompt_template = PromptTemplate.from_template("""
    You are an AI assistant analyzing a code repository. Answer the following question based on the provided code snippets:

    Question: {question}

    Code Repository Snippets:
    {relevant_code}

    Answer:
    """)

    llm_chain = LLMChain(llm=llm, prompt=prompt_template)
    response = llm_chain.run({"question": question, "relevant_code": relevant_code})
    return response


def analyze_code_best_practices(code_files):
    """Analyzes best practices, issues, and warnings in the repository."""
    repo_analysis = []
    for file, code in code_files.items():
        print(f"Analyzing best practices for {file}...")
        prompt_template = PromptTemplate.from_template("""
        Review the following code snippet from {file} and provide best practices, potential issues, and warnings:

        Code:
        {code}

        Best Practices:
        -

        Issues & Warnings:
        -
        """)

        llm_chain = LLMChain(llm=llm, prompt=prompt_template)
        response = llm_chain.run({"file": file, "code": code[:2000]})  # Limit input size
        repo_analysis.append(f"""📄 **{file}**:
{response}
""")

    return "\n".join(repo_analysis)


def interactive_qa_system():
    """Runs an interactive Q&A system for the repository."""
    code_files = read_code_files(LOCAL_REPO_PATH)
    print("\n📢 Repository Q&A System Initialized! Type 'exit' to quit.\n")
    while True:
        question = input("Ask a question about the code repository: ")
        if question.lower() == 'exit':
            break
        response = answer_question_about_code(question, code_files)
        print("\n🤖 AI Response:\n", response, "\n")

# Run best practices analysis
if __name__ == "__main__":
    code_files = read_code_files(LOCAL_REPO_PATH)
    best_practices_report = analyze_code_best_practices(code_files)
    print("\n🛠 Best Practices & Issues Report:\n", best_practices_report)

    # Save report
    with open("/content/drive/MyDrive/Colab_Notebooks/best_practices_report.txt", "w") as f:
        f.write(best_practices_report)

    # Start Q&A System
    interactive_qa_system()

Analyzing best practices for /content/drive/MyDrive/Colab_Notebook/Robi_Datathon_0100_pandas/shahnawaz/utils/Robi_Datathon_problems_solution.ipynb...

🛠 Best Practices & Issues Report:
 📄 **/content/drive/MyDrive/Colab_Notebook/Robi_Datathon_0100_pandas/shahnawaz/utils/Robi_Datathon_problems_solution.ipynb**:
```python
#Problem 1
import pandas as pd

# Load datasets using pathlib for more robust path handling
from pathlib import Path
data_dir = Path("/content/drive/MyDrive/Colab Notebooks/")  # Define data directory

try:
    purchase_data = pd.read_csv(data_dir / "purchase.csv")
    boxes_data = pd.read_csv(data_dir / "boxes.csv")
    problem_data = pd.read_csv(data_dir / "problem1.csv")
except FileNotFoundError:
    print("Error: One or more data files not found. Check the path.")
    # Handle the error appropriately, e.g., exit the script


# Merge purchase and boxes datasets using a more informative merge method
merged_data = pd.merge(purchase_data, boxes_data, on="BOX_ID", how="le