In [None]:
https://github.com/HarikishanReddy2004/GenAI_teams_summarizer

In [1]:
#clone repo
from git import Repo
import os
def clone_repo(url,destination="./cloned_repo"):
    if os.path.exists(destination):
        os.system(f"rm -rf {destination}")
    Repo.clone_from(url,destination)
    return destination

clone_repo("https://github.com/HarikishanReddy2004/GenAI_teams_summarizer")

'./cloned_repo'

In [6]:
#gives full list of code files in the folder
def collect_code_files(folder_path,extensions=[".py",".js",".ts"]):
    code_files=[]
    for root,dirs,files in os.walk(folder_path):
        for file in files:
            if any(file.endswith(ext) for ext in extensions):
                code_files.append(os.path.join(root,file))
    return code_files
file_paths=collect_code_files("./cloned_repo")

In [8]:
#filter files based on size 
#You don’t want to feed all files to model — some are config files, others are trivial.  filtered based on filename and size.
def filter_files(file_paths,exclude_keywords=["test","config","setup"],max_size_kb=100):
    filtered_files=[]
    for file in file_paths:
        if any(keyword in file.lower() for keyword in exclude_keywords):
            continue
        if os.path.getsize(file)/1024 > max_size_kb:
            continue
        filtered_files.append(file)
    return filtered_files
filtered_paths=filter_files(file_paths)

In [9]:
#read code from each file and store in dict
def read_code_from_files(file_paths):
    content={}
    for path in file_paths:
        with open(path,"r",encoding="utf-8",errors="ignore") as f:
            content[path]=f.read()
    return content
read_code_from_files(filtered_paths)

{'./cloned_repo\\main.py': 'import os,re\nfrom datetime import datetime\nfrom config import BOARD_ID\nfrom jira_client import jira_client\nfrom transcript_processor import transcript_processor\nfrom summarizer import summarizer\n\ndef match_story_ref_to_key(ref, story_keys):\n    for key in story_keys:\n        if ref in key:\n            return key\n    return None\n\ndef replace_summary_with_person(summary, person_name):\n    return summary.replace("**Summary:**", f"{person_name}:")\n\n\ndef main():\n    with open("D:\\\\teams_genai\\\\venv\\\\conversation.txt", "r") as f:\n        conversation_text = f.read()\n    print(BOARD_ID)\n    print(conversation_text)\n    sprint_id = jira_client.get_active_sprint(BOARD_ID)\n    print(sprint_id)\n    if not sprint_id:\n        print("No active sprint found.")\n        return\n\n    story_map = jira_client.get_issues_in_sprint(sprint_id)\n    story_keys = list(story_map.keys())\n    story_values = list(story_map.values())\n    print(f"Fetched

In [48]:
# Tree Hierarchical File Chunking tree->folder->file->chunks
# Smallest unit = chunk of code (usually a function, class, or block of lines).
class CodeChunk:
    def __init__(self,text,start_line,end_line,summary=""):
        self.text=text  #code text
        self.start_line=start_line
        self.end_line=end_line
        self.summary=summary #summary of the chunk

        def to_dict(self):
            return {
                "start_line": self.start_line,
                "end_line": self.end_line,
                "summary": self.summary,
                "content": self.content
            }



In [56]:
class FileNode:
    def __init__(self,path):
        self.path=path #file path
        self.name = os.path.basename(path)  #file name
        self.chunks=[]  #list of CodeChunk
        self.summary=None #summary of the file

    def to_dict(self):
        return {
            "name": self.name,
            "path": self.path,
            "summary": self.summary,
            "chunks": [chunk.to_dict() for chunk in self.chunks]
        }

In [57]:
class FolderNode:
    def __init__(self,name):
        self.name=name
        self.files={}   #child files
        self.folders={}  #child folders
        self.summary="" #summary of the folder

    def add_file(self, file_node: FileNode):
        self.files[file_node.name] = file_node

    def add_folder(self, folder_node: "FolderNode"):
        self.folders[folder_node.name] = folder_node
        
    def to_dict(self):
        return {
            "name": self.name,
            "summary": self.summary,
            "files": {k: v.to_dict() for k, v in self.files.items()},
            "folders": {k: v.to_dict() for k, v in self.folders.items()},
        }


In [58]:
def chunk_python_code(content: str, max_lines=200):
    chunks = []
    try:
        tree = ast.parse(content)   # ✅ parse Python code into AST
    except Exception:
        return [CodeChunk(content, 1, len(content.splitlines()))]

    for node in ast.walk(tree):     # ✅ walk all nodes in AST
        if isinstance(node, (ast.FunctionDef, ast.ClassDef)):   # ✅ detect function or class
            start_line = node.lineno
            end_line = getattr(node, "end_lineno", start_line + 1)

            lines = content.splitlines()[start_line - 1:end_line]   # ✅ extract code text
            text = "\n".join(lines)

            summary = f"{'Class' if isinstance(node, ast.ClassDef) else 'Function'} `{node.name}`"

            chunks.append(CodeChunk(text, start_line, end_line, summary))

    if not chunks:  # fallback to line-based chunking
        lines = content.splitlines()
        for i in range(0, len(lines), max_lines):
            block = "\n".join(lines[i:i+max_lines])
            chunks.append(CodeChunk(block, i+1, min(i+max_lines, len(lines))))
    return chunks


In [59]:
# ========= 4. TREE BUILDER =========
def build_repo_tree(repo_dict):
    root = FolderNode("root")
    for path, content in repo_dict.items():
        parts = path.split(os.sep)
        curr = root

        # Create nested folder nodes
        for p in parts[:-1]:
            if p not in curr.folders:
                curr.folders[p] = FolderNode(p)
            curr = curr.folders[p]

        # Create file node
        file_node = FileNode(path)
        if path.endswith(".py"):
            file_node.chunks = chunk_python_code(content)
        else:
            file_node.chunks = chunk_other_code(content)

        curr.files[path] = file_node

    return root


In [60]:
# ========= 5. LLM SUMMARIZER =========
def llm_summary(text: str, level="chunk") -> str:
    prompt = f"Summarize this {level} of code in one short, clear sentence:\n\n{text[:1500]}"
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=80,
        )
        return resp.choices[0].message.content.strip()
    except Exception as e:
        return f"(LLM summary failed: {e})"


In [61]:
# ========= 6. HIERARCHICAL SUMMARIZATION =========
def summarize_tree(node: FolderNode):
    # Summarize files
    for f in node.files.values():
        for c in f.chunks:
            if not c.summary or c.summary.startswith(("Class", "Function", "Code block")):
                c.summary = llm_summary(c.text, level="chunk")
        f.summary = llm_summary(" ".join([c.summary for c in f.chunks]), level="file")

    # Recurse into subfolders
    for folder in node.folders.values():
        summarize_tree(folder)

    # Summarize folder itself
    child_summaries = [f.summary for f in node.files.values()] + [fld.summary for fld in node.folders.values()]
    node.summary = llm_summary(" ".join(child_summaries), level="folder")


In [62]:
import json
# ========= 7. EXTENDED PIPELINE =========
if __name__ == "__main__":
    # Step 1: Read repo content
    repo_content = read_code_from_files(filtered_paths)

    # Step 2: Build tree
    repo_tree = build_repo_tree(repo_content)

    # Step 3: Generate hierarchical summaries
    summarize_tree(repo_tree)

    # Step 4: Save tree to JSON
    with open("repo_tree.json", "w", encoding="utf-8") as f:
        json.dump(repo_tree.to_dict(), f, indent=2, ensure_ascii=False)

    print("Repo tree with LLM summaries written to repo_tree.json ✅")


AttributeError: 'CodeChunk' object has no attribute 'to_dict'