In [None]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys
from typing import List, Tuple

sys.path.append("../../../")
from src.cluster.cluster import (
    generate_full_code_clusters, 
    generate_summarized_clusters,
    generate_graph_clusters,
    generate_random_clusters
)


from src.cluster.types import (
    CodeChunk,
    SummaryChunk,
    ClusterInput,
    ClusteredTopic,
    ClusterInputType,
    LMClusteredTopicList
)

from src.cluster.chunk_repo import ChunkStrat

# Repository to evaluate and the location of its checkout, relative to the notebook.
# NOTE(review): sys.path above is extended with "../../../" while this path
# climbs only two levels - confirm both resolve as intended from the notebook's
# working directory.
repo_name = "ell"
repo_path = Path("../../src/cluster/repos", repo_name)


# TODO:
# Auto-searching for n and k:
# Goal: automatically find the values of n and k, given that the
# cohere_score evaluation function will probably be changed in the future.
#
# Add a function that automatically recalculates n and k in:
#   n * cohere_score
#   num_files ** k / num_clusters  (for cross_file)
# This should be a simple search over increasing values of k across a range
# R (of length ... 7?) such that the following condition holds:
#   For the first half (1, R/2]:
#     the ranking of the scores is (Fullcode, Cgraph, ..., Random)
#   For the second half (R/2 + 1, R):
#     the ranking of the scores is (Cgraph, Fullcode, ..., Random)
#
# Then, once the lowest value of n and a range R satisfying the above have
# been found, take k to be R / 2 and call it a day.


In [3]:
import json
from src.config import GRAPH_ROOT, REPOS_ROOT
from rtfs.chunk_resolution.chunk_graph import ChunkGraph
from rtfs.transforms.cluster import cluster

def generate_cgraph_clusters() -> List[ClusteredTopic]:
    """Build one ClusteredTopic per cluster found in the stored ell chunk graph.

    Loads ``GRAPH_ROOT / "MadcowD_ell_standard.json"``, rebuilds a ChunkGraph
    rooted at ``REPOS_ROOT / "MadcowD_ell"``, runs ``cluster`` on it and wraps
    every resulting cluster in a ClusteredTopic.

    Returns:
        A list of topics, all named "Graph Cluster", whose chunks are
        CodeChunk objects converted with ``.dict()``.
    """
    graph_json_path = GRAPH_ROOT / "MadcowD_ell_standard.json"
    # Context manager so the file handle is closed even if parsing fails.
    with open(graph_json_path, "r") as graph_file:
        ell_json = json.load(graph_file)
    cg = ChunkGraph.from_json(REPOS_ROOT / "MadcowD_ell", ell_json)

    # The return value is discarded, so this presumably annotates ``cg`` in
    # place - TODO confirm against rtfs.transforms.cluster.
    cluster(cg)

    # The loop variable is deliberately not called ``cluster`` so it cannot be
    # confused with the imported clustering function used above.
    return [
        ClusteredTopic(
            name="Graph Cluster",
            chunks=[
                CodeChunk(
                    id=chunk.og_id,
                    content=chunk.content,
                    filepath=chunk.file_path,
                    input_type=ClusterInputType.CHUNK,
                ).dict()
                for chunk in graph_cluster.chunks
            ],
        )
        for graph_cluster in cg.get_clusters()
    ]

In [4]:
# Generate clusters
# One clustering per strategy, all for the same repo_path, so the later cells
# can score them side by side. The calls stay in this order because each one
# logs progress as it runs.
full_code_clusters = generate_full_code_clusters(repo_path)
summary_clusters = generate_summarized_clusters(repo_path)
graph_clusters = generate_graph_clusters(repo_path)
# Notebook-local helper: takes no repo_path and reads a stored graph JSON instead.
cgraph_clusters = generate_cgraph_clusters()
# Random clustering, requested with a fixed number of clusters.
random_clusters = generate_random_clusters(repo_path, num_clusters = 10)
# Same summarized generator as summary_clusters, but with the RANDOM chunk strategy.
rsummarized_clusters = generate_summarized_clusters(repo_path, chunk_strat=ChunkStrat.RANDOM)


Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\ell
[Chunker]: 212 chunks used
Unclassified chunks, iter:[1]:  65
Unclassified chunks, iter:[2]:  2
Unclassified chunks, iter:[3]:  0
Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\ell
[Chunker]: 212 chunks used
[Summarize Chunk] Chunk too long: 8743, continuing...
Summary tokens: 8604,           Code tokens: 60918,           Ratio: 0.14123904264749335
Unclassified chunks, iter:[1]:  147
Unclassified chunks, iter:[2]:  64
Unclassified chunks, iter:[3]:  2
Unclassified chunks, iter:[4]:  0
Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\ell
[Chunker]: 212 chunks used
Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\ell
[Chunker]: 212 chunks used


: 

In [5]:
# Report how many clusters each strategy produced, one line per strategy.
print(
    f"Full code clusters: {len(full_code_clusters)}",
    f"Summary clusters: {len(summary_clusters)}",
    f"Random Summary clusters: {len(rsummarized_clusters)}",
    f"Graph clusters: {len(graph_clusters)}",
    f"CGraph clusters: {len(cgraph_clusters)}",
    sep="\n",
)

Full code clusters: 45
Summary clusters: 52
Graph clusters: 10
CGraph clusters: 16


In [None]:
################### EVALS ###################

In [6]:
from src.llm.evals.eval_cluster import eval_coherence_clusters

# Passed as the second positional argument to every evaluation call below
# (presumably the number of evaluation iterations - TODO confirm).
iters = 3

# One evaluation per clustering strategy. Each call passes the same string as
# the third positional argument and as ``subdir``. The assignments are kept
# sequential so that results already computed survive if a later call fails.
# NOTE(review): rsummarized_clusters is generated earlier but not evaluated here.
full_code_coherence = eval_coherence_clusters(full_code_clusters, iters, "full_code", repo_name, subdir="full_code")
graph_coherence = eval_coherence_clusters(graph_clusters, iters, "graph", repo_name, subdir="graph")
cgraph_coherence = eval_coherence_clusters(cgraph_clusters, iters, "cgraph", repo_name, subdir="cgraph")
random_coherence = eval_coherence_clusters(random_clusters, iters, "random", repo_name, subdir="random")
summary_coherence = eval_coherence_clusters(summary_clusters, iters, "summary", repo_name, subdir="summary")


Creating new instance for c:\Users\jpeng\Documents\projects\codesearch-backend\src\llm\evals\../../..\src\llm\evals\eval_cluster.py


In [None]:
from src.llm.evals.eval_cluster import eval_coherence_clusters


In [7]:
# Report the coherence score obtained for each clustering strategy.
for caption, coherence in (
    ("Full code coherence: ", full_code_coherence),
    ("Graph coherence: ", graph_coherence),
    ("Cgraph coherence: ", cgraph_coherence),
    ("Random coherence: ", random_coherence),
    ("Summary coherence: ", summary_coherence),
):
    print(caption, coherence)

Full code coherence:  4.395833333333333
Graph coherence:  4.2
Cgraph coherence:  3.9393939393939394
Random coherence:  3.033333333333333
Summary coherence:  4.344086021505376


In [8]:
def eval_cross_file_single(cluster: ClusteredTopic, f_const: float = 2.0) -> Tuple[int, int, float]:
    """Score how widely a single cluster spreads across files.

    The score is ``num_files ** f_const / num_chunks``, where ``num_files`` is
    the number of distinct non-None ``filepath`` values among the cluster's
    chunks and ``num_chunks`` is the total number of chunks.

    Args:
        cluster: Topic whose ``chunks`` each expose a ``filepath`` attribute.
        f_const: Exponent applied to the file count.

    Returns:
        ``(num_files, num_chunks, score)``. A cluster without chunks yields a
        score of ``0.0`` in the same tuple shape, so callers can always index
        or unpack the result.
    """
    # Distinct files touched by the cluster; chunks without a path are ignored.
    unique_files = {
        chunk.filepath for chunk in cluster.chunks if chunk.filepath is not None
    }
    num_files = len(unique_files)
    num_chunks = len(cluster.chunks)

    # Avoid division by zero while keeping the return shape consistent.
    if num_chunks == 0:
        return num_files, num_chunks, 0.0

    score = num_files ** f_const / num_chunks
    return num_files, num_chunks, score
    

def eval_cross_file_cluster(clusters: List[ClusteredTopic], f_const: float = 2.0, min_chunks: int = 3) -> float:
    """Average the cross-file score over all sufficiently large clusters.

    Args:
        clusters: Topics to score.
        f_const: Exponent forwarded to ``eval_cross_file_single``.
        min_chunks: Clusters with fewer chunks than this are left out.

    Returns:
        The mean of the per-cluster scores, or ``0.0`` when no cluster
        reaches ``min_chunks``.
    """
    scores = []
    for topic in clusters:
        # Skip clusters that are too small to be scored.
        if len(topic.chunks) < min_chunks:
            continue
        # Index 2 of the helper's result is the score itself.
        scores.append(eval_cross_file_single(topic, f_const=f_const)[2])

    if not scores:
        return 0.0
    return sum(scores) / len(scores)


In [10]:
# Rank the clustering strategies by a weighted sum of both metrics:
#   COHERENCE_WEIGHT * coherence + cross-file score
# and repeat the ranking for several cross-file exponents.

# Weight applied to the coherence score before the cross-file score is added.
COHERENCE_WEIGHT = 1.6

f_const_vals = [1.1, 1.2, 1.3, 1.4]
cohere_scores = [full_code_coherence, graph_coherence, cgraph_coherence, random_coherence, summary_coherence]
clusters = [full_code_clusters, graph_clusters, cgraph_clusters, random_clusters, summary_clusters]
labels = ["Full Code", "Graph", "Cgraph", "Random", "Summary"]

for f_const in f_const_vals:
    cross_file_scores = [
        eval_cross_file_cluster(cluster_set, f_const=f_const) for cluster_set in clusters
    ]
    final_eval = sorted(
        (
            (COHERENCE_WEIGHT * cohere_score + cross_score, label)
            for cohere_score, cross_score, label in zip(cohere_scores, cross_file_scores, labels)
        ),
        key=lambda entry: entry[0],
        reverse=True,
    )

    print("Results for f_const: ", f_const)

    # Print every ranked entry instead of five hard-coded positions, so the
    # cell keeps working when strategies are added to or removed from the lists.
    for rank, (score, label) in enumerate(final_eval, start=1):
        print(f"{rank}.{label}: ", score)



Results for f_const:  1.1
1.Full Code:  7.2830141697258535
2.Graph:  7.241129182189325
3.Cgraph:  7.234413237902033
4.Summary:  7.216055801234271
5.Random:  5.8773892985417255
Results for f_const:  1.2
1.Cgraph:  7.388836167380329
2.Full Code:  7.307053000698173
3.Graph:  7.302710162788448
4.Summary:  7.2277164328422305
5.Random:  6.018757361681725
Results for f_const:  1.3
1.Cgraph:  7.569770334663936
2.Graph:  7.372867229768563
3.Full Code:  7.335297378655064
4.Summary:  7.240877320949681
5.Random:  6.179880024334345
Results for f_const:  1.4
1.Cgraph:  7.7819138692136605
2.Graph:  7.452980013709861
3.Full Code:  7.368488327986757
4.Summary:  7.255754332833053
5.Random:  6.36354768146329


In [11]:
# Rank the clustering strategies by the product of both metrics:
#   coherence * cross-file score
# and repeat the ranking for several cross-file exponents.
f_const_vals = [1.1, 1.2, 1.3, 1.4]
cohere_scores = [full_code_coherence, graph_coherence, cgraph_coherence, random_coherence, summary_coherence]
clusters = [full_code_clusters, graph_clusters, cgraph_clusters, random_clusters, summary_clusters]
labels = ["Full Code", "Graph", "Cgraph", "Random", "Summary"]

for f_const in f_const_vals:
    cross_file_scores = [
        eval_cross_file_cluster(cluster_set, f_const=f_const) for cluster_set in clusters
    ]
    final_eval = sorted(
        (
            (cohere_score * cross_score, label)
            for cohere_score, cross_score, label in zip(cohere_scores, cross_file_scores, labels)
        ),
        key=lambda entry: entry[0],
        reverse=True,
    )

    print("Results for f_const: ", f_const)

    # Print every ranked entry instead of five hard-coded positions, so the
    # cell keeps working when strategies are added to or removed from the lists.
    for rank, (score, label) in enumerate(final_eval, start=1):
        print(f"{rank}.{label}: ", score)



Results for f_const:  1.1
1.Cgraph:  3.66908428888863
2.Random:  3.1063030944654573
3.Graph:  2.1887425651951635
4.Summary:  1.153433756963121
5.Full Code:  1.0975553433087866
Results for f_const:  1.2
1.Cgraph:  4.277417041378886
2.Random:  3.5351195526567887
3.Graph:  2.4473826837114787
4.Summary:  1.204088543733182
5.Full Code:  1.203226037791274
Results for f_const:  1.3
1.Cgraph:  4.990188003405217
2.Random:  4.023858296036403
3.Graph:  2.742042365027961
4.Full Code:  1.3273836158934431
5.Summary:  1.2612605737913545
Results for f_const:  1.4
1.Cgraph:  5.8259049576920106
2.Random:  4.580983522660869
3.Graph:  3.0785160575814134
4.Full Code:  1.4732854973306773
5.Summary:  1.325887593155681


In [62]:
# Give every chunk id a dense integer index. full_code_ids is expected to be
# the superset of all code chunk ids.
# NOTE(review): full_code_ids is not defined in any cell visible here - confirm
# it is created before this cell when running on a fresh kernel.
id_map = {cid: position for position, cid in enumerate(full_code_ids)}

# match clusters to find ones with the most shared chunks
def compare_clusters(cluster_a: List[ClusteredTopic],
                     cluster_b: List[ClusteredTopic]) -> List[Tuple[ClusteredTopic, ClusteredTopic, int]]:
    """
    Find, for each cluster in ``cluster_a``, the cluster in ``cluster_b`` that
    shares the most chunks with it.

    Chunks are compared through the module-level ``id_map`` (chunk id ->
    integer index), so every chunk id in both inputs must be a key of it.

    Args:
        cluster_a: Clusters to find a match for.
        cluster_b: Candidate clusters. One candidate may be the best match of
            several clusters in ``cluster_a``.

    Returns:
        One ``(a, best_b, shared_chunk_count)`` tuple per cluster in
        ``cluster_a``, in input order. On a tie the earliest candidate in
        ``cluster_b`` wins. Empty when either input is empty.
    """
    # Nothing to match against: no pairs can be formed.
    if not cluster_a or not cluster_b:
        return []

    # Candidate index sets do not depend on ``a``, so build them once.
    b_index_sets = [{id_map[chunk.id] for chunk in b.chunks} for b in cluster_b]

    matched_clusters = []
    for a in cluster_a:
        a_indices = {id_map[chunk.id] for chunk in a.chunks}

        best_match = None
        best_score = -1
        for b, b_indices in zip(cluster_b, b_index_sets):
            score = len(a_indices & b_indices)
            # Strict comparison keeps the first candidate on ties.
            if score > best_score:
                best_score = score
                best_match = b

        # Identity check: a match must not be dropped just because the object
        # happens to evaluate as falsy.
        if best_match is not None:
            matched_clusters.append((a, best_match, best_score))

    return matched_clusters

# Pair every summary cluster with its best full-code match, then list the
# pairs from the most to the fewest shared chunks.
matched_clusters = compare_clusters(summary_clusters, full_code_clusters)
ranked_matches = sorted(matched_clusters, key=lambda match: match[2], reverse=True)
for c1, c2, score in ranked_matches:
    print(f"{c1.name} | {c2.name} | {score}")

Real-Time API and Client Management | Conversation APIs and Real-time Communication | 7
Web Application and Server Setup | Interactive CLI with Visual Representation | 6
Factorial Calculation | Interactive CLI with Visual Representation | 5
Store and Database Management | SQL Store and Query Operations | 5
Store Management and Configuration | Configuration and Initialization | 4
User Input Validation and Formatting | OpenAI and LLM Capabilities | 3
Reinforcement Learning Environment Setup and Evaluation | RL Training Using Gym | 3
Data Collection and Processing in Reinforcement Learning | CBPO Reinforcement Learning Algorithm | 3
Model Registration and Handling | Ell Language Modeling | 3
Language Model and Prompt Handling | Language Model Decorator Utilities | 3
Main Application and Serving | Studio Command-Line Interface | 3
String and Data Manipulation | Basic Classes and Methods | 3
Real-time Audio Handling | Real-time Client and Event Handling | 3
Content and Message Processing | 