In [133]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys
from typing import List, Tuple

sys.path.append("../../../")
from src.cluster.cluster import (
    generate_full_code_clusters, 
    generate_summarized_clusters,
    generate_graph_clusters,
    generate_random_clusters
)

from src.cluster.types import (
    CodeChunk,
    SummaryChunk,
    ClusterInput,
    ClusteredTopic,
    ClusterInputType,
    LMClusteredTopicList
)


# Repository to analyse; swap in the commented alternative to target another checkout.
# repo_name = "ell"
repo_name = "CrashOffsetFinder"
# Checkouts are resolved under the cluster module's repos directory,
# relative to the current working directory.
repo_path = Path("../../src/cluster/repos", repo_name)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [134]:
# Generate one set of clusters per strategy for the same repository so the
# cells below can compare them side by side.
full_code_clusters = generate_full_code_clusters(repo_path)
summary_clusters = generate_summarized_clusters(repo_path)
graph_clusters = generate_graph_clusters(repo_path)
random_clusters = generate_random_clusters(repo_path)
# Backward-compatible alias: the original (misspelled) name is kept so that
# cells still referring to it keep working.
random_clusers = random_clusters

Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\CrashOffsetFinder
[Chunker]: 33 chunks used
Unclassified chunks, iter:[1]:  1
Unclassified chunks, iter:[2]:  0
Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\CrashOffsetFinder
[Chunker]: 33 chunks used
Unclassified chunks, iter:[1]:  3
Unclassified chunks, iter:[2]:  0
Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\CrashOffsetFinder
Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\CrashOffsetFinder
[Chunker]: 33 chunks used


In [137]:
import numpy as np
import matplotlib.pyplot as plt

# Print some cluster stats
def get_cluster_stats(clusters: List[ClusteredTopic], show_histogram: bool = False) -> None:
    """
    Print summary statistics about how chunks are spread across clusters.

    Reports the total number of chunks over all clusters, plus the mean and
    median number of chunks per cluster.

    Args:
        clusters: Clusters to summarise; each must expose a sized ``chunks``
            attribute.
        show_histogram: When True, additionally plot a histogram of the
            per-cluster chunk counts. Defaults to False (text output only).

    Returns:
        None. All results are printed.
    """
    # Measure each cluster once; every statistic below derives from these sizes.
    chunks_per_cluster = [len(cluster.chunks) for cluster in clusters]
    total_chunks = sum(chunks_per_cluster)

    print(f"Total number of chunks: {total_chunks}")

    # np.mean / np.median on an empty sequence emit a RuntimeWarning and yield
    # nan, so report the absence of data explicitly instead.
    if not chunks_per_cluster:
        print("Mean chunks per cluster: n/a")
        print("Median chunks per cluster: n/a")
        return

    mean_chunks = np.mean(chunks_per_cluster)
    median_chunks = np.median(chunks_per_cluster)
    print(f"Mean chunks per cluster: {mean_chunks:.2f}")
    print(f"Median chunks per cluster: {median_chunks:.2f}")

    if show_histogram:
        plt.figure(figsize=(10, 6))
        plt.hist(chunks_per_cluster, bins='auto', edgecolor='black')
        plt.title('Histogram of Chunks per Cluster')
        plt.xlabel('Number of Chunks')
        plt.ylabel('Frequency')
        plt.show()

# Report the same statistics for each clustering strategy, in a fixed order.
for heading, strategy_clusters in (
    ("FUll Code:", full_code_clusters),
    ("Summary:", summary_clusters),
    ("Graph:", graph_clusters),
    ("Random:", random_clusers),
):
    print(heading)
    get_cluster_stats(strategy_clusters)

FUll Code:
Total number of chunks: 33
Mean chunks per cluster: 2.75
Median chunks per cluster: 1.50
Summary:
Total number of chunks: 34
Mean chunks per cluster: 2.83
Median chunks per cluster: 2.50
Graph:
Total number of chunks: 21
Mean chunks per cluster: 3.50
Median chunks per cluster: 3.00
Random:
Total number of chunks: 16
Mean chunks per cluster: 4.00
Median chunks per cluster: 4.00


In [138]:
def _chunk_ids(clusters):
    """Return the ids of all chunks in ``clusters``, cluster by cluster, in order."""
    ids = []
    for cluster in clusters:
        for chunk in cluster.chunks:
            ids.append(chunk.id)
    return ids


full_code_ids = _chunk_ids(full_code_clusters)
summary_code_ids = _chunk_ids(summary_clusters)
graph_code_ids = _chunk_ids(graph_clusters)

In [115]:
# Number of clusters produced by each strategy, one line per strategy.
print(
    f"Full code clusters: {len(full_code_clusters)}",
    f"Summary clusters: {len(summary_clusters)}",
    f"Graph clusters: {len(graph_clusters)}",
    sep="\n",
)

Full code clusters: 61
Summary clusters: 38
Graph clusters: 10


In [None]:
################### EVALS ###################

In [141]:
from src.llm.evals.eval_cluster import eval_cross_file_cluster

def _report_crossfile(label, clusters):
    """Score one set of clusters, print the labelled score, and return it."""
    crossfile_score = eval_cross_file_cluster(clusters)
    print(label, crossfile_score)
    return crossfile_score


# Each strategy is scored and reported before the next one is evaluated.
full_code_crossfile = _report_crossfile("Full code crossfile score: ", full_code_clusters)
summary_crossfile = _report_crossfile("Summary crossfile score: ", summary_clusters)
graph_crossfile = _report_crossfile("Graph crossfile score: ", graph_clusters)
random_crossfile = _report_crossfile("Random crossfile score: ", random_clusers)

Full code crossfile score:  0.14814814814814814
Summary crossfile score:  0.24444444444444444
Graph crossfile score:  0.7305555555555555
Random crossfile score:  1.75


In [157]:
from src.llm.evals.eval_cluster import eval_coherence_cluster

# Only the full-code clusters are scored for coherence; the graph variant is
# kept below, disabled.
coherence_output_file = "full_code_cohere.txt"
full_code_cohere = eval_coherence_cluster(
    full_code_clusters,
    output_file=coherence_output_file,
)
# graph_cohere = eval_coherence_cluster(graph_clusters)

print("Full code coherence score: ", full_code_cohere)
# print("Graph coherence score: ", graph_cohere)

Full code coherence score:  4.333333333333333


In [62]:
# Give every chunk id its position within full_code_ids, which is taken to be
# the superset of all code chunks.
id_map = dict(zip(full_code_ids, range(len(full_code_ids))))

# match clusters to find ones with the most shared chunks
def compare_clusters(cluster_a: List[ClusteredTopic],
                     cluster_b: List[ClusteredTopic]) -> List[Tuple[ClusteredTopic, ClusteredTopic, int]]:
    """
    Pair every cluster in ``cluster_a`` with the cluster in ``cluster_b`` that
    shares the most chunks with it.

    Overlap is the number of distinct chunk ids present in both clusters.
    Chunk ids are compared directly, so the function needs no external id
    lookup table and cannot fail on an id that such a table does not contain.

    Args:
        cluster_a: Clusters to find a match for.
        cluster_b: Candidate clusters to match against.

    Returns:
        One ``(a, best_b, shared_count)`` tuple per cluster in ``cluster_a``,
        in input order. Ties go to the earliest candidate in ``cluster_b``.
        A cluster with no overlap at all is still paired with the first
        candidate, with a count of 0. The result is empty when either input
        is empty.
    """
    # Build each candidate's id set once instead of once per (a, b) pair.
    candidates = [(b, {chunk.id for chunk in b.chunks}) for b in cluster_b]

    matched_clusters = []
    for a in cluster_a:
        a_chunk_ids = {chunk.id for chunk in a.chunks}

        best_match = None
        best_score = -1  # below any real overlap, so the first candidate always registers
        for b, b_chunk_ids in candidates:
            score = len(a_chunk_ids & b_chunk_ids)
            if score > best_score:
                best_score = score
                best_match = b

        # Identity check: a cluster object's truthiness is not a reliable
        # "a match was found" signal.
        if best_match is not None:
            matched_clusters.append((a, best_match, best_score))

    return matched_clusters

# Match each summary cluster to its closest full-code cluster, then list the
# pairs from most to fewest shared chunks.
matched_clusters = compare_clusters(summary_clusters, full_code_clusters)
by_overlap_desc = sorted(
    matched_clusters,
    key=lambda match: match[2],
    reverse=True,
)
for c1, c2, score in by_overlap_desc:
    print(f"{c1.name} | {c2.name} | {score}")

Real-Time API and Client Management | Conversation APIs and Real-time Communication | 7
Web Application and Server Setup | Interactive CLI with Visual Representation | 6
Factorial Calculation | Interactive CLI with Visual Representation | 5
Store and Database Management | SQL Store and Query Operations | 5
Store Management and Configuration | Configuration and Initialization | 4
User Input Validation and Formatting | OpenAI and LLM Capabilities | 3
Reinforcement Learning Environment Setup and Evaluation | RL Training Using Gym | 3
Data Collection and Processing in Reinforcement Learning | CBPO Reinforcement Learning Algorithm | 3
Model Registration and Handling | Ell Language Modeling | 3
Language Model and Prompt Handling | Language Model Decorator Utilities | 3
Main Application and Serving | Studio Command-Line Interface | 3
String and Data Manipulation | Basic Classes and Methods | 3
Real-time Audio Handling | Real-time Client and Event Handling | 3
Content and Message Processing | 

In [91]:
def print_cluster_by_name(clusters: List[ClusteredTopic], name: str) -> None:
    """
    Print the contents of every cluster whose name equals ``name``.

    For each matching cluster the name is printed, followed by each chunk's
    ``get_content()`` under a numbered separator line. For ``SummaryChunk``
    instances the result of ``get_filecontent()`` is printed as well. When no
    cluster matches, a short notice is printed instead of nothing.

    Args:
        clusters: Clusters to search.
        name: Exact cluster name to look for.

    Returns:
        None. The function only prints; the return annotation now reflects
        that nothing is returned.
    """
    found = False
    for cluster in clusters:
        if cluster.name != name:
            continue

        found = True
        print(cluster.name)
        for i, chunk in enumerate(cluster.chunks):
            print(f"Chunk {i} ----------------------------------")
            print(chunk.get_content())
            if isinstance(chunk, SummaryChunk):
                print(chunk.get_filecontent())

    if not found:
        # Make a failed lookup visible instead of silently printing nothing.
        print(f"No cluster named {name!r} found")


In [92]:
# Dump every chunk of the summary cluster with this exact name.
# NOTE(review): this name looks like it comes from an earlier run on a different
# repository — confirm it still exists for the current repo_name.
print_cluster_by_name(summary_clusters, "API Interactions and Middleware")

API Interactions and Middleware
Chunk 0 ----------------------------------
This code cluster defines functions related to API calls and parameter handling for a provider, including translation methods. It facilitates interaction with external APIs by managing parameters and logging.

Provider:provider_call_function(api_call_params: Optional[Dict[str, Any]] = None)
Provider:available_api_params(api_params: Optional[Dict[str, Any]] = None)
Provider:translate_from_provider()
Provider:translate_from_provider(origin_id: Optional[str] = None, logger: Optional[Callable[..., None]] = None)

Chunk 1 ----------------------------------
This code defines a function to serialize an image for use with the Anthropic API. It takes an ImageContent object as input to prepare the image data.
serialize_image_for_anthropic(img : ImageContent)

Chunk 2 ----------------------------------
This code defines a function that converts a content block into a format suitable for the Anthropic API. It is primarily f

In [88]:
# Dump every chunk of the full-code cluster with this exact name.
# NOTE(review): this name looks like it comes from an earlier run on a different
# repository — confirm it still exists for the current repo_name.
print_cluster_by_name(full_code_clusters, "Anthropic Interaction Implementation")

Anthropic Interaction Implementation
Chunk 0 ----------------------------------
from ell.configurator import config
import logging

logger = logging.getLogger(__name__)


try:
    import anthropic

    def register(client: anthropic.Anthropic):
        """
        Register Anthropic models with the provided client.

        This function takes an Anthropic client and registers various Anthropic models
        with the global configuration. It allows the system to use these models
        for different AI tasks.

        Args:
            client (anthropic.Anthropic): An instance of the Anthropic client to be used
                                          for model registration.

        Note:
            The function doesn't return anything but updates the global
            configuration with the registered models.
        """
        model_data = [
            ('claude-3-opus-20240229', 'anthropic'),
            ('claude-3-sonnet-20240229', 'anthropic'),
            ('claude-3-haiku-