In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../../../")

from src.chunk.chunk import ChunkStrat
from pathlib import Path

from src.cluster.cluster import ClusterStrategy
from src.cluster.lmp.cluster_v4 import generate_clusters
from src.chunk.chunk import chunk_repo, ChunkStrat

import ell
ell.init(store="logdir")

repo_name = "ell"
repo_path = Path("src/cluster/repos") / repo_name

chunks = chunk_repo(repo_path, ChunkStrat.VANILLA)
# chunks_summarized = chunk_repo(repo_path, ChunkStrat.SUMMARY)
chunks_random = chunk_repo(repo_path, ChunkStrat.RANDOM)

chunk_strat = ClusterStrategy(chunks, 
                              cluster_op=generate_clusters)
# summarized_strat = ClusterStrategy(chunks_summarized,
                                #    cluster_op=generate_clusters)
random_strat = ClusterStrategy(chunks_random,
                                 cluster_op=generate_clusters)



Engine created:  Engine(sqlite:///logdir\ell.db)
Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\ell
[Chunker]: 212 chunks used
Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\ell
[Chunker]: 212 chunks used


In [16]:
from src.llm.evals.closeness import measure_closeness_bytes, measure_closeness_chunks, measure_bytes_agg

def calc_closeness(clusters, chunks):
    for c in clusters:
        print(f"Score: {measure_closeness_chunks(c, chunks)}")
        print(c)

    total_score = sum([measure_closeness_chunks(cluster, chunks) for cluster in clusters]) / len(clusters)
    print ("Total Score: ", total_score)

In [5]:
# Observe the performance of clustering on reduced input sizes
chunk_75 = chunks[:int(len(chunks) * 0.75)]
chunk_50 = chunks[:int(len(chunks) * 0.50)]
chunk_25 = chunks[:int(len(chunks) * 0.25)]

cluster_outputs = []
chunk_inputs = [("25%", chunk_25), ("50%", chunk_50), ("75%", chunk_75)]
for c_name, chunks_i in chunk_inputs:
    print(f"[{c_name}]: size: ", sum([len(chunk.get_content()) for chunk in chunks_i]))

    chunk_strat = ClusterStrategy(chunks_i, 
                              cluster_op=generate_clusters)
    clusters = chunk_strat.run(iters=2)
    cluster_outputs.append(clusters)

    calc_closeness(clusters, chunks_i)

[25%]: size:  74087
[ELL] use_cache for generate_clusters_raw: False
[ELL] use_cache for identify_key_chunks: False
[ELL] use_cache for format_clusters: False
[ELL] use_cache for generate_clusters_raw: False
[ELL] use_cache for identify_key_chunks: False
[ELL] use_cache for format_clusters: False
Chunk Name: 0.1.0\cem.py::7 hallucinated, skipping...
Chunk Name: 0.1.0\cem.py::8 hallucinated, skipping...
Chunk Name: 0.1.0\cppo.py::1 hallucinated, skipping...
Chunk Name: 0.1.0\cppo.py::2 hallucinated, skipping...
Chunk Name: 0.1.0\cppo.py::3 hallucinated, skipping...
Chunk Name: 0.1.0\cppo.py::4 hallucinated, skipping...
Chunk Name: 0.1.0\cppo.py::5 hallucinated, skipping...
Chunk Name: 0.1.0\cppo.py::6 hallucinated, skipping...
Chunk Name: build.py::1 hallucinated, skipping...
Chunk Name: build.py::3 hallucinated, skipping...
Chunk Name: build.py::4 hallucinated, skipping...
Chunk Name: build.py::5 hallucinated, skipping...
Chunk Name: build.py::6 hallucinated, skipping...
Chunk Name: bu

In [31]:
import matplotlib.pyplot as plt
import numpy as np

import plotly.graph_objects as go
import numpy as np

def plot_closeness_histogram(closeness_aggregate, title, bins=500, x_range=None):
    """
    Create and display an interactive histogram of closeness aggregate values.
    
    Parameters:
    -----------
    closeness_aggregate : tuple
        A tuple of (labels, data) where labels contains text for each data point
    title : str
        The title for the plot
    bins : int, optional
        Number of bins in the histogram (default: 500)
    x_range : tuple, optional
        The (min, max) range for x-axis. If None, will be auto-calculated.
    """
    labels, data = closeness_aggregate
    
    # Calculate histogram bins
    if x_range:
        bin_edges = np.linspace(x_range[0], x_range[1], bins + 1)
    else:
        bin_edges = np.linspace(min(data), max(data), bins + 1)
    
    # Calculate which bin each data point belongs to
    bin_indices = np.digitize(data, bin_edges) - 1
    
    # Create lists to store hover text for each bin
    bin_labels = [[] for _ in range(bins)]
    for val, label, bin_idx in zip(data, labels, bin_indices):
        if 0 <= bin_idx < bins:  # Ensure index is within bounds
            bin_labels[bin_idx].append(f"{label}<br>Value: {val:.4f}")
    
    # Create hover text
    hover_text = []
    for i in range(bins):
        if bin_labels[i]:
            hover_text.append("<br>".join(bin_labels[i]))
        else:
            hover_text.append("No data points")

    # Calculate histogram values
    hist_values, _ = np.histogram(data, bins=bin_edges)
    
    # Create the figure
    fig = go.Figure()
    
    # Add histogram trace
    fig.add_trace(go.Bar(
        x=bin_edges[:-1],
        y=hist_values,
        width=(bin_edges[1] - bin_edges[0]),
        hovertext=hover_text,
        hoverinfo='text+y',
        name='Histogram'
    ))
    
    # Add mean and median lines
    mean_val = np.mean(data)
    median_val = np.median(data)
    
    fig.add_trace(go.Scatter(
        x=[mean_val, mean_val],
        y=[0, max(hist_values)],
        mode='lines',
        name=f'Mean: {mean_val:.2f}',
        line=dict(color='red', dash='dash')
    ))
    
    fig.add_trace(go.Scatter(
        x=[median_val, median_val],
        y=[0, max(hist_values)],
        mode='lines',
        name=f'Median: {median_val:.2f}',
        line=dict(color='green', dash='dash')
    ))
    
    # Update layout
    fig.update_layout(
        title=f'Distribution of Closeness Aggregate Values - {title}',
        xaxis_title='Closeness Value',
        yaxis_title='Frequency',
        showlegend=True,
        hovermode='closest',
        width=1000,
        height=600
    )
    
    # Show grid
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='rgba(128, 128, 128, 0.2)')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='rgba(128, 128, 128, 0.2)')
    
    # Display the plot
    fig.show()


In [34]:
## Histograms for plotting closeness of values in a linear range
for clusters, chunk_input in zip(cluster_outputs, chunk_inputs):
    name, chunks_i = chunk_input
    cl_score = []
    for cluster in clusters:
        cl_score.extend(measure_bytes_agg(cluster, chunks_i))

    chunk_sz = sum([len(chunk.get_content()) for chunk in chunks_i])
    labels = ["hello"] * len(cl_score)
    plot_closeness_histogram((labels, cl_score), f"{name}::{chunk_sz / 1000}Kb", x_range=(0, 140000))


In [37]:
# WARNING:: VARIABLE SPACE POLLUTED PAST THIS POINT
repo_name = "dspy"
repo_path = Path(r"C:\Users\jpeng\Documents\projects\codesearch-backend\src\cluster\repos") / repo_name

dspy_chunks = chunk_repo(repo_path, ChunkStrat.VANILLA)
dspy_strat = ClusterStrategy(chunks, 
                              cluster_op=generate_clusters)

Saving chunks to file:  C:\Users\jpeng\AppData\Local\Temp\index\dspy
[Chunker]: 961 chunks used


In [46]:
cl_score = []
chunk_25 = dspy_chunks[:int(len(dspy_chunks) * 0.25)]
chunk_strat = ClusterStrategy(chunks_i, 
                            cluster_op=generate_clusters)
clusters = chunk_strat.run(iters=2)
print("Agg_cl: ", calc_closeness(clusters, chunks_i))

for cluster in clusters:
    cl_score.extend(measure_bytes_agg(cluster, chunks_i))

chunk_sz = sum([len(chunk.get_content()) for chunk in chunks_i])
labels = ["hello"] * len(cl_score)
name = f"Dspy::{chunk_sz / 1000}Kb"

[ELL] use_cache for generate_clusters_raw: False
[ELL] use_cache for identify_key_chunks: False


KeyboardInterrupt: 

In [None]:
plot_closeness_histogram((labels, cl_score), f"{name}::{chunk_sz / 1000}Kb", x_range=(0, 140000))


In [18]:
chunk_25 = chunks[:int(len(chunks) * 0.25)]
cluster_25 = cluster_outputs[0]
for c in cluster_25:
    print(measure_closeness_chunks(c, chunk_25))
    print(c)

Curr chunk:  ell\configurator.py::8
Prev chunk:  build.py::2
Curr chunk:  ell\configurator.py::9
Prev chunk:  ell\configurator.py::8
Score!
Curr chunk:  lmp\_track.py::1
Prev chunk:  ell\configurator.py::9
Score!
Curr chunk:  lmp\_track.py::2
Prev chunk:  lmp\_track.py::1
Score!
Curr chunk:  lmp\_track.py::4
Prev chunk:  lmp\_track.py::2
Score!
Curr chunk:  models\__init__.py::1
Prev chunk:  lmp\_track.py::4
Curr chunk:  models\anthropic.py::1
Prev chunk:  models\__init__.py::1
Score!
Curr chunk:  models\bedrock.py::1
Prev chunk:  models\anthropic.py::1
Score!
Curr chunk:  models\groq.py::1
Prev chunk:  models\bedrock.py::1
Score!
Curr chunk:  models\ollama.py::1
Prev chunk:  models\groq.py::1
Score!
0.8
Build and Testing Automation:
build.py::2
ell\configurator.py::8
ell\configurator.py::9
lmp\_track.py::1
lmp\_track.py::2
lmp\_track.py::4
models\__init__.py::1
models\anthropic.py::1
models\bedrock.py::1
models\groq.py::1
models\ollama.py::1


Curr chunk:  0.1.0\cem.py::2
Prev chunk: 

In [19]:
for c in chunk_25:
    print(c.id)

build.py::2
0.1.0\autostreamprevention.py::1
0.1.0\cem.py::1
0.1.0\cem.py::2
0.1.0\cem.py::3
0.1.0\cem.py::4
0.1.0\cem.py::5
0.1.0\cem.py::6
0.1.0\context_versioning.py::1
0.1.0\cpbo.py::1
0.1.0\cpbo.py::2
0.1.0\cpbo.py::3
0.1.0\cpbo.py::4
0.1.0\cpbo.py::5
0.1.0\cpbo.py::6
0.1.0\cpbo.py::7
0.1.0\metapromptingtorch.py::1
0.1.0\mypytest.py::1
0.1.0\test.py::1
src\conf.py::1
src\conf.py::2
ell\__init__.py::1
ell\__version__.py::1
ell\configurator.py::1
ell\configurator.py::2
ell\configurator.py::3
ell\configurator.py::4
ell\configurator.py::5
ell\configurator.py::6
ell\configurator.py::7
ell\configurator.py::8
ell\configurator.py::9
lmp\__init__.py::1
lmp\_track.py::1
lmp\_track.py::2
lmp\_track.py::3
lmp\_track.py::4
lmp\complex.py::1
lmp\complex.py::2
lmp\complex.py::3
lmp\complex.py::4
lmp\complex.py::5
lmp\simple.py::1
lmp\simple.py::2
lmp\tool.py::1
lmp\tool.py::2
lmp\tool.py::3
lmp\tool.py::4
models\__init__.py::1
models\anthropic.py::1
models\bedrock.py::1
models\groq.py::1
models\

In [20]:
test_cluster = cluster_25[-2]
measure_closeness_chunks(test_cluster, chunk_25)

Curr chunk:  models\anthropic.py::1
Prev chunk:  models\__init__.py::1
Score!
Curr chunk:  models\bedrock.py::1
Prev chunk:  models\anthropic.py::1
Score!
Curr chunk:  models\groq.py::1
Prev chunk:  models\bedrock.py::1
Score!
Curr chunk:  models\ollama.py::1
Prev chunk:  models\groq.py::1
Score!


0.2857142857142857

In [21]:
# testing random chunks 
chunks_rand_50 = chunks_random[:int(len(chunks) * 0.50)]
chunk_strat = ClusterStrategy(chunks_rand_50, 
                            cluster_op=generate_clusters)
r_clusters = chunk_strat.run(iters=2)

bytes_score = sum([measure_closeness_bytes(cluster, chunks_i) for cluster in r_clusters]) / len(r_clusters)
chunks_score = sum([measure_closeness_chunks(cluster, chunks_i) for cluster in r_clusters]) / len(r_clusters)
print("Bytes score: ", bytes_score)
print("Chunks score: ", chunks_score)


[ELL] use_cache for generate_clusters_raw: False
[ELL] use_cache for identify_key_chunks: False
[ELL] use_cache for format_clusters: False
[ELL] use_cache for generate_clusters_raw: False
[ELL] use_cache for identify_key_chunks: False
[ELL] use_cache for format_clusters: False
Chunk Name: openai_realtime\api.py::1 hallucinated, skipping...
Chunk Name: providers\anthropic.py::2 hallucinated, skipping...
Chunk Name: types\_lstr.py::1 hallucinated, skipping...
Chunk Name: models\openai.py::1 hallucinated, skipping...
Chunk Name: models\bedrock.py::1 hallucinated, skipping...
Chunk Name: util\closure_py::9 hallucinated, skipping...
Chunk Name: store\__init__.py::1 hallucinated, skipping...
Chunk Name: stores\sql.py::1 hallucinated, skipping...
Chunk Name: stores\sql.py::3 hallucinated, skipping...
Chunk Name: util\closure.py::11 hallucinated, skipping...
Chunks clustered this round: 109/106
New inputs: 34
[ELL] use_cache for generate_clusters_raw: False
[ELL] use_cache for identify_key_chu

ValueError: CodeChunk(id='openai_realtime\\client.py::2', input_type=<ClusterInputType.FILE: 'file'>, content="class RealtimeClient(RealtimeEventHandler):\n\n    def _add_api_event_handlers(self):\n        self.realtime.on('client.*', lambda event: self.dispatch('realtime.event', {\n            'time': RealtimeUtils.generate_id('time_'),\n            'source': 'client',\n            'event': event\n        }))\n        self.realtime.on('server.*', lambda event: self.dispatch('realtime.event', {\n            'time': RealtimeUtils.generate_id('time_'),\n            'source': 'server',\n            'event': event\n        }))\n        self.realtime.on('server.session.created', lambda _: setattr(self, 'session_created', True))\n\n        def handle_conversation_event(event, *args):\n            result = self.conversation.process_event(event, *args)\n            if result['item']:\n                self.dispatch('conversation.updated', result)\n            return result\n\n        self.realtime.on('server.response.created', handle_conversation_event)\n        self.realtime.on('server.response.output_item.added', handle_conversation_event)\n        self.realtime.on('server.response.content_part.added', handle_conversation_event)\n        self.realtime.on('server.input_audio_buffer.speech_started', lambda event: (\n            handle_conversation_event(event),\n            self.dispatch('conversation.interrupted', event)\n        ))\n        self.realtime.on('server.input_audio_buffer.speech_stopped', lambda event: \n            handle_conversation_event(event, self.input_audio_buffer)\n        )\n        self.realtime.on('server.conversation.item.created', lambda event: (\n            handle_conversation_event(event),\n            self.dispatch('conversation.item.appended', {'item': event['item']})\n        ))\n        self.realtime.on('server.conversation.item.truncated', handle_conversation_event)\n        self.realtime.on('server.conversation.item.deleted', handle_conversation_event)\n        self.realtime.on('server.conversation.item.input_audio_transcription.completed', handle_conversation_event)\n        self.realtime.on('server.response.audio_transcript.delta', handle_conversation_event)\n        self.realtime.on('server.response.audio.delta', handle_conversation_event)\n        self.realtime.on('server.response.text.delta', handle_conversation_event)\n        self.realtime.on('server.response.function_call_arguments.delta', handle_conversation_event)\n        def handle_output_item_done( event):\n            handle_conversation_event(event)\n            item = event.get('item', {})\n\n            if item.get('status') == 'completed':\n                self.dispatch('conversation.item.completed', {'item': item})\n\n            formatted = item.get('formatted', {})\n            tool = formatted.get('tool') if isinstance(formatted, dict) else None\n\n            if tool:\n                asyncio.create_task(self._call_tool(tool))\n        self.realtime.on('server.response.output_item.done', handle_output_item_done)", summary='', filepath='x\\openai_realtime\\src\\openai_realtime\\client.py', metadata=None, node_id='openai_realtime\\client.py::2') is not in list

In [26]:
bytes_score = sum([measure_closeness_bytes(cluster, chunks) for cluster in r_clusters]) / len(r_clusters)
chunks_score = sum([measure_closeness_chunks(cluster, chunks) for cluster in r_clusters]) / len(r_clusters)
print("Bytes score: ", bytes_score)
print("Chunks score: ", chunks_score)


Bytes score:  0.0783816425120773
Chunks score:  0.4035369220151829


In [27]:
for c in r_clusters:
    print(measure_closeness_chunks(c, chunks))
    print(c)

0.7777777777777778
Model Configuration and Registration:
ell\configurator.py::1
ell\configurator.py::3
ell\configurator.py::4
ell\configurator.py::5
ell\configurator.py::6
ell\configurator.py::8
ell\configurator.py::9
models\groq.py::1
models\ollama.py::1
models\anthropic.py::1


0.75
Real-Time Client and Event Handling:
openai_realtime\client.py::1
openai_realtime\client.py::2
openai_realtime\client.py::3
openai_realtime\client.py::4
openai_realtime\client.py::6
openai_realtime\client.py::7
openai_realtime\api.py::2
openai_realtime\api.py::3
openai_realtime\conversation.py::1


1.0
Message Handling and Processing:
types\message.py::5
types\message.py::6
types\message.py::9
types\message.py::10
types\message.py::11
types\message.py::12
types\message.py::13
types\message.py::14
types\message.py::15


0.4444444444444444
Storage and State Management:
stores\sql.py::4
stores\sql.py::5
stores\sql.py::7
stores\sql.py::9
stores\sql.py::10
ell\store.py::1
ell\store.py::3
ell\configurator.py::1

In [13]:
# test using anon chunk names using incremented IDs
# Chunk1 -> Chunk2 -> Chunk3 -> ...

chunk_50 = chunks[:int(len(chunks) * 0.50)]
anon_strat = ClusterStrategy(chunk_50, 
                              cluster_op=generate_clusters)
anon_clusters = anon_strat.run(iters=2, use_anon_chunks=True)

[ELL] use_cache for generate_clusters_raw: False
[ELL] use_cache for identify_key_chunks: False
[ELL] use_cache for format_clusters: False
[ELL] use_cache for generate_clusters_raw: False
[ELL] use_cache for identify_key_chunks: False
[ELL] use_cache for format_clusters: False
Converting anon name:  Chunk::0
Converting anon name:  Chunk::1
Converting anon name:  Chunk::2
Converting anon name:  Chunk::3
Converting anon name:  Chunk::4
Converting anon name:  Chunk::5
Converting anon name:  Chunk::6
Converting anon name:  Chunk::7
Converting anon name:  Chunk::8
Converting anon name:  Chunk::9
Converting anon name:  Chunk::10
Converting anon name:  Chunk::11
Converting anon name:  Chunk::12
Converting anon name:  Chunk::13
Converting anon name:  Chunk::14
Converting anon name:  Chunk::15
Converting anon name:  Chunk::16
Converting anon name:  Chunk::17
Converting anon name:  Chunk::18
Converting anon name:  Chunk::19
Converting anon name:  Chunk::20
Converting anon name:  Chunk::21
Conver

In [17]:
for cluster in anon_clusters:
    print(measure_closeness_chunks(cluster, chunk_50))
    print(cluster)

# Conclusion:
# Somewhat interesting that this has a worst closeness score than just using chunk ids in the form of 
# file_name::i, where i is the ith chunk
# Guess we can conclude that the chunk header file information is very important for clustering, and actually
# used to promote constructing clusters with at farther distances

1.0
Build and Deployment Process:
build.py::2
0.1.0\autostreamprevention.py::1
0.1.0\cem.py::1
0.1.0\cem.py::2
0.1.0\cem.py::3
0.1.0\cem.py::4
0.1.0\cem.py::5
0.1.0\cem.py::6
0.1.0\context_versioning.py::1
0.1.0\cpbo.py::1
0.1.0\cpbo.py::2


1.0
CBPO Algorithm and Training:
0.1.0\cpbo.py::3
0.1.0\cpbo.py::4
0.1.0\cpbo.py::5
0.1.0\cpbo.py::6
0.1.0\cpbo.py::7
0.1.0\metapromptingtorch.py::1
0.1.0\mypytest.py::1
0.1.0\test.py::1
src\conf.py::1
src\conf.py::2
ell\__init__.py::1


1.0
Providers and Model Registrations:
ell\__version__.py::1
ell\configurator.py::1
ell\configurator.py::2
ell\configurator.py::3
ell\configurator.py::4
ell\configurator.py::5
ell\configurator.py::6
ell\configurator.py::7
ell\configurator.py::8
ell\configurator.py::9
lmp\__init__.py::1
lmp\_track.py::1
lmp\_track.py::2
lmp\_track.py::3
lmp\_track.py::4


1.0
OpenAI and Ell Providers:
lmp\complex.py::1
lmp\complex.py::2
lmp\complex.py::3
lmp\complex.py::4
lmp\complex.py::5
lmp\simple.py::1
lmp\simple.py::2
lmp\tool.

In [None]:
# Notes: 2024-10-30
# TODO: plot histograms for anon chunks
# TODO: figure out the window size for measure_closeness_bytes such that it outputs the same score as
# chunks so that we have an idea of the relative "area of influence" that the clustering algorithm pays 
# attention to
# TODO: run test on DSPy to double check

# We can do two (3?) things to promote closeness:
# 1. Change the relative positions of the chunks, to force attention to consider different
# chunks side by side
# 2. Reduce the chunk size by using a compressed representation .ie summarization
# 3. Make better use of the header information. From our tests with anon and summarized chunks, we can
# conclude that the header contains quite a lot of information that the LM uses to cluster

# First way:
# Reorder chunks 

# Random thoughts:
# Something about the linux codebase being 80% drivers code and 20% adapter code can probably be used 
# as a first pass filter for most codebases