In [5]:
import os
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
os.environ["OPEN_AI_KEY"] = api_key

In [6]:
### FULLY GENERATED ANALYZER SCRIPT

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

import random


# Step 1: Load the code files from the repository
loader = DirectoryLoader(
    path=r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\mobile_insight',
    glob='**/*.py',             # Only .py files
    exclude=['**/__pycache__/**', '**/*.pyc']  # Exclude cache and .pyc files
)
documents = loader.load()

# Step 2: Split the code files into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=['\n\n', '\n', ' ', '']
)
texts = text_splitter.split_documents(documents)

# Step 3: Create embeddings for the chunks
embeddings = OpenAIEmbeddings()

# Step 4: Store the embeddings in a vectorstore
vectorstore = Chroma.from_documents(texts, embeddings)

def get_random_context(k=5):
    # Randomly select k documents from the list of texts
    selected_docs = random.sample(texts, min(k, len(texts)))
    context = '\n\n'.join([doc.page_content for doc in selected_docs])
    return context

# Step 6: Read the two example files
with open(r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\testcases\attach_stats\attach_stats.py', 'r') as f:
    example1 = f.read()

with open(r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\testcases\interval_stats\interval_stats.py', 'r') as f:
    example2 = f.read()

# Step 7: Create the prompt template
prompt = PromptTemplate(
    input_variables=['example1', 'example2', 'context'],
    template="""
You are an AI assistant that generates code for analyzers using the given Python library. Here are two examples of analyzer code:

Example 1:
{example1}

Example 2:
{example2}

Using the above examples as a guide, please generate a new analyzer code that is different from the examples provided. 
The new analyzer should demonstrate a unique functionality or feature of the library.
NOTE: ONLY PROVIDE PYTHON CODE, DO NOT ADD ANY OTHER TEXT BEFORE OR AFTER AS THIS OUTPUT IS BEING SAVED DIRECTLY INTO A PY FILE.

You have access to the following relevant code snippets from the library:
{context}

Please make sure to properly use the library's functions and classes as per the context provided.

Generated Code:
"""
)

# Step 8: Initialize the LLM
llm = ChatOpenAI(model_name='gpt-4o', temperature=0.7)


# Step 9: Define the function to generate code examples
def generate_code_examples(num_examples):
    generated_examples = []
    chain = LLMChain(llm=llm, prompt=prompt)
    
    for _ in range(num_examples):
        # Retrieve random context documents
        context = get_random_context(k=5)
        
        # Prepare the inputs to the prompt
        inputs = {
            'example1': example1,
            'example2': example2,
            'context': context
        }
        
        # Run the chain to generate code
        result = chain.invoke(inputs)
        
        # Append the generated code to the list
        generated_code = result['text'].strip()
        
        # Append the generated code to the list
        generated_examples.append(generated_code)
        
    return generated_examples

In [16]:
# Step 10: Generate multiple code examples
num_examples_to_generate = 5  # Adjust the number as needed
generated_codes = generate_code_examples(num_examples_to_generate)

# Step 11: Save the generated codes to files or print them
for idx, code in enumerate(generated_codes):
    filename = f'generated_analyzer_{23+idx+1}.py'
    with open(filename, 'w') as f:
        f.write(code)
    print(f"Generated Code {3+idx+1} saved to {filename}")

Generated Code 4 saved to generated_analyzer_24.py
Generated Code 5 saved to generated_analyzer_25.py
Generated Code 6 saved to generated_analyzer_26.py
Generated Code 7 saved to generated_analyzer_27.py
Generated Code 8 saved to generated_analyzer_28.py


In [11]:
### GENERATED PROMPTS ANALYZER PAIRS SCRIPT

import json

# Step 1: Load real prompt-code pairs from folders
def load_prompt_code_pairs(directory):
    prompt_code_pairs = []
    
    # Each subdirectory contains a prompt (txt) and code (py) file
    for folder_name in os.listdir(directory):
        if folder_name == 'logs':
            continue
        folder_path = os.path.join(directory, folder_name)
        
        if os.path.isdir(folder_path):
            # Find the txt and py files
            prompt_file = next((f for f in os.listdir(folder_path) if f.endswith('.txt')), None)
            code_file = next((f for f in os.listdir(folder_path) if f.endswith('.py')), None)
            
            if prompt_file and code_file:
                # Load the prompt
                with open(os.path.join(folder_path, prompt_file), 'r') as f:
                    prompt = f.read().strip()
                
                # Load the code
                with open(os.path.join(folder_path, code_file), 'r') as f:
                    code = f.read().strip()
                
                # Append to the list as a dict
                prompt_code_pairs.append({
                    'prompt': prompt,
                    'code': code
                })
    
    return prompt_code_pairs

# Load real prompt-code pairs
real_examples_directory = r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\testcases'
real_prompt_code_pairs = load_prompt_code_pairs(real_examples_directory)

# Step 2: Create the prompt template for generating new prompts based on Python code
prompt_template = PromptTemplate(
    input_variables=["example1_prompt", "example1_code", "example2_prompt", "example2_code", "generated_code"],
    template="""
Below are two examples of prompt-code pairs:

Example 1:
Prompt: {example1_prompt}
Code:
{example1_code}

Example 2:
Prompt: {example2_prompt}
Code:
{example2_code}

Given the code below, generate a prompt for it following the structure of the 2 examples:

Code:
{generated_code}

Generated Prompt:
"""
)

# Step 3: Initialize the LLM
llm = ChatOpenAI(model="gpt-4", temperature=0.7)

# Step 4: Function to generate prompts based on Python code
def generate_prompts_for_code(generated_code_folder, real_examples):
    generated_prompts = {}
    chain = LLMChain(llm=llm, prompt=prompt_template)

    # Iterate over all .py files in the generated code folder
    for py_file in os.listdir(generated_code_folder):
        if py_file.endswith('.py'):
            file_path = os.path.join(generated_code_folder, py_file)

            # Load the generated Python code
            with open(file_path, 'r') as f:
                generated_code = f.read()

            # Use the first two real examples as in-context examples
            example1 = real_examples[0]
            example2 = real_examples[1]

            # Prepare the inputs for the LLM
            inputs = {
                "example1_prompt": example1['prompt'],
                "example1_code": example1['code'],
                "example2_prompt": example2['prompt'],
                "example2_code": example2['code'],
                "generated_code": generated_code
            }

            # Generate the prompt for the current Python file
            result = chain.invoke(inputs)
            generated_prompt = result['text'].strip()

            # Store the generated prompt and the corresponding Python code in the dictionary
            generated_prompts[generated_prompt] = generated_code

    return generated_prompts

# Step 5: Define the folder where all generated Python files are saved
generated_code_folder = r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\generated_dataset'

# Step 6: Generate the prompts for the Python files
generated_dataset = generate_prompts_for_code(generated_code_folder, real_prompt_code_pairs)

# Step 7: Save the generated dataset (optional, saving as JSON)
with open('synthetic_dataset.json', 'w') as f:
    json.dump(generated_dataset, f, indent=4)

print("Synthetic dataset created and saved.")


Synthetic dataset created and saved.


In [7]:
### MODIFIED GENERATION ANALYZER SCRIPT

import os
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Step 1: Load and chunk the codebase, then create embeddings
def index_codebase(directory):
    # Load all .py files from the directory
    loader = DirectoryLoader(
        path=directory,
        glob="**/*.py",  # Only .py files
        exclude=["**/__pycache__/**", "**/*.pyc"]
    )
    documents = loader.load()
    
    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        separators=['\n\n', '\n', ' ', '']
    )
    chunks = text_splitter.split_documents(documents)
    
    # Create embeddings for each chunk
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(chunks, embeddings)
    
    return vectorstore

# Index the codebase
codebase_directory = r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\mobile_insight'
vectorstore = index_codebase(codebase_directory)

# Step 2: Load a real example analyzer code
def load_real_analyzer(file_path):
    with open(file_path, 'r') as f:
        real_analyzer_code = f.read()
    return real_analyzer_code

# Load one example real analyzer code
real_analyzer_path = r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\testcases\attach_stats\attach_stats.py'  # Update with the actual path to your real analyzer file
real_analyzer_code = load_real_analyzer(real_analyzer_path)

# Step 3: Create the prompt template for modifying the real analyzer code
prompt_template = PromptTemplate(
    input_variables=["real_analyzer_code", "retrieved_context"],
    template="""
Below are relevant parts of a Python codebase that provide useful context:

{retrieved_context}

Here is an example of an existing analyzer Python file from the codebase:

Real Analyzer Code:
{real_analyzer_code}

Using the codebase context and the real analyzer code as a reference, create a slightly modified version of the analyzer. \
    The new analyzer should perform a similar analysis but with some changes, such as adjusting metrics, altering data processing,\
    or applying a different calculation. Ensure the modified analyzer remains functional and consistent with the codebase's style and structure.

NOTE: ONLY PROVIDE PYTHON CODE, DO NOT ADD ANY OTHER TEXT BEFORE OR AFTER AS THIS OUTPUT IS BEING SAVED DIRECTLY INTO A PY FILE.

Generated Modified Analyzer:
"""
)

# Step 4: Initialize the LLM
llm = ChatOpenAI(model_name='gpt-4o', temperature=0.7)

def generate_modified_analyzers(num_examples, vectorstore, real_analyzer_code):
    modified_analyzers = []
    chain = LLMChain(llm=llm, prompt=prompt_template)
    
    for _ in range(num_examples):
        # Retrieve relevant code snippets
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
        relevant_docs = retriever.get_relevant_documents(real_analyzer_code)
        retrieved_context = "\n\n".join([doc.page_content for doc in relevant_docs])
        
        # Prepare the inputs for the LLM
        inputs = {
            "real_analyzer_code": real_analyzer_code,
            "retrieved_context": retrieved_context
        }

        # Generate the modified analyzer code
        result = chain.invoke(inputs)
        modified_analyzer_code = result['text'].strip()

        # Append the modified analyzer code to the list
        modified_analyzers.append(modified_analyzer_code)

    return modified_analyzers

In [12]:
num_examples_to_generate = 2  # Adjust as needed
real_examples_folder = r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\mobile_insight\examples'
modified_analyzers = []
i=0
for py_file in os.listdir(real_examples_folder):
    if py_file.endswith('.py'):
        file_path = os.path.join(real_examples_folder, py_file)
        with open(file_path, 'r') as f:
                real_analyzer_code = f.read()
        modified_analyzers += generate_modified_analyzers(num_examples_to_generate, vectorstore, real_analyzer_code)
        print(i)
        i +=1
        if i == 8:
             break

# Step 7: Save the modified analyzers to files (optional)
output_directory = r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\modified_generated_dataset'
os.makedirs(output_directory, exist_ok=True)

for idx, code in enumerate(modified_analyzers):
    filename = os.path.join(output_directory, f'modified_analyzer_{idx + 84 + 1}.py')
    code = code.replace("```python", "", 1)
    code = code.replace("```", "", 1)
    with open(filename, 'w') as f:
        f.write(code)
    print(f"Modified analyzer {idx + 84 + 1} saved to {filename}")

print("All modified analyzers generated and saved.")


0
1
2
3
4
5
6
7
Modified analyzer 85 saved to C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\modified_generated_dataset\modified_analyzer_85.py
Modified analyzer 86 saved to C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\modified_generated_dataset\modified_analyzer_86.py
Modified analyzer 87 saved to C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\modified_generated_dataset\modified_analyzer_87.py
Modified analyzer 88 saved to C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\modified_generated_dataset\modified_analyzer_88.py
Modified analyzer 89 saved to C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\modified_generated_dataset\modified_analyzer_89.py
Modified analyzer 90 saved to C:\Users\bhull\Desktop\U

In [13]:
### MODIFIED GENERATION PROMPTS ANALYZER PAIRS SCRIPT

import json

# Step 1: Load real prompt-code pairs from folders
def load_prompt_code_pairs(directory):
    prompt_code_pairs = []
    
    # Each subdirectory contains a prompt (txt) and code (py) file
    for folder_name in os.listdir(directory):
        if folder_name == 'logs':
            continue
        folder_path = os.path.join(directory, folder_name)
        
        if os.path.isdir(folder_path):
            # Find the txt and py files
            prompt_file = next((f for f in os.listdir(folder_path) if f.endswith('.txt')), None)
            code_file = next((f for f in os.listdir(folder_path) if f.endswith('.py')), None)
            
            if prompt_file and code_file:
                # Load the prompt
                with open(os.path.join(folder_path, prompt_file), 'r') as f:
                    prompt = f.read().strip()
                
                # Load the code
                with open(os.path.join(folder_path, code_file), 'r') as f:
                    code = f.read().strip()
                
                # Append to the list as a dict
                prompt_code_pairs.append({
                    'prompt': prompt,
                    'code': code
                })
    
    return prompt_code_pairs

# Load real prompt-code pairs
real_examples_directory = r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\testcases'
real_prompt_code_pairs = load_prompt_code_pairs(real_examples_directory)

# Step 2: Create the prompt template for generating new prompts based on Python code
prompt_template = PromptTemplate(
    input_variables=["example1_prompt", "example1_code", "example2_prompt", "example2_code", "generated_code"],
    template="""
Below are two examples of prompt-code pairs:

Example 1:
Prompt: {example1_prompt}
Code:
{example1_code}

Example 2:
Prompt: {example2_prompt}
Code:
{example2_code}

Given the code below, generate a prompt for it following the structure of the 2 examples:

Code:
{generated_code}

Generated Prompt:
"""
)

# Step 3: Initialize the LLM
llm = ChatOpenAI(model="gpt-4", temperature=0.7)

# Step 4: Function to generate prompts based on Python code
def generate_prompts_for_code(generated_code_folder, real_examples):
    generated_prompts = {}
    chain = LLMChain(llm=llm, prompt=prompt_template)

    # Iterate over all .py files in the generated code folder
    for py_file in os.listdir(generated_code_folder):
        if py_file.endswith('.py'):
            file_path = os.path.join(generated_code_folder, py_file)

            # Load the generated Python code
            with open(file_path, 'r') as f:
                generated_code = f.read()

            # Use the first two real examples as in-context examples
            example1 = real_examples[0]
            example2 = real_examples[1]

            # Prepare the inputs for the LLM
            inputs = {
                "example1_prompt": example1['prompt'],
                "example1_code": example1['code'],
                "example2_prompt": example2['prompt'],
                "example2_code": example2['code'],
                "generated_code": generated_code
            }

            # Generate the prompt for the current Python file
            result = chain.invoke(inputs)
            generated_prompt = result['text'].strip()

            # Store the generated prompt and the corresponding Python code in the dictionary
            generated_prompts[generated_prompt] = generated_code

    return generated_prompts

# Step 5: Define the folder where all generated Python files are saved
generated_code_folder = r'C:\Users\bhull\Desktop\UCLA Grad\Spring 2024\CS 219\219_final_project\LLM-assisted_mobile_trace_analysis\modified_generated_dataset'

# Step 6: Generate the prompts for the Python files
generated_dataset = generate_prompts_for_code(generated_code_folder, real_prompt_code_pairs)

# Step 7: Save the generated dataset (optional, saving as JSON)
with open('modified_synthetic_dataset.json', 'w') as f:
    json.dump(generated_dataset, f, indent=4)

print("Modified synthetic dataset created and saved.")


Modified synthetic dataset created and saved.
