# Human Like Quesitons

In [7]:
import os
from openai import OpenAI

openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI()

#### Load Existing Questons

In [8]:
from src.gdrive_api.utils import extract_questions, find_and_load_all_problems

parent_folder_path = './notebooks'

all_problems = find_and_load_all_problems(parent_folder_path)
questions = extract_questions(all_problems)
existing_questions = list(questions)
existing_questions


['How can I resample a time series to a monthly frequency using pandas?',
 "In Python's unittest framework, how can I test if two lists contain the same elements, regardless of their order?",
 "I'm trying to simulate projectile motion, but my function only calculates the horizontal distance. Could you help me extend it to also calculate the maximum height reached? Here's what I have so far: \n```python\ndef projectile_distance(velocity, angle):\n    import math\n    g = 9.81\n    distance = (velocity ** 2) * math.sin(2 * math.radians(angle)) / g\n    return distance\n```",
 'Can you show me how to implement a basic greedy algorithm for coin change in Python?',
 'Could you show me the Python code to calculate the coefficients for a linear regression?',
 "Hey, I want to ensure my test cases are clean and maintainable. Can you update my test method to use a setup function for the common test data? Here's the current test: ```class TestMathOperations(unittest.TestCase): def test_multiply(s

#### Load Topic

In [9]:
import json

# open json file "topic_hierarchy.json" into a dict
with open('topic_hierarchy.json') as json_file:
    topic_hierarchy = json.load(json_file)

# Crawl the keys recursively and concatenate each unique path into a separate string and return a list of all unique paths
def crawl_keys(d, sep=' > ', prefix=''):
    paths = []
    for k, v in d.items():
        path = prefix + k
        if isinstance(v, dict) and len(v.keys()) > 0:
            paths.extend(crawl_keys(v, sep, path + sep))
        else:
            paths.append(path)
    return paths

all_topics = crawl_keys(topic_hierarchy)
print(f"Total number of topics: {len(all_topics)}")
all_topics

Total number of topics: 132


['algorithms > by_data_structure > arrays',
 'algorithms > by_data_structure > linked_lists',
 'algorithms > by_data_structure > stacks',
 'algorithms > by_data_structure > queues',
 'algorithms > by_data_structure > trees',
 'algorithms > by_data_structure > graphs',
 'algorithms > by_data_structure > hash_tables',
 'algorithms > by_data_structure > heaps',
 'algorithms > by_data_structure > strings',
 'algorithms > by_data_structure > advanced_data_structures',
 'algorithms > by_topic > dynamic_programming',
 'algorithms > by_topic > famous_algorithms',
 'algorithms > by_topic > greedy_algorithms',
 'algorithms > by_topic > recursion',
 'algorithms > by_topic > searching',
 'algorithms > by_topic > sorting',
 'algorithms > by_topic > math',
 'algorithms > by_topic > bit_manipulation',
 'algorithms > by_topic > geometry',
 'algorithms > by_topic > probability',
 'algorithms > by_topic > game_theory',
 'algorithms > by_topic > divide_and_conquer',
 'algorithms > by_topic > backtracking

#### Max Questions to generate

In [10]:
MAX_QUESTIONS = 20  # The maximum number of questions
generated_questions_count = 0  # Counter for the number of questions generated so far

In [11]:

from src.gdrive_api.utils import generate_human_like_questions
from tqdm import tqdm

problems = []
with tqdm(total=MAX_QUESTIONS) as pbar:
    for topic in all_topics:
        # Stop if we've reached the max limit
        if generated_questions_count >= MAX_QUESTIONS:
            break
        # Generate questions
        questions = generate_human_like_questions(topic, 5, existing_questions)
        for question in questions["questions"]:
            # If we're at the max, break
            if generated_questions_count >= MAX_QUESTIONS:
                break
            problems.append({
                "metadata": {
                    "topic": topic,
                    "type": "query",
                    "difficulty": "Easy",
                    "target_length": 1
                },
                "messages": [
                    {"role": "user", "content": question},
                ]
            })
            generated_questions_count += 1  # Increment for each question
            pbar.update(1)

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [01:45<00:00,  5.26s/it]


In [12]:
from src.gdrive_api.utils import generate_human_like_code_modification_requests


with tqdm(total=MAX_QUESTIONS) as pbar:
    for topic in all_topics:
        # Stop if we've reached the max limit
        if generated_questions_count >= MAX_QUESTIONS:
            break
        questions = generate_human_like_code_modification_requests(topic, 3, existing_questions)
        for question in questions["questions"]:
            # If we're at the max, break
            if generated_questions_count >= MAX_QUESTIONS:
                break
            problems.append({
                "metadata": {
                    "topic": topic,
                    "type": "modification",
                    "difficulty": "Easy",
                    "target_length": 1
                },
                "messages": [
                    {"role": "user", "content": question},
                ]
            })
        generated_questions_count += 1  # Increment for each question
        pbar.update(1)

  0%|          | 0/20 [00:00<?, ?it/s]


In [13]:
problem_titles = []
problem_topic_counts = {}
file_path_to_problem = {}
for problem in problems:
    topic_type = f'{problem["metadata"]["topic"].split(" > ")[-1]}__{problem["metadata"]["type"]}'
    idx = problem_topic_counts.get(topic_type, 0)
    title = f'{topic_type}__{idx}'
    problem_titles.append(title)
    file_path_to_problem[f"{title}.ipynb"] = problem
    problem_topic_counts[topic_type] = idx + 1

In [14]:
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell
import re

for p, t in zip(problems, problem_titles):
    # Determine the parity based on the last digit in the title
    title_number = int(re.search(r'\d+$', t).group())  # Extract the last number from the title
    parity = title_number % 2

    # Create a new notebook
    notebook = new_notebook()

    # Add metadata
    metadata = f"""# Metadata

**Python Topics** - {p["metadata"]["topic"]}

**Type** - {p["metadata"]["type"]}

**Target Number of Turns (User + Assistant)** - 2+
"""
    metadata_cell = new_markdown_cell(metadata)
    notebook.cells.append(metadata_cell)

    # Add conversation header
    conversation_header = "# Conversation"
    conversation_header_cell = new_markdown_cell(conversation_header)
    notebook.cells.append(conversation_header_cell)

    # Append conversation messages
    title = "**User**" if parity else "**Assistant**"
    for message in p["messages"]:
        msg_content = f"""{title}

{message["content"]}
"""
        conversation_message_cell = new_markdown_cell(msg_content)
        notebook.cells.append(conversation_message_cell)

    # Save the notebook
    notebook_path = f'notebooks/test/{t}.ipynb'
     
    try:
        os.makedirs(os.path.dirname(notebook_path))
    except FileExistsError:
        pass
     
    with open(notebook_path, 'w') as f:
        nbformat.write(notebook, f)

In [15]:
import json

with open('notebooks/test/problems.json', 'w') as f:
    json.dump(problems, f, indent=4)

In [16]:
with open('notebooks/test/problems.json') as f:
    problems = json.load(f)

In [17]:
from tqdm import tqdm
from src.gdrive_api.folder_upload import upload_folder
from src.gdrive_api.auth import build_service

service = build_service('creds/google__sa.json')
# destination_folder_url = "https://drive.google.com/drive/u/0/folders/1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9"
# destination_folder_url = "https://drive.google.com/drive/u/2/folders/1sfPFHkXYpKyY41V0pfz3Qw3k4VLy5Hvb"
destination_folder_url = 'https://drive.google.com/drive/folders/1T14cHENpx7j2aS4Yb_3WhnXhMql0zq2y'

uploaded_files = upload_folder(service, 'notebooks/test', destination_folder_url, force_replace = True, is_url=True)

file_path_to_url = {}
with tqdm(total=len(uploaded_files)) as pbar:
    for file_path, file_url in uploaded_files.items():
        if file_url is not None:
            drive_id = file_url.split("id=")[-1].split("&")[0].strip()
            colab_url = f"https://colab.research.google.com/drive/{drive_id}"
            file_path_to_url[file_path] = colab_url
        else:
            print(f"Skipped uploading {file_path}")
        pbar.update(1)

for file_path in file_path_to_url.keys():
    if file_path == "problems.json":
        continue
    problem = file_path_to_problem[file_path]
    problem["metadata"]["colab_url"] = file_path_to_url[file_path]
    problem["metadata"]["file_path"] = file_path
    problem["metadata"]["batch_idx"] = "test"

------------------------------------------------------------
Processing directory .: 1 of 0 in total.
Uploading file 1 of 21 in '.', 1 of 21 in total.
Replacing existing file 'arrays__query__0.ipynb' with the new version.
File 'arrays__query__0.ipynb' has been replaced.
Uploaded 'arrays__query__0.ipynb' to folder ID '1T14cHENpx7j2aS4Yb_3WhnXhMql0zq2y'.
arrays__query__0.ipynb
Uploading file 2 of 21 in '.', 2 of 21 in total.
Replacing existing file 'arrays__query__1.ipynb' with the new version.
File 'arrays__query__1.ipynb' has been replaced.
Uploaded 'arrays__query__1.ipynb' to folder ID '1T14cHENpx7j2aS4Yb_3WhnXhMql0zq2y'.
arrays__query__1.ipynb
Uploading file 3 of 21 in '.', 3 of 21 in total.
Replacing existing file 'arrays__query__2.ipynb' with the new version.
File 'arrays__query__2.ipynb' has been replaced.
Uploaded 'arrays__query__2.ipynb' to folder ID '1T14cHENpx7j2aS4Yb_3WhnXhMql0zq2y'.
arrays__query__2.ipynb
Uploading file 4 of 21 in '.', 4 of 21 in total.
Replacing existing fi

100%|██████████| 21/21 [00:00<?, ?it/s]


In [18]:
problems = list(file_path_to_problem.values())

with open('notebooks/test/problems.json', 'w') as f:
    json.dump(problems, f, indent=4)

In [19]:
# from google.oauth2 import service_account
# from googleapiclient.discovery import build

# # Path to your service account key file
# SERVICE_ACCOUNT_FILE = 'creds/google__sa.json'

# # The scopes required for the Sheets API
# SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# # The ID of your spreadsheet
# SPREADSHEET_ID = '1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4'


# # Authenticate and build the service
# creds = service_account.Credentials.from_service_account_file(
#         SERVICE_ACCOUNT_FILE, scopes=SCOPES)
# service = build('sheets', 'v4', credentials=creds)

# # Specify the range and values to update
# range_ = 'Conversations_Batch_3!A2:E610'  # For example, this updates cells from A1 to D5 in Sheet1
# values = []

# for problem in problems:
#     values.append([
#         problem["metadata"]["colab_url"],
#         problem["metadata"]["topic"],
#         problem["metadata"]["target_length"],
#         problem["metadata"]["type"],
#         problem["metadata"]["batch_idx"],
#     ])


# body = {
#     'values': values
# }

# # Call the Sheets API to update the range
# request = service.spreadsheets().values().update(spreadsheetId=SPREADSHEET_ID, range=range_, valueInputOption='RAW', body=body)
# response = request.execute()