In [6]:
import pandas as pd
import csv


In [7]:
# Read the benchmark table, then preview its first five rows (the head() default).
data = pd.read_csv(filepath_or_buffer="GeoAnalystBench.csv")
data.head(n=5)

Unnamed: 0,id,Open Source,Task Categories1,Task Categories2,Task Categories3,Task,Instruction,Domain Knowledge,Dataset Description,Human Designed Workflow,Task Length,CodeString
0,1,T,Making predictions,Understanding where,,Find heat islands and at-risk populations in M...,Your task is analyzing urban heat using Krigin...,Kriging is a commonly used spatial interpolati...,dataset/Temperature.geojson: Geojson file that...,1. Load dataset\n2. Interpolate(Temperature)\n...,7,import numpy as np\nimport geopandas as gpd\nf...
1,2,T,Detecting and quantifying patterns,Finding the best locations and paths,,"Find future bus stop locations in Hamilton, Te...",Your task is performing analysis on public tra...,The Overlay toolset contains tools to overlay ...,dataset/BusServiceArea.geojson: Geojson file s...,1. Load dataset\n2. Filter(Poverty)\n3. Filter...,6,import geopandas as gpd\nimport matplotlib.pyp...
2,3,T,Understanding where,Detecting and quantifying patterns,,Assess burn scars and understanding the impact...,Your task is assessing burn scars using satell...,Normalized Burn Ratio (NBR) is used to identif...,dataset/G_2014.tif: Raster file of satellite i...,1. Load dataset\n2. Filter(2014 bands)\n3. Fil...,6,import rasterio\nimport numpy as np\nimport ma...
3,4,F,Understanding where,Finding the best locations and paths,,Identify groundwater vulnerable areas that nee...,Your task is identifying groundwater vulnerabl...,Suitability modeling is an analytical process ...,"dataset/mc_soils.shp: In this shapefile, three...",1. Load dataset\n2. Project (shapefile)\n3. Po...,10,import arcpy\nfrom arcpy.sa import *\nfrom arc...
4,5,F,Detecting and quantifying patterns,,,Visualize the data about children with elevate...,Your task is visualizing data about children w...,Hot spot analysis is based on mathematical cal...,High_Blood_Level_Results.shp: This shapefile c...,1.Load dataset\n2. Perform Optimized Hot Spot ...,5,import arcpy\n\n# Set up the input shapefiles ...


In [8]:
# Pick one benchmark task to use as the worked example
ID = 1  # 1-based task id; replace with your own

row = data.iloc[ID - 1]
# Only rows flagged 'T' in "Open Source" are non-ArcPy tasks
Arcpy = False if row['Open Source'] == 'T' else True
row

id                                                                         1
Open Source                                                                T
Task Categories1                                          Making predictions
Task Categories2                                         Understanding where
Task Categories3                                                         NaN
Task                       Find heat islands and at-risk populations in M...
Instruction                Your task is analyzing urban heat using Krigin...
Domain Knowledge           Kriging is a commonly used spatial interpolati...
Dataset Description        dataset/Temperature.geojson: Geojson file that...
Human Designed Workflow    1. Load dataset\n2. Interpolate(Temperature)\n...
Task Length                                                                7
CodeString                 import numpy as np\nimport geopandas as gpd\nf...
Name: 0, dtype: object

In [10]:
def add_line_breaks(long_string, char_limit=80):
  """Re-flow a string into lines of roughly `char_limit` characters.

  The text is split on any whitespace and every word is written back followed
  by a single space. Only word characters are counted (the separating spaces
  are not), and the newline is inserted *after* the word that pushes the
  running count past `char_limit`, so a produced line can be longer than
  `char_limit`.

  Args:
    long_string: Text to wrap; must support `.split()`.
    char_limit: Running word-character count above which a newline is added.

  Returns:
    The wrapped text. Each word, including the last one, is followed by a space.
  """
  pieces = []
  running_total = 0
  for token in long_string.split():
    pieces.append(token + " ")
    running_total += len(token)
    if running_total > char_limit:
      # Break after the overflowing word and start counting a fresh line
      pieces.append("\n")
      running_total = 0
  return "".join(pieces)

def long_line_break(long_string):
  """Wrap a multi-line string one line at a time, keeping its own line breaks.

  Args:
    long_string: Text whose "\\n"-separated lines are each passed through
      add_line_breaks. Any non-string value is accepted as well.

  Returns:
    For a string: every wrapped line followed by "\\n", concatenated.
    For anything else: `str(long_string)`.
  """
  if not isinstance(long_string, str):
    return str(long_string)
  return "".join(add_line_breaks(segment) + "\n" for segment in long_string.split("\n"))


# Pull the prompt fields of the selected task and wrap them in one step each.
# "Dataset Description" goes through long_line_break, which wraps line by line
# and also tolerates non-string values.
task = add_line_breaks(row["Task"])
instruction = add_line_breaks(row["Instruction"])
domainKnowledge = add_line_breaks(row["Domain Knowledge"])
dataset = long_line_break(row["Dataset Description"])


In [13]:
def workflowTemplate(IDs=None, tasks=None, instructions=None, zeroShot=False, domainKnowledges=None, datasets=None):
  """Build the prompt asking a model to produce workflow-graph code for a task.

  Args:
    IDs: Not used by the function; kept so existing calls keep working.
    tasks: Text for the [Task] section, or None to leave the section out.
    instructions: Text for the [Instruction] section, or None to leave it out.
    zeroShot: When exactly False, the one-shot sample output is appended.
    domainKnowledges: Text for the [Domain Knowledge] section, or None.
    datasets: Text for the [Dataset Description] section, or None.

  Returns:
    The prompt string, or None (after printing a message) when both `tasks`
    and `instructions` are None.
  """
  # Validate the arguments that were passed in. Checking the notebook-level
  # `task`/`instruction` variables instead would ignore the caller's input and
  # raise NameError whenever those globals do not exist yet.
  if tasks is None and instructions is None:
    print("Task or Instruction is necessary")
    return None
  prompt = {
    "Task": tasks,
    "Instruction": instructions,
    "Domain Knowledge": domainKnowledges,
    "Dataset Description": datasets
  }

  template = """As a Geospatial data scientist, you will generate a workflow to a proposed task.\n""" #Define role
  for key, value in prompt.items():
    if value is not None:
      template += f"\n[{key}]: \n{value}" #include information

  #one shot sample
  sample = """ \n\"\"\"
  tasks = [“task1”, “task2”, “task3”]

  G = nx.DiGraph()
  for i in range(len(tasks) - 1):
      G.add_edge(tasks[i], tasks[i + 1])
  pos = nx.drawing.nx_pydot.graphviz_layout(G, prog="dot")
  plt.figure(figsize=(15, 8))
  nx.draw(G, pos, with_labels=True, node_size=3000, node_color='lightblue', font_size=10, font_weight='bold', arrowsize=20)
  plt.title("Workflow for Analyzing Urban Heat Using Kriging Interpolation", fontsize=14)
  plt.show()\n\"\"\"
  """

  #Key Notes
  template += '\n[Key Notes]:'
  template += '\n1.Use **automatic reasoning** and clearly explain each step (Chain of Thoughts approach).'
  template += '\n2.Using **NetworkX* package for visualization.'
  template += '\n3.Using \'dot\' for graph visualization layout.'
  template += '\n4.Multiple subtasks can be proceeded correspondingly because'
  template += '\nall of their outputs will be inputs for the next subtask.'
  template += "\n5.Limiting your output to code, no extra information."
  template += '\n6.Only codes for workflow, no implementation.'
  template += '\n'
  # Identity check on purpose: only the literal False enables the sample
  if zeroShot is False:
    template += "\n[Expected Sample Output Begin]"
    template += "\n" + sample
    template += "[Expected Sample Output End]"
  return template


In [11]:
def codeTemplate(IDs=None, tasks=None, instructions=None, zeroShot=False, domainKnowledges=None, datasets=None, Arcpy=False):
  """Build the prompt asking a model to write a Python solution file for a task.

  Args:
    IDs: Not used by the function; kept so existing calls keep working.
    tasks: Text for the [Task] section, or None to leave the section out.
    instructions: Text for the [Instruction] section, or None to leave it out.
    zeroShot: When exactly False, the sample code skeleton is appended.
    domainKnowledges: Text for the [Domain Knowledge] section, or None.
    datasets: Text for the [Dataset Description] section, or None.
    Arcpy: When exactly True, key note 5 asks for Arcpy functions; otherwise
      it asks for open source python packages.

  Returns:
    The prompt string, or None (after printing a message) when both `tasks`
    and `instructions` are None.
  """
  # Validate the arguments that were passed in. Checking the notebook-level
  # `task`/`instruction` variables instead would ignore the caller's input and
  # raise NameError whenever those globals do not exist yet.
  if tasks is None and instructions is None:
    print("Task or Instruction is necessary")
    return None
  prompt = {
    "Task": tasks,
    "Instruction": instructions,
    "Domain Knowledge": domainKnowledges,
    "Dataset Description": datasets
  }

  template = """As a Geospatial data scientist, generate a python file to solve the proposed task.\n"""
  for key, value in prompt.items():
    if value is not None:
      template += f"\n[{key}]: \n{value}"

  sample = """ \"\"\"
    import packages

    def main():
      path = "path"
      data = loaddata()
      #code for subtask1
      #code for subtask2
      #code for final task

    if __name__ == "__main__":
      main()
  \"\"\"
  """
  template += '\n\n[Key Notes]:'
  template += '\n1.Use **automatic reasoning** and clearly explain each subtask before performing it (ReAct approach).'
  template += '\n2.Using latest python packages for code generation'
  template += "\n3.Put all code under main function, no helper functions"
  template += "\n4.Limit your output to code, no extra information."
  if Arcpy is True:
    template += "\n5.Use latest **Arcpy** functions only"
  else:
    template += "\n5.Use latest open source python packages only"
  template += '\n'
  # Identity check on purpose: only the literal False enables the sample
  if zeroShot is False:
    template += "\n[Expected Sample Output Begin]"
    template += "\n" + sample
    template += "[Expected Sample Output End]"
  return template

In [14]:
# Build one example of each prompt type for the selected task
sample_code_prompt = codeTemplate(
    tasks=task,
    instructions=instruction,
    zeroShot=True,
    Arcpy=False,
)
sample_workflow_prompt = workflowTemplate(
    tasks=task,
    instructions=instruction,
    zeroShot=False,
)

print(sample_code_prompt)

As a Geospatial data scientist, generate a python file to solve the proposed task.

[Task]: 
Find heat islands and at-risk populations in Madison, Wisconsin 
[Instruction]: 
Your task is analyzing urban heat using Kriging interpolation techniques in Python. The analysis 
should focus on understanding spatial patterns of urban heat islands by using point temperature 
data and interpolating these values across a city. You will have to use a demographic layer to extract 
and enhance the data visualization on the elder group(Age>65). The goal is to load the temperature 
sample data, apply the Kriging method to predict temperature across the urban area, and generate 
a choropleth map showing the average interpolated temperature surface in each census block group. 
Highlighting the area with high interpolated area as well as high density of the elder population. 
The final output should be saved as "pred_results/interpolated_urban_heat.png". 

[Key Notes]:
1.Use **automatic reasoning** and c

In [17]:
# Generate code and workflow prompts for the first NUM_TASKS tasks
NUM_TASKS = 50
PROMPT_CSV_HEADER = ['task_id', 'type', 'domain_knowledge', 'dataset', 'Arcpy', 'prompt_content']

# Each task gets 4 prompts: every pairing of
# (domain knowledge included, dataset description included)
combinations = [
    (False, False),
    (True, False),
    (False, True),
    (True, True)
]

# Open each output file once instead of re-opening it for every row.
# newline='' is what the csv module expects for files it writes (otherwise
# Windows gets an extra "\r" per row), and an explicit UTF-8 encoding keeps the
# non-ASCII quotes in the workflow sample portable across platforms.
with open('code_prompts.csv', 'w', newline='', encoding='utf-8') as code_file, \
        open('workflow_prompts.csv', 'w', newline='', encoding='utf-8') as workflow_file:
    code_writer = csv.writer(code_file)
    workflow_writer = csv.writer(workflow_file)
    code_writer.writerow(PROMPT_CSV_HEADER)
    workflow_writer.writerow(PROMPT_CSV_HEADER)

    # Clamp to the table size so a shorter benchmark file does not raise
    # IndexError; `task_index` also avoids shadowing the builtin `id`.
    for task_index in range(min(NUM_TASKS, len(data))):
        row = data.iloc[task_index]
        Arcpy = row["Open Source"] != 'T'

        task = add_line_breaks(row["Task"])
        instruction = add_line_breaks(row["Instruction"])
        domainKnowledge = add_line_breaks(row["Domain Knowledge"])
        dataset = long_line_break(row["Dataset Description"])

        for domain, dataset_included in combinations:
            # Arguments common to both templates; optional sections are only
            # passed when this combination includes them
            shared_params = {
                'tasks': task,
                'instructions': instruction
            }
            if domain:
                shared_params['domainKnowledges'] = domainKnowledge
            if dataset_included:
                shared_params['datasets'] = dataset

            # Code prompts are zero-shot; workflow prompts carry the one-shot sample
            code_prompt = codeTemplate(zeroShot=True, Arcpy=Arcpy, **shared_params)
            workflow_prompt = workflowTemplate(zeroShot=False, **shared_params)

            # task_id is 1-based
            code_writer.writerow([task_index + 1, 'code', domain, dataset_included, Arcpy, code_prompt])
            workflow_writer.writerow([task_index + 1, 'workflow', domain, dataset_included, Arcpy, workflow_prompt])
