## Import libraries and data

In [5]:
# Essential libraries for dataset generation and file handling
import os
import pickle
import random
import time
from itertools import permutations, product

import numpy as np
import pandas as pd


## Generate a random dataset for a single AIS

In [None]:
# ================================================
# Generate Random Dataset for Single Invasive Species (Zebra Mussel: zm2019)
# ================================================

your_folder = "C:/Users/hyunwoolee/OneDrive - Virginia Tech/Hyunwoo Research/BZR_EBMC"
output_dir = f"{your_folder}/EBMC_generated/single_dataset_1"
info_data = {}

alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Generate 2-letter county codes (AA, AB, ..., ZZ)
counties_list = [letter1 + letter2 for letter1 in alphabet for letter2 in alphabet]
counties_list = counties_list[:100]  # Limit to 100 counties

# Experiment parameter settings
county_size_set = [2, 3, 5, 8, 10, 15, 20, 25, 30]
num_lakes_per_county_set = [50]
budget_ratio_set = [0.3, 0.5, 0.8]  # the string 'random' draws a ratio per county

# Species columns tracked for every lake
AIS = ['zm2019', 'ss2019', 'ew2019', 'sf2019']

# Fraction of origin-destination pairs removed at random before filtering
delete_rate = 0.2

# Column layout of the edge table; must match the row layout in build_edge_rows
columns = [
    'dow_origin', 'dow_destination', 'weight', 'county_name.origin',
    'zm2019.origin', 'ss2019.origin', 'ew2019.origin', 'sf2019.origin',
    'county_name.destination', 'zm2019.destination', 'ss2019.destination',
    'ew2019.destination', 'sf2019.destination',
    'zm2019.risky', 'ss2019.risky', 'ew2019.risky', 'sf2019.risky', 'bij'
]


def draw_probabilities(counties):
    """Draw the per-county infestation share for each species.

    Only zebra mussel ('zm2019') gets a non-zero share, drawn as
    ``random.randint(1, 5) * 0.2``; the other species are fixed at 0.0.

    Args:
        counties: Iterable of county codes.

    Returns:
        Dict mapping county code -> {species: share}.
    """
    return {
        county: {
            'zm2019': random.randint(1, 5) * 0.2,
            'ss2019': 0.0,
            'ew2019': 0.0,
            'sf2019': 0.0,
        }
        for county in counties
    }


def make_county_lakes(counties, num_lakes_per_county):
    """Build the lake names of every county.

    Lake names are the county code followed by a 1-based index
    (e.g. 'AA1', 'AA2', ...).

    Args:
        counties: Iterable of county codes.
        num_lakes_per_county: Number of lakes to create in each county.

    Returns:
        Dict mapping county code -> list of lake names, in county order.
    """
    return {
        county: [county + str(i) for i in range(1, num_lakes_per_county + 1)]
        for county in counties
    }


def assign_infestation(county_lakes, probabilities, species_list):
    """Randomly mark lakes as infested, county by county and species by species.

    For each county and species, ``int(n_lakes * share)`` lakes are sampled
    without replacement and flagged with 1; all other lakes get 0.

    Args:
        county_lakes: Dict county -> list of lake names.
        probabilities: Dict county -> {species: share of lakes to infest}.
        species_list: Species keys to assign.

    Returns:
        Dict lake -> {species: 0 or 1}, with lakes in county order.
    """
    status = {lake: {} for members in county_lakes.values() for lake in members}
    for county, members in county_lakes.items():
        for species in species_list:
            n_infested = int(len(members) * probabilities[county][species])
            # A set keeps the membership test below O(1) per lake.
            infested = set(random.sample(members, n_infested))
            for lake in members:
                status[lake][species] = int(lake in infested)
    return status


def compute_county_budget(county_lakes, infestation_status, budget_ratio):
    """Compute each county's budget from its number of infested lakes.

    A lake counts as infested when any species flag is positive.

    Args:
        county_lakes: Dict county -> list of lake names.
        infestation_status: Dict lake -> {species: 0 or 1}.
        budget_ratio: Fraction of infested lakes used as budget, or the
            string 'random' to draw a ratio from uniform(0.1, 0.9) per county.

    Returns:
        Dict county -> integer budget (truncated towards zero).
    """
    budget = {}
    for county, members in county_lakes.items():
        n_infested = sum(
            1 for lake in members
            if any(val > 0 for val in infestation_status[lake].values())
        )
        ratio = random.uniform(0.1, 0.9) if budget_ratio == 'random' else budget_ratio
        budget[county] = int(ratio * n_infested)
    return budget


def build_edge_rows(county_lakes, infestation_status, species_list):
    """Create one row per ordered pair of distinct lakes.

    A species is "risky" on an arc when the origin is infested and the
    destination is not; ``bij`` is the number of risky species on the arc.
    Each arc also receives a random integer weight in [10, 20].

    Args:
        county_lakes: Dict county -> list of lake names.
        infestation_status: Dict lake -> {species: 0 or 1}.
        species_list: Species keys, in output column order.

    Returns:
        List of rows laid out as: origin, destination, weight, origin county,
        origin flags, destination county, destination flags, risky flags, bij.
    """
    # Explicit lake -> county lookup instead of slicing the lake name.
    county_of = {
        lake: county for county, members in county_lakes.items() for lake in members
    }
    rows = []
    # Iterate the permutations lazily; the full pair list is never needed.
    for origin_lake, destination_lake in permutations(county_of, 2):
        origin_flags = [infestation_status[origin_lake][s] for s in species_list]
        destination_flags = [infestation_status[destination_lake][s] for s in species_list]
        risky = [
            int(o == 1 and d == 0) for o, d in zip(origin_flags, destination_flags)
        ]
        rows.append([
            origin_lake, destination_lake, random.randint(10, 20),
            county_of[origin_lake], *origin_flags,
            county_of[destination_lake], *destination_flags,
            *risky, sum(risky),
        ])
    return rows


def build_edge_frame(rows, delete_rate):
    """Turn edge rows into the final edge table.

    A random ``delete_rate`` share of all rows is dropped first, then only
    arcs with at least one risky species (``bij >= 1``) are kept.

    Args:
        rows: Rows as produced by build_edge_rows.
        delete_rate: Fraction of rows to remove at random.

    Returns:
        pandas DataFrame with the module-level ``columns`` layout.
    """
    frame = pd.DataFrame(rows, columns=columns)
    dropped = frame.sample(n=int(len(frame) * delete_rate))
    frame = frame.drop(dropped.index)
    return frame[frame['bij'] >= 1]


# Run the generation only when executed directly (a notebook cell or script has
# __name__ == "__main__"), so the helpers above can be imported without writing files.
if __name__ == "__main__":
    # Create the output folder up front so saving cannot fail on a missing directory.
    os.makedirs(output_dir, exist_ok=True)

    # Loop through all combinations of parameters
    for county_size, num_lakes_per_county, budget_ratio in product(
        county_size_set, num_lakes_per_county_set, budget_ratio_set
    ):
        # === Initialize region ===
        counties = counties_list[:county_size]
        probabilities = draw_probabilities(counties)

        # === Generate lakes and their infestation status ===
        county_lakes = make_county_lakes(counties, num_lakes_per_county)
        lakes = [lake for county in counties for lake in county_lakes[county]]
        infestation_status = assign_infestation(county_lakes, probabilities, AIS)

        # === Compute budget per county based on infestation ===
        county_budget = compute_county_budget(county_lakes, infestation_status, budget_ratio)

        # === Create origin-destination arcs and the filtered edge table ===
        data = build_edge_rows(county_lakes, infestation_status, AIS)
        df_edge = build_edge_frame(data, delete_rate)

        # Save the dataset and metadata
        df_edge.to_csv(
            f"{output_dir}/{county_size}_{num_lakes_per_county}_{budget_ratio}.csv"
        )

        info_data[(county_size, num_lakes_per_county, budget_ratio)] = (
            counties, num_lakes_per_county, infestation_status, county_budget
        )

        time.sleep(1.0)  # Pause between files (kept from the original workflow)

    # Save metadata dictionary
    with open(f"{output_dir}/info_data.pickle", 'wb') as fw:
        pickle.dump(info_data, fw)


## Generate a random dataset for multiple AIS

In [None]:
# ================================================
# Generate Random Dataset for Multiple Invasive Species (AIS)
# ================================================

# NOTE(review): this base folder has no "BZR_EBMC" component, unlike the
# single-species cell above — confirm that the difference is intended.
your_folder = "C:/Users/hyunwoolee/OneDrive - Virginia Tech/Hyunwoo Research"
output_dir = f"{your_folder}/EBMC_generated/multi_dataset_1"
info_data = {}

alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Generate 2-letter county codes (AA, AB, ..., ZZ)
counties_list = [letter1 + letter2 for letter1 in alphabet for letter2 in alphabet]
counties_list = counties_list[:100]  # Limit to 100 counties

# Experiment parameter settings
county_size_set = [2, 3, 5, 8, 10, 15, 20, 25, 30]
num_lakes_per_county_set = [50]
budget_ratio_set = [0.3, 0.5, 0.8]  # the string 'random' draws a ratio per county

# Species columns tracked for every lake
AIS = ['zm2019', 'ss2019', 'ew2019', 'sf2019']

# Fraction of origin-destination pairs removed at random before filtering
delete_rate = 0.2

# Column layout of the edge table; must match the row layout in build_edge_rows
columns = [
    'dow_origin', 'dow_destination', 'weight', 'county_name.origin',
    'zm2019.origin', 'ss2019.origin', 'ew2019.origin', 'sf2019.origin',
    'county_name.destination', 'zm2019.destination', 'ss2019.destination',
    'ew2019.destination', 'sf2019.destination',
    'zm2019.risky', 'ss2019.risky', 'ew2019.risky', 'sf2019.risky', 'bij'
]


def draw_probabilities(counties):
    """Draw the per-county infestation share for each species.

    Every species gets a share of ``k * 0.2`` where k is drawn uniformly from
    1..5 (zm2019), 1..4 (ss2019), 1..3 (ew2019) and 1..2 (sf2019).

    Args:
        counties: Iterable of county codes.

    Returns:
        Dict mapping county code -> {species: share}.
    """
    # NOTE(review): the original comments labelled 'ss2019' as spiny water flea
    # and 'sf2019' as starry stonewort; the abbreviations suggest the two labels
    # may be swapped — confirm against the data dictionary.
    return {
        county: {
            'zm2019': random.randint(1, 5) * 0.2,  # Zebra mussel
            'ss2019': random.randint(1, 4) * 0.2,
            'ew2019': random.randint(1, 3) * 0.2,  # Eurasian watermilfoil
            'sf2019': random.randint(1, 2) * 0.2,
        }
        for county in counties
    }


def make_county_lakes(counties, num_lakes_per_county):
    """Build the lake names of every county.

    Lake names are the county code followed by a 1-based index
    (e.g. 'AA1', 'AA2', ...).

    Args:
        counties: Iterable of county codes.
        num_lakes_per_county: Number of lakes to create in each county.

    Returns:
        Dict mapping county code -> list of lake names, in county order.
    """
    return {
        county: [county + str(i) for i in range(1, num_lakes_per_county + 1)]
        for county in counties
    }


def assign_infestation(county_lakes, probabilities, species_list):
    """Randomly mark lakes as infested, county by county and species by species.

    For each county and species, ``int(n_lakes * share)`` lakes are sampled
    without replacement and flagged with 1; all other lakes get 0.

    Args:
        county_lakes: Dict county -> list of lake names.
        probabilities: Dict county -> {species: share of lakes to infest}.
        species_list: Species keys to assign.

    Returns:
        Dict lake -> {species: 0 or 1}, with lakes in county order.
    """
    status = {lake: {} for members in county_lakes.values() for lake in members}
    for county, members in county_lakes.items():
        for species in species_list:
            n_infested = int(len(members) * probabilities[county][species])
            # A set keeps the membership test below O(1) per lake.
            infested = set(random.sample(members, n_infested))
            for lake in members:
                status[lake][species] = int(lake in infested)
    return status


def compute_county_budget(county_lakes, infestation_status, budget_ratio):
    """Compute each county's budget from its number of infested lakes.

    A lake counts as infested when any species flag is positive.

    Args:
        county_lakes: Dict county -> list of lake names.
        infestation_status: Dict lake -> {species: 0 or 1}.
        budget_ratio: Fraction of infested lakes used as budget, or the
            string 'random' to draw a ratio from uniform(0.1, 0.9) per county.

    Returns:
        Dict county -> integer budget (truncated towards zero).
    """
    budget = {}
    for county, members in county_lakes.items():
        n_infested = sum(
            1 for lake in members
            if any(val > 0 for val in infestation_status[lake].values())
        )
        ratio = random.uniform(0.1, 0.9) if budget_ratio == 'random' else budget_ratio
        budget[county] = int(ratio * n_infested)
    return budget


def build_edge_rows(county_lakes, infestation_status, species_list):
    """Create one row per ordered pair of distinct lakes.

    A species is "risky" on an arc when the origin is infested and the
    destination is not; ``bij`` is the number of risky species on the arc.
    Each arc also receives a random integer weight in [10, 20].

    Args:
        county_lakes: Dict county -> list of lake names.
        infestation_status: Dict lake -> {species: 0 or 1}.
        species_list: Species keys, in output column order.

    Returns:
        List of rows laid out as: origin, destination, weight, origin county,
        origin flags, destination county, destination flags, risky flags, bij.
    """
    # Explicit lake -> county lookup instead of slicing the lake name.
    county_of = {
        lake: county for county, members in county_lakes.items() for lake in members
    }
    rows = []
    # Iterate the permutations lazily; the full pair list is never needed.
    for origin_lake, destination_lake in permutations(county_of, 2):
        origin_flags = [infestation_status[origin_lake][s] for s in species_list]
        destination_flags = [infestation_status[destination_lake][s] for s in species_list]
        risky = [
            int(o == 1 and d == 0) for o, d in zip(origin_flags, destination_flags)
        ]
        rows.append([
            origin_lake, destination_lake, random.randint(10, 20),
            county_of[origin_lake], *origin_flags,
            county_of[destination_lake], *destination_flags,
            *risky, sum(risky),
        ])
    return rows


def build_edge_frame(rows, delete_rate):
    """Turn edge rows into the final edge table.

    A random ``delete_rate`` share of all rows is dropped first, then only
    arcs with at least one risky species (``bij >= 1``) are kept.

    Args:
        rows: Rows as produced by build_edge_rows.
        delete_rate: Fraction of rows to remove at random.

    Returns:
        pandas DataFrame with the module-level ``columns`` layout.
    """
    frame = pd.DataFrame(rows, columns=columns)
    dropped = frame.sample(n=int(len(frame) * delete_rate))
    frame = frame.drop(dropped.index)
    return frame[frame['bij'] >= 1]


# Run the generation only when executed directly (a notebook cell or script has
# __name__ == "__main__"), so the helpers above can be imported without writing files.
if __name__ == "__main__":
    # Create the output folder up front so saving cannot fail on a missing directory.
    os.makedirs(output_dir, exist_ok=True)

    # Loop through all combinations of parameters
    for county_size, num_lakes_per_county, budget_ratio in product(
        county_size_set, num_lakes_per_county_set, budget_ratio_set
    ):
        # === Initialize counties and infestation probabilities ===
        counties = counties_list[:county_size]
        probabilities = draw_probabilities(counties)

        # === Generate lakes and their infestation status ===
        county_lakes = make_county_lakes(counties, num_lakes_per_county)
        lakes = [lake for county in counties for lake in county_lakes[county]]
        infestation_status = assign_infestation(county_lakes, probabilities, AIS)

        # === Compute budget per county ===
        county_budget = compute_county_budget(county_lakes, infestation_status, budget_ratio)

        # === Create origin-destination arcs and the filtered edge table ===
        data = build_edge_rows(county_lakes, infestation_status, AIS)
        df_edge = build_edge_frame(data, delete_rate)

        # === Save results ===
        df_edge.to_csv(
            f"{output_dir}/{county_size}_{num_lakes_per_county}_{budget_ratio}.csv"
        )

        info_data[(county_size, num_lakes_per_county, budget_ratio)] = (
            counties, num_lakes_per_county, infestation_status, county_budget
        )

        time.sleep(1.0)  # Pause between files (kept from the original workflow)

    # Save metadata dictionary
    with open(f"{output_dir}/info_data.pickle", 'wb') as fw:
        pickle.dump(info_data, fw)
