In [3]:
import pandas as pd

In [10]:
import pandas as pd
import re

# File path
file_path = 'AdjacentCounties'

# Reading the file
with open(file_path, 'r') as file:
    data = file.read()

# Initialize an empty list to store the processed data
county_data = []
current_county = None
current_code = None

# Process each line in the file data
for line in data.strip().splitlines():
    line = line.strip()
    if not line:
        continue

    # Match for main county lines (non-indented) using regex for pattern consistency
    match_main = re.match(r'"(.+?), IN"\s+(\d+)\s+".+?"\s+(\d+)', line)
    if match_main:
        # Extract the main county details only if it is an Indiana county
        current_county = match_main.group(1)
        current_code = match_main.group(2)
    else:
        # Process indented lines for adjacent counties
        match_adjacent = re.match(r'"\s*(.+?)"\s+(\d+)', line)
        if match_adjacent and current_county and current_code:
            adjacent_county = match_adjacent.group(1)
            adjacent_code = match_adjacent.group(2)
            county_data.append([current_county, current_code, adjacent_county, adjacent_code])

# Create DataFrame with Indiana counties only
df = pd.DataFrame(county_data, columns=["County", "County_Code", "Adjacent_County", "Adjacent_County_Code"])

# Get DataFrame with Indiana counties only in the Adjacent_County column
df = df[df["Adjacent_County"].str.endswith(", IN")]
df["Adjacent_County"] = df["Adjacent_County"].str.replace(", IN", "")

In [11]:
adjacent = df

In [12]:
adjacent.head()

Unnamed: 0,County,County_Code,Adjacent_County,Adjacent_County_Code
0,Adams County,18001,Allen County,18003
1,Adams County,18001,Jay County,18075
2,Adams County,18001,Wells County,18179
5,Allen County,18003,Allen County,18003
6,Allen County,18003,DeKalb County,18033


In [13]:
# Load populations
# Reading the data from the specified file 'county_data.csv'

# File path
file_path = 'county_data.csv'

# Reading the data from the file
pop_df = pd.read_csv(file_path)

# Add ' County' to end of name column
pop_df['name'] = pop_df['name'] + ' County'

pop_df.head()

Unnamed: 0,id,name,name_ascii,type,county_fips,state_id,state_name,lat,lng,population,metadata_tag
0,1,Whitley County,Whitley,County,18183,IN,Indiana,41.1394,-85.5051,34048,0
1,2,White County,White,County,18181,IN,Indiana,40.7498,-86.8655,24593,0
2,3,Wells County,Wells,County,18179,IN,Indiana,40.7292,-85.2212,28103,0
3,4,Wayne County,Wayne,County,18177,IN,Indiana,39.8644,-85.0098,66588,0
4,5,Washington County,Washington,County,18175,IN,Indiana,38.6,-86.1053,28025,0


In [42]:
# Sum of all populations divide by 9 (number of counties in Indiana)
ideal_population = pop_df['population'].sum() / 9

# Calculate the min and max based on 5% tolerance
min_population = ideal_population - (ideal_population * 0.1)
max_population = ideal_population + (ideal_population * 0.1)

ideal_population, min_population, max_population

(750148.8888888889, 675134.0, 825163.7777777778)

In [15]:
import pandas as pd

# Merge to add main county data (lat, lng, population)
adjacent = pd.merge(
    adjacent, 
    pop_df[['name', 'lat', 'lng', 'population']], 
    left_on='County', 
    right_on='name', 
    how='left'
).rename(columns={'lat': 'main_lat', 'lng': 'main_lng', 'population': 'main_population'})

# Drop the extra 'name' column
adjacent = adjacent.drop(columns=['name'])

# Merge again to add adjacent county data (lat, lng, population)
adjacent = pd.merge(
    adjacent, 
    pop_df[['name', 'lat', 'lng', 'population']], 
    left_on='Adjacent_County', 
    right_on='name', 
    how='left'
).rename(columns={'lat': 'adj_lat', 'lng': 'adj_lng', 'population': 'adj_population'})

# Drop the extra 'name' column from the second merge
adjacent = adjacent.drop(columns=['name'])

# Display the updated DataFrame
adjacent.head()


Unnamed: 0,County,County_Code,Adjacent_County,Adjacent_County_Code,main_lat,main_lng,main_population,adj_lat,adj_lng,adj_population
0,Adams County,18001,Allen County,18003,40.7457,-84.9366,35685,41.0909,-85.0666,381839
1,Adams County,18001,Jay County,18075,40.7457,-84.9366,35685,40.438,-85.0057,20570
2,Adams County,18001,Wells County,18179,40.7457,-84.9366,35685,40.7292,-85.2212,28103
3,Allen County,18003,Allen County,18003,41.0909,-85.0666,381839,41.0909,-85.0666,381839
4,Allen County,18003,DeKalb County,18033,41.0909,-85.0666,381839,41.3976,-84.9991,43059


In [16]:
from math import pi, sin, cos, asin, sqrt

def degrees_to_radians(x):
     return((pi/180)*x)
     
def lon_lat_distance_miles(lon_a,lat_a,lon_b,lat_b):
    radius_of_earth = 24872/(2*pi)
    c = sin((degrees_to_radians(lat_a) - \
    degrees_to_radians(lat_b))/2)**2 + \
    cos(degrees_to_radians(lat_a)) * \
    cos(degrees_to_radians(lat_b)) * \
    sin((degrees_to_radians(lon_a) - \
    degrees_to_radians(lon_b))/2)**2
    return(2 * radius_of_earth * (asin(sqrt(c))))    

def lon_lat_distance_meters (lon_a,lat_a,lon_b,lat_b):
    return(lon_lat_distance_miles(lon_a,lat_a,lon_b,lat_b) * 1609.34) 
    

adjacent['distance_miles'] = adjacent.apply(
    lambda row: lon_lat_distance_miles(row['main_lat'], row['main_lng'], row['adj_lat'], row['adj_lng']),
    axis=1
)

In [None]:
adjacent.head()

Unnamed: 0,County,County_Code,Adjacent_County,Adjacent_County_Code,main_lat,main_lng,main_population,adj_lat,adj_lng,adj_population,distance_miles
0,Adams County,18001,Allen County,18003,40.7457,-84.9366,35685,41.0909,-85.0666,381839,9.218758
1,Adams County,18001,Jay County,18075,40.7457,-84.9366,35685,40.438,-85.0057,20570,5.124827
2,Adams County,18001,Wells County,18179,40.7457,-84.9366,35685,40.7292,-85.2212,28103,19.662941
3,Allen County,18003,Allen County,18003,41.0909,-85.0666,381839,41.0909,-85.0666,381839,0.0
4,Allen County,18003,DeKalb County,18033,41.0909,-85.0666,381839,41.3976,-84.9991,43059,5.011402


In [76]:
county_pop = adjacent[['County_Code', 'main_population']]
county_pop = county_pop.set_index('County_Code')['main_population'].to_dict()
county_pop

{'18001': 35685,
 '18003': 381839,
 '18005': 81759,
 '18007': 8687,
 '18009': 12139,
 '18011': 69839,
 '18013': 15444,
 '18015': 20288,
 '18017': 37918,
 '18019': 120185,
 '18021': 26397,
 '18023': 33010,
 '18025': 10511,
 '18027': 33281,
 '18029': 50494,
 '18031': 26466,
 '18033': 43059,
 '18035': 112480,
 '18037': 43474,
 '18039': 206314,
 '18041': 23393,
 '18043': 79594,
 '18045': 16422,
 '18047': 22769,
 '18049': 20400,
 '18051': 33017,
 '18053': 66802,
 '18055': 30924,
 '18057': 341616,
 '18059': 78616,
 '18061': 39516,
 '18063': 172100,
 '18065': 48857,
 '18067': 83349,
 '18069': 36572,
 '18071': 45948,
 '18073': 33006,
 '18075': 20570,
 '18077': 33000,
 '18079': 27619,
 '18081': 159739,
 '18083': 36362,
 '18085': 80151,
 '18087': 40085,
 '18089': 495925,
 '18091': 112184,
 '18093': 45133,
 '18095': 130037,
 '18097': 969542,
 '18099': 46175,
 '18101': 9885,
 '18103': 36100,
 '18105': 140189,
 '18107': 37967,
 '18109': 71394,
 '18111': 13865,
 '18113': 47293,
 '18115': 5931,
 '181

In [None]:
# Initialize an empty dictionary to hold the result
dict = {}

# Iterate over each row in the DataFrame
for _, row in adjacent.iterrows():
    # Extract main county information
    main_county = row['County_Code']
    
    # Calculate distance (assuming you have a function to do this)
    distance = lon_lat_distance_miles(row['main_lng'], row['main_lat'], row['adj_lng'], row['adj_lat'])
    
    # Structure for each adjacent county
    adjacent_info = {
        'Adjacent_County_Code': row['Adjacent_County_Code'],
        'Distance': distance
    }
    
    # If the main county is not yet a key in the dictionary, add it
    if main_county not in dict:
        dict[main_county] = []
    
    # Append the adjacent county info to the list for this county
    dict[main_county].append(adjacent_info)

# Display the adjacency dictionary
dict

{'18001': [{'Adjacent_County_Code': '18003', 'Distance': 24.796349193706693},
  {'Adjacent_County_Code': '18075', 'Distance': 21.565539067531635},
  {'Adjacent_County_Code': '18179', 'Distance': 14.9421243660823}],
 '18003': [{'Adjacent_County_Code': '18003', 'Distance': 0.0},
  {'Adjacent_County_Code': '18033', 'Distance': 21.477734667532314},
  {'Adjacent_County_Code': '18069', 'Distance': 28.46955246430768},
  {'Adjacent_County_Code': '18113', 'Distance': 28.00369041650233},
  {'Adjacent_County_Code': '18179', 'Distance': 26.260837334188672},
  {'Adjacent_County_Code': '18183', 'Distance': 23.06892348165851}],
 '18005': [{'Adjacent_County_Code': '18013', 'Distance': 17.67019988610346},
  {'Adjacent_County_Code': '18031', 'Distance': 22.329820068143857},
  {'Adjacent_County_Code': '18071', 'Distance': 22.017787257580256},
  {'Adjacent_County_Code': '18079', 'Distance': 20.436152828816397},
  {'Adjacent_County_Code': '18081', 'Distance': 22.445119008847804},
  {'Adjacent_County_Code':

# Build Linear Programming Problem



Our objective is to maximize compactness (minimize distance between counties in the same district). Our constraints are:

• Each county must be in exactly one district.  
• The population of each district must be within 5% of the ideal population (the mean of the total population divided by the number of districts). These are currently stored in ideal_population, min_population, max_population variables. 
• The number of districts must be 9 (the current number of districts in Indiana).  

In [121]:
# Import PuLP library
from pulp import LpVariable, LpBinary, LpProblem, LpMinimize, lpSum, LpStatus

# Define the number of districts
num_districts = 9

# Initialize the model
model = LpProblem("Indiana Counties", LpMinimize)

# Create binary variables for county-district assignments
district_assignments = {
    county: {d: LpVariable(f"assign_{county}_{d}", cat="Binary") for d in range(1, num_districts + 1)}
    for county in dict.keys()}

# Objective function: minimize the total distance between counties in the same district
model += lpSum(
    district_assignments[county][d] * adj['Distance']
    for county, adjacents in dict.items()
    for adj in adjacents
    for d in range(1, num_districts + 1)
    if adj['Adjacent_County_Code'] in district_assignments
)

# Constraint: each county must be in exactly one district
for county in dict.keys():
    model += lpSum(district_assignments[county][d] for d in range(1, num_districts + 1)) == 1

# Calculate total population and ideal population per district
total_population = sum(county_pop.values())
ideal_population = total_population / num_districts
min_population = 0.7 * ideal_population
max_population = 1.3 * ideal_population

# Constraint: population of each district must be within 30% of the ideal population
for d in range(1, num_districts + 1):
    model += lpSum(
        county_pop[county] * district_assignments[county][d]
        for county in dict.keys()
    ) >= min_population
    model += lpSum(
        county_pop[county] * district_assignments[county][d]
        for county in dict.keys()
    ) <= max_population

# ADDITIONAL ADJACENCY CONSTRAINT: counties in the same district must be adjacent ** we tried this but it caused the solver to output zero rows **
#for county, adjacents in dict.items():
    #for adj in adjacents:
        #for d in range(1, num_districts + 1):
            #model += district_assignments[county][d] <= district_assignments[adj['Adjacent_County_Code']][d]

# Solve the model
model.solve()

# Initialize an empty list to store the results
results = []

# Print the results
for county in dict.keys():
    for d in range(1, num_districts + 1):
        if district_assignments[county][d].varValue == 1:
            print(f"County {county} is assigned to district {d}")
            results.append({'County': county, 'District': d})
        
# Convert the results to a DataFrame
results_df = pd.DataFrame(results)
# Display the DataFrame
print(results_df)



Welcome to the CBC MILP Solver 
Version: 2.10.10 
Build Date: Aug  1 2023 

command line - cbc /var/folders/01/w3k5_y_x4mg_4y82c4n3pz0c0000gn/T/5dd2f4c09e57407dadd02f48b8421cf5-pulp.mps timeMode elapsed branch printingOptions all solution /var/folders/01/w3k5_y_x4mg_4y82c4n3pz0c0000gn/T/5dd2f4c09e57407dadd02f48b8421cf5-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 115 COLUMNS
At line 5084 RHS
At line 5195 BOUNDS
At line 6024 ENDATA
Problem MODEL has 110 rows, 828 columns and 2484 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 12900.9 - 0.00 seconds
Cgl0005I 92 SOS with 828 members
Cgl0004I processed model has 101 rows, 828 columns (828 integer (828 of which binary)) and 1656 elements
Cbc0038I Initial state - 3 integers unsatisfied sum - 0.270178
Cbc0038I Pass   1: suminf.    0.27018 (4) obj. 12900.9 iterations 20
Cbc0038I Pass   2: suminf.    0.84630 (3) obj. 12900.9 iteratio



In [114]:
results_df = results_df.rename(columns = {"County": "County_Code"})
results_df

Unnamed: 0,County_Code,District
0,18001,5
1,18003,8
2,18005,6
3,18007,8
4,18009,7
...,...,...
87,18175,9
88,18177,1
89,18179,8
90,18181,1


In [112]:
county_codes = adjacent[["County", "County_Code"]]
county_codes = county_codes.drop_duplicates()
county_codes

Unnamed: 0,County,County_Code
0,Adams County,18001
3,Allen County,18003
9,Bartholomew County,18005
15,Benton County,18007
21,Blackford County,18009
...,...,...
462,Washington County,18175
470,Wayne County,18177
474,Wells County,18179
480,White County,18181


In [117]:
results_final = pd.merge(results_df, county_codes, on=['County_Code', 'County_Code'])
results_final
results_final.to_csv("county_district_assignments2.csv")