**Calculating Reaches**

In [10]:
# Split Final_dataset.csv into one junction_counts_<year>.csv file per year
import pandas as pd

# Load the combined dataset; it must contain a 'year' column, which drives the split below.
df = pd.read_csv("input_folder/Final_dataset.csv")

# Group the rows by their 'year' value; iterating the result yields (year, sub-frame) pairs.
year_groups = df.groupby('year')

# Function to write each year data to a separate file
def write_year_data(year, data, output_folder="input_folder"):
  """Write one year's rows to ``<output_folder>/junction_counts_<year>.csv``.

  Args:
    year: Value embedded in the output file name.
    data (pandas.DataFrame): The rows belonging to that year.
    output_folder (str): Directory to write into. Defaults to "input_folder",
      which is where the reach-calculation cells read the per-year files from.
  """
  # Rows are written exactly as given: no per-id grouping or sorting is applied
  # (a bare data.groupby('id') would return a new object and leave `data` untouched).
  filename = f"{output_folder}/junction_counts_{year}.csv"
  data.to_csv(filename, index=False)

# Hand every (year, rows) group to write_year_data so each year ends up in its own CSV file.
for year, group_data in year_groups:
  write_year_data(year, group_data)

# Only reached when every file was written without raising.
print("Data successfully separated into yearly files!")


Data successfully separated into yearly files!


*Time Invariant*

In [11]:
# Calculate the reaches of the time invariant features

import sys
sys.path.append('../') # This should probably be changed to a more sophisticated system at some point, i.e. install the package

import math

from heapq import heappush, heappop

from ast import literal_eval
from data_wrangler.dataset import Dataset

# Folder the junction CSV is read from, and folder the computed reaches are written to.
INPUT_FOLDER = 'input_folder'
OUTPUT_FOLDER = 'output_folder'

# The time-invariant reaches are computed from the 2008 junction file only.
JUNCTION_FILE = f'{INPUT_FOLDER}/junction_counts_2008.csv'

# Standard deviation passed to normal_dst for the 'crime_reach' property
# (presumably in the same distance units as the neighbor deltas — TODO confirm).
CRIME_SIGMA = 132
# Standard deviation passed to normal_dst for the distance weighting of all other properties.
STANDARD_DEVIATION = 400

# Columns whose raw values are parsed as integers: the junction id plus every count column.
integer_columns = (
    'id',
    'crime_count',
    'store_count',
    'police_count',
    'transit_count',
    'graffiti_count',
    'homeless_shelter_count',
    'traffic_signal_count',
    'street_lighting_poles_count',
    'schools_count',
)

def parse_neighbors(raw):
    """Evaluate the serialized neighbor literal; an empty value means no neighbors."""
    if raw:
        return literal_eval(raw)
    return []

column_parsers = {column: int for column in integer_columns}
column_parsers['neighbors'] = parse_neighbors

junctions = Dataset.load_file(JUNCTION_FILE)
junctions.convert_properties(column_parsers)

def normal_dst(distance, standard_deviation):
    """Evaluate a Gaussian distance kernel.

    Computes ``exp(-distance**2 / (2 * sd**2)) / (2 * pi * sd**2)``, i.e. the
    density of an isotropic two-dimensional normal distribution at radius
    ``distance``.

    Args:
        distance (float): Distance at which to evaluate the kernel.
        standard_deviation (float): Spread of the kernel; must be non-zero.

    Returns:
        float: The kernel value (largest at distance 0, decaying with distance).
    """
    variance = standard_deviation ** 2
    exponent = distance ** 2 / (2 * variance)
    normaliser = 1 / (2 * math.pi * variance)
    return normaliser * math.exp(-exponent)

def reach_dst(distance, scale):
    """Distance-decay weight from a modified version of Borgatti's reach formula.

    The weight is ``1 / (distance / scale + 1) ** 3``:

    * the ``+ 1`` makes the weight at distance zero equal to 1 whatever the scale;
    * the denominator is cubed so that the weighted sum converges.

    TODO: Check that convergence is important and, if it is, whether the
    denominator actually needs to be cubed.

    Args:
        distance (float): Distance to weight.
        scale (float): Value the distance is divided by; must be non-zero.

    Returns:
        float: The decay weight.
    """
    scaled_distance = distance / scale
    return 1 / ((scaled_distance + 1) ** 3)

def calculate_reach(junction, properties, dst_func, limit=float('inf'), all_junctions=None):
    """Accumulate distance-weighted property totals around a single junction.

    Starting from ``junction``, the junction graph is explored in order of
    increasing path length (Dijkstra-style, driven by a min-heap). Every junction
    whose path length is at most ``limit`` adds ``source_value * weight(length)``
    to each requested reach.

    Args:
        junction: Row of the start junction; only ``junction['id']`` is read.
        properties (dict): Maps each output reach name to the name of the source
            property that is summed, e.g. ``{'store_reach': 'store_count'}``.
        dst_func (callable): Maps a path length to a weight. Used for every reach
            except ``'crime_reach'``, which is always weighted with
            ``normal_dst(length, CRIME_SIGMA)``.
        limit (float): Junctions farther away than this contribute nothing and
            are not expanded. Defaults to no limit.
        all_junctions: Collection that returns a junction row when indexed with a
            junction id. Each row must provide the source properties and a
            ``'neighbors'`` iterable of ``(neighbor_id, length, s_id)`` tuples
            (the third element is not used here). Defaults to the notebook-level
            ``junctions`` variable, which is what this function always used before
            the parameter existed.

    Returns:
        dict: Reach name -> accumulated (not yet normalized) reach value.
    """
    if all_junctions is None:
        # Backward-compatible fallback for callers that do not pass the dataset.
        all_junctions = junctions

    # Only evaluate the crime kernel when a crime reach was actually requested.
    uses_crime_kernel = 'crime_reach' in properties

    reaches = {key: 0 for key in properties}
    visited = set()
    queue = [(0, junction['id'])]
    while queue:
        dst, next_jun = heappop(queue)
        # A junction can be queued several times; only its first (shortest) pop counts.
        if next_jun in visited:
            continue
        visited.add(next_jun)
        if dst > limit:
            continue

        row = all_junctions[next_jun]
        scaled_dst = dst_func(dst)
        crime_dst = normal_dst(dst, CRIME_SIGMA) if uses_crime_kernel else None
        for key, source in properties.items():
            weight = crime_dst if key == 'crime_reach' else scaled_dst
            reaches[key] += row[source] * weight

        for neighbor, delta, s_id in row['neighbors']:
            if neighbor in visited:
                continue
            heappush(queue, (dst + delta, neighbor))
    return reaches

def calculate_reaches(junctions, properties, dst_func, limit=float('inf')):
    """Compute the requested reaches for every junction and normalize them in place.

    Each junction row receives one new entry per key of ``properties``. After all
    junctions are processed, every reach is divided by the largest value observed
    for that reach, so the maximum becomes 1. Progress is printed to stdout.

    Args:
        junctions: Iterable, sized and id-indexable collection of junction rows.
            It is both iterated and used as the graph that reaches are computed on.
        properties (dict): Reach name -> source property name (see ``calculate_reach``).
        dst_func (callable): Path length -> weight (see ``calculate_reach``).
        limit (float): Maximum path length considered. Defaults to no limit.

    Returns:
        None. The rows of ``junctions`` are modified in place.
    """
    highest = {key: 0 for key in properties}
    total = len(junctions)

    for i, junction in enumerate(junctions):
        # Pass the dataset explicitly so the reaches are computed on the collection
        # given to this function rather than on whatever a global happens to hold.
        reaches = calculate_reach(junction, properties, dst_func, limit, all_junctions=junctions)
        for key, value in reaches.items():
            junction[key] = value
            highest[key] = max(highest[key], value)

        if (i + 1) % 100 == 0:
            print(f'\rCalculated {i+1}/{total}           ', end='')
    print(f'\rCalculated {total}/{total}        ')
    print("Normalizing")
    for junction in junctions:
        for key in properties:
            # A maximum of 0 means no junction accumulated a positive value for this
            # reach; dividing would raise ZeroDivisionError, so leave the values as is.
            if highest[key]:
                junction[key] /= highest[key]
    print("Done")
    
# Reach name -> source count column for the features that do not change over time.
# The crime, store, police and transit reaches are deliberately left out here;
# they are computed per year in the time-variant cell.
time_invariant_properties = {
    'graffiti_reach': 'graffiti_count',
    'homeless_shelter_reach': 'homeless_shelter_count',
    'traffic_signal_reach': 'traffic_signal_count',
    'street_lighting_poles_reach': 'street_lighting_poles_count',
    'schools_reach': 'schools_count',
}

def time_invariant_weight(dst):
    """Gaussian distance weight shared by all time-invariant reaches."""
    return normal_dst(dst, STANDARD_DEVIATION)

calculate_reaches(junctions, time_invariant_properties, time_invariant_weight, limit=1000)

output_path = f'{OUTPUT_FOLDER}/reach_junctions_time_invariant.csv'
junctions.write_to_file(output_path)

Calculated 6179/6179           
Normalizing
Done


*Time Variant*

In [17]:
# Calculate the reaches of the time variant features for each year

import sys
sys.path.append('../') # This should probably be changed to a more sophisticated system at some point, i.e. install the package

import math

from heapq import heappush, heappop

from ast import literal_eval
from data_wrangler.dataset import Dataset

def normal_dst(distance, standard_deviation):
    """Evaluate a Gaussian distance kernel.

    Computes ``exp(-distance**2 / (2 * sd**2)) / (2 * pi * sd**2)``, i.e. the
    density of an isotropic two-dimensional normal distribution at radius
    ``distance``.

    Args:
        distance (float): Distance at which to evaluate the kernel.
        standard_deviation (float): Spread of the kernel; must be non-zero.

    Returns:
        float: The kernel value (largest at distance 0, decaying with distance).
    """
    variance = standard_deviation ** 2
    exponent = distance ** 2 / (2 * variance)
    normaliser = 1 / (2 * math.pi * variance)
    return normaliser * math.exp(-exponent)

def reach_dst(distance, scale):
    """Distance-decay weight from a modified version of Borgatti's reach formula.

    The weight is ``1 / (distance / scale + 1) ** 3``:

    * the ``+ 1`` makes the weight at distance zero equal to 1 whatever the scale;
    * the denominator is cubed so that the weighted sum converges.

    TODO: Check that convergence is important and, if it is, whether the
    denominator actually needs to be cubed.

    Args:
        distance (float): Distance to weight.
        scale (float): Value the distance is divided by; must be non-zero.

    Returns:
        float: The decay weight.
    """
    scaled_distance = distance / scale
    return 1 / ((scaled_distance + 1) ** 3)

def calculate_reach(junction, properties, dst_func, limit=float('inf'), all_junctions=None):
    """Accumulate distance-weighted property totals around a single junction.

    Starting from ``junction``, the junction graph is explored in order of
    increasing path length (Dijkstra-style, driven by a min-heap). Every junction
    whose path length is at most ``limit`` adds ``source_value * weight(length)``
    to each requested reach.

    Args:
        junction: Row of the start junction; only ``junction['id']`` is read.
        properties (dict): Maps each output reach name to the name of the source
            property that is summed, e.g. ``{'store_reach': 'store_count'}``.
        dst_func (callable): Maps a path length to a weight. Used for every reach
            except ``'crime_reach'``, which is always weighted with
            ``normal_dst(length, CRIME_SIGMA)``.
        limit (float): Junctions farther away than this contribute nothing and
            are not expanded. Defaults to no limit.
        all_junctions: Collection that returns a junction row when indexed with a
            junction id. Each row must provide the source properties and a
            ``'neighbors'`` iterable of ``(neighbor_id, length, s_id)`` tuples
            (the third element is not used here). Defaults to the notebook-level
            ``junctions`` variable, which is what this function always used before
            the parameter existed.

    Returns:
        dict: Reach name -> accumulated (not yet normalized) reach value.
    """
    if all_junctions is None:
        # Backward-compatible fallback for callers that do not pass the dataset.
        all_junctions = junctions

    # Only evaluate the crime kernel when a crime reach was actually requested.
    uses_crime_kernel = 'crime_reach' in properties

    reaches = {key: 0 for key in properties}
    visited = set()
    queue = [(0, junction['id'])]
    while queue:
        dst, next_jun = heappop(queue)
        # A junction can be queued several times; only its first (shortest) pop counts.
        if next_jun in visited:
            continue
        visited.add(next_jun)
        if dst > limit:
            continue

        row = all_junctions[next_jun]
        scaled_dst = dst_func(dst)
        crime_dst = normal_dst(dst, CRIME_SIGMA) if uses_crime_kernel else None
        for key, source in properties.items():
            weight = crime_dst if key == 'crime_reach' else scaled_dst
            reaches[key] += row[source] * weight

        for neighbor, delta, s_id in row['neighbors']:
            if neighbor in visited:
                continue
            heappush(queue, (dst + delta, neighbor))
    return reaches

def calculate_reaches(junctions, properties, dst_func, limit=float('inf')):
    """Compute the requested reaches for every junction and normalize them in place.

    Each junction row receives one new entry per key of ``properties``. After all
    junctions are processed, every reach is divided by the largest value observed
    for that reach, so the maximum becomes 1. Progress is printed to stdout.

    Args:
        junctions: Iterable, sized and id-indexable collection of junction rows.
            It is both iterated and used as the graph that reaches are computed on.
        properties (dict): Reach name -> source property name (see ``calculate_reach``).
        dst_func (callable): Path length -> weight (see ``calculate_reach``).
        limit (float): Maximum path length considered. Defaults to no limit.

    Returns:
        None. The rows of ``junctions`` are modified in place.
    """
    highest = {key: 0 for key in properties}
    total = len(junctions)

    for i, junction in enumerate(junctions):
        # Pass the dataset explicitly so the reaches are computed on the collection
        # given to this function rather than on whatever a global happens to hold.
        reaches = calculate_reach(junction, properties, dst_func, limit, all_junctions=junctions)
        for key, value in reaches.items():
            junction[key] = value
            highest[key] = max(highest[key], value)

        if (i + 1) % 100 == 0:
            print(f'\rCalculated {i+1}/{total}           ', end='')
    print(f'\rCalculated {total}/{total}        ')
    print("Normalizing")
    for junction in junctions:
        for key in properties:
            # A maximum of 0 means no junction accumulated a positive value for this
            # reach; dividing would raise ZeroDivisionError, so leave the values as is.
            if highest[key]:
                junction[key] /= highest[key]
    print("Done")

# Folder the per-year junction CSVs are read from, and folder the computed reaches are written to.
INPUT_FOLDER = 'input_folder'
OUTPUT_FOLDER = 'output_folder'

# Standard deviation passed to normal_dst for the 'crime_reach' property
# (presumably in the same distance units as the neighbor deltas — TODO confirm).
CRIME_SIGMA = 132
# Standard deviation passed to normal_dst for the distance weighting of all other properties.
STANDARD_DEVIATION = 400

# Columns whose raw values are parsed as integers: the junction id plus every count column.
yearly_integer_columns = (
    'id',
    'crime_count',
    'store_count',
    'police_count',
    'transit_count',
    'graffiti_count',
    'homeless_shelter_count',
    'traffic_signal_count',
    'street_lighting_poles_count',
    'schools_count',
)

# Reach name -> source count column for the features that change from year to year.
# The graffiti, homeless-shelter, traffic-signal, street-lighting and school reaches
# are left out here; they are handled by the time-invariant cell.
time_variant_properties = {
    'crime_reach': 'crime_count',
    'store_reach': 'store_count',
    'police_reach': 'police_count',
    'transit_reach': 'transit_count',
}

def time_variant_weight(dst):
    """Gaussian distance weight used for every reach except 'crime_reach'."""
    return normal_dst(dst, STANDARD_DEVIATION)

for year in range(2008, 2024):
    junction_file = f'{INPUT_FOLDER}/junction_counts_{year}.csv'

    # Keep the name `junctions`: calculate_reach may resolve junction ids through
    # this notebook-level variable.
    junctions = Dataset.load_file(junction_file)

    # A fresh parser mapping is built for every year, as each load needs its own conversion.
    yearly_parsers = {column: int for column in yearly_integer_columns}
    yearly_parsers['neighbors'] = lambda v: literal_eval(v) if v else []
    junctions.convert_properties(yearly_parsers)

    calculate_reaches(
        junctions,
        dict(time_variant_properties),
        time_variant_weight,
        limit=1000,
    )
    junctions.write_to_file(f'{OUTPUT_FOLDER}/reach_junctions_time_variant_{year}.csv')

Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done
Calculated 6179/6179           
Normalizing
Done


In [18]:
# Convert all csv files in folder to excel
import pandas as pd
import os

# Folder whose CSV files each get an Excel copy written next to them.
folder_path = "output_folder"  # Replace with your actual folder path

# Only directory entries whose name ends in ".csv" are converted.
csv_names = [name for name in os.listdir(folder_path) if name.endswith(".csv")]

for filename in csv_names:
  # Same base name, ".xlsx" extension, same folder.
  base_name, _ = os.path.splitext(filename)
  source_path = os.path.join(folder_path, filename)
  target_path = os.path.join(folder_path, base_name + ".xlsx")

  df = pd.read_csv(source_path)
  # index=False keeps the pandas row index out of the spreadsheet.
  df.to_excel(target_path, index=False)

print(f"Successfully converted all CSV files in {folder_path} to Excel!")


Successfully converted all CSV files in output_folder to Excel!
