In [1]:
%%capture
pip install overpy

In [2]:
%%capture
pip install geopy osmnx

In [3]:
%%capture
pip install googlemaps

### Importing Necessary Libraries

In [4]:
import pandas as pd
import overpy
import time
import googlemaps
from shapely.geometry import Point
from datetime import datetime
from geopy.distance import great_circle

### Importing Input Data Files

In [5]:
# You may use the entire dataset instead of the sample file
sample_df = pd.read_csv('Data Files/sample_store_coordinates.csv')

# Traffic csv consists of the Average Annual Daily Traffic extracted from Department of Transportation Portal
traffic_df = pd.read_csv('Data Files/Traffic.csv')

In [6]:
sample_df.head()

Unnamed: 0,Latitude,Longitude,Visits
0,42.996667,-85.594391,54697
1,42.242442,-85.587246,38199
2,42.082033,-86.421991,30639
3,42.254734,-84.3596,35253
4,41.631215,-87.927783,24456


In [7]:
traffic_df.head()

Unnamed: 0,Latitude,Longitude,Traffic
0,40.653602,-95.857076,1470
1,40.652688,-95.857171,1450
2,40.652472,-95.861568,1805
3,40.653357,-95.861524,1920
4,40.814015,-96.293697,1355


<hr style="border:2px solid #000"/>

# Nearby Places Counts (within 1, 3 & 5 miles)

In [8]:
# Initialize Overpass API
api = overpy.Overpass()

# Define radii in miles and their conversion to meters for the required radii only
radii_miles_to_meters = {
    1: 1609.34,
    3: 3 * 1609.34,
    5: 5 * 1609.34
}

# Define categorxies and their tags for required categories and radii only
required_categories_tags = {
    "Grocery_Stores_1mile": [
        ("amenity", "supermarket"),
        ("shop", "bakery"),
        ("shop", "convenience"),
        ("shop", "frozen_food"),
        ("shop", "department_store"),
        ("shop", "supermarket"),
        ("shop", "wholesale")
   ],
    "Food_Outlets_1mile": [
        ("amenity", "fast_food"),
        ("amenity", "food_court"),
        ("amenity", "restaurant"),
        ("amenity", "ice_cream"),
        ("shop", "cheese"),
        ("shop", "chocolate"),
        ("shop", "pasta"),
        ("shop", "pastry"),
        ("shop", "seafood"),
        ("shop", "food")
    ],
    "Educational_Institutions_1mile": [
        ("building", "college"),
        ("building", "school"),
        ("amenity", "library"),
        ("building", "university")
    ],
    "Residential_Housing_1mile": [
        ("building", "bungalow"),
        ("building", "apartments"),
        ("building", "residential"),
        ("building", "hut")
    ],
    "Commercial_Housing_1mile": [
        ("building", "commercial"),
        ("building", "office")
   ],
    "Fuel_Stations_Car_Wash_1mile": [
        ("amenity", "car_wash"),
        ("amenity", "charging_station"),
        ("amenity", "fuel")
    ],
    "Hospitals_3miles": [
        ("amenity", "clinic"),
        ("amenity", "nursing_home"),
        ("amenity", "pharmacy"),
        ("building", "hospital")
    ],
    "Apparels_3miles": [
        ("shop", "clothes"),
        ("shop", "fabric"),
        ("shop", "fashion")
    ],
    "Tourist_Destinations_5miles": [
        ("tourism", "attraction"),
        ("amenity", "events_venue"),
        ("amenity", "exhibition_centre"),
        ("boundary", "national_park"),
        ("boundary", "museum"),
        ("building", "beach_hut"),
        ("building", "castle")
   ],
    "Hotels_5miles": [
        ("building", "hotel")
    ],
    "Airports_5miles": [
        ("aeroway", "helipad"),
        ("aeroway", "heliport"),
        ("aeroway", "runway"),
        ("aeroway", "terminal"),
        ("aeroway", "aerodrome")
    ]
}

# Define a delay duration in seconds
delay_duration = 0.2

def fetch_feature_count(api, lat, lon, required_categories_tags):
    category_counts = {}
    
    for category, tags in required_categories_tags.items():
        suffix = category.split("_")[-1]  # Extract the mile part from the category name
        radius = radii_miles_to_meters[int(suffix[0])]  # Extract the radius in miles and convert to meters
        print(f"Fetching data for category: {category}")
        category_count = 0
        for tag_pair in tags:
            category_name, tag_name = tag_pair
            query = f"""
                (node["{category_name}"="{tag_name}"](around:{radius},{lat},{lon});
                 way["{category_name}"="{tag_name}"](around:{radius},{lat},{lon});
                 rel["{category_name}"="{tag_name}"](around:{radius},{lat},{lon});
                );out;
            """
            result = api.query(query)
            category_count += len(result.nodes) + len(result.ways) + len(result.relations)
        
        category_counts[category] = category_count
        print(f"Completed fetching for category: {category}. Count: {category_count}")
        
        # Delay to manage server load
        time.sleep(delay_duration)

    return category_counts

def main(input_df):
    all_feature_counts = []

    for index, row in input_df.iterrows():
        lat = row['Latitude']
        lon = row['Longitude']
        feature_counts = fetch_feature_count(api, lat, lon, required_categories_tags)
        feature_counts.update({
            "Latitude": lat,
            "Longitude": lon
        })
        all_feature_counts.append(feature_counts)

    # Convert the list of dictionaries to a DataFrame
    final_df = pd.DataFrame(all_feature_counts)

    # Display and save the DataFrame
    print(final_df)
    final_df.to_csv("Data Files/nearby_places_sample.csv", index=False)
    print("Competition data collection complete. CSV file created.")

if __name__ == "__main__":
    # Assuming meijer_stores is defined elsewhere with 'Latitude' and 'Longitude' columns
    main(sample_df)

Fetching data for category: Grocery_Stores_1mile
Completed fetching for category: Grocery_Stores_1mile. Count: 5
Fetching data for category: Food_Outlets_1mile
Completed fetching for category: Food_Outlets_1mile. Count: 35
Fetching data for category: Educational_Institutions_1mile
Completed fetching for category: Educational_Institutions_1mile. Count: 11
Fetching data for category: Residential_Housing_1mile
Completed fetching for category: Residential_Housing_1mile. Count: 8
Fetching data for category: Commercial_Housing_1mile
Completed fetching for category: Commercial_Housing_1mile. Count: 1
Fetching data for category: Fuel_Stations_Car_Wash_1mile
Completed fetching for category: Fuel_Stations_Car_Wash_1mile. Count: 4
Fetching data for category: Hospitals_3miles
Completed fetching for category: Hospitals_3miles. Count: 17
Fetching data for category: Apparels_3miles
Completed fetching for category: Apparels_3miles. Count: 0
Fetching data for category: Tourist_Destinations_5miles
Compl

In [9]:
df = pd.read_csv('Data Files/nearby_places_sample.csv')

# Performing the merge
df = pd.merge(sample_df, df, on=['Latitude','Longitude'], how='inner')
df.head()

Unnamed: 0,Latitude,Longitude,Visits,Grocery_Stores_1mile,Food_Outlets_1mile,Educational_Institutions_1mile,Residential_Housing_1mile,Commercial_Housing_1mile,Fuel_Stations_Car_Wash_1mile,Hospitals_3miles,Apparels_3miles,Tourist_Destinations_5miles,Hotels_5miles,Airports_5miles
0,42.996667,-85.594391,54697,5,35,11,8,1,4,17,0,4,3,7
1,42.242442,-85.587246,38199,8,24,5,24,4,6,7,9,4,2,7
2,42.082033,-86.421991,30639,10,3,1,3,1,7,2,1,0,5,4
3,42.254734,-84.3596,35253,4,5,0,0,1,2,10,3,0,0,4
4,41.631215,-87.927783,24456,7,28,1,4,1,6,4,1,2,1,1


<hr style="border:2px solid #000"/>

# Nearest Highway Ramp Entry/Exit

In [10]:
def find_nearest_motorway_entrance(lat, lon):
    """Find the nearest motorway entrance using the Overpass API."""
    api = overpy.Overpass()
    result = api.query(f"""
        [out:json];
        (
          way[highway=motorway_link](around:100000, {lat}, {lon});
        );
        (._;>;);  // Fetch all associated nodes
        out body;
    """)
    
    nearest_distance = float('inf')
    nearest_node_coordinates = None

    for way in result.ways:
        for node in way.nodes:
            node_lat, node_lon = float(node.lat), float(node.lon)
            # Calculate the Euclidean distance
            distance = Point(lon, lat).distance(Point(node_lon, node_lat))
            if distance < nearest_distance:
                nearest_distance = distance
                nearest_node_coordinates = (node_lat, node_lon)

    return nearest_node_coordinates

# Extract latitude and longitude columns into lists
latitudes = df['Latitude'].tolist()
longitudes = df['Longitude'].tolist()

# Initialize new columns for motorway entrance coordinates
df['Nearest Motorway Entrance Lat'] = None
df['Nearest Motorway Entrance Lon'] = None

# Iterate over the latitude and longitude lists
for i in range(len(latitudes)):
    lat, lon = latitudes[i], longitudes[i]
    nearest_motorway_entrance = find_nearest_motorway_entrance(lat, lon)
    if nearest_motorway_entrance:
        df.at[i, 'Nearest Motorway Entrance Lat'] = nearest_motorway_entrance[0]
        df.at[i, 'Nearest Motorway Entrance Lon'] = nearest_motorway_entrance[1]

In [11]:
df.head()

Unnamed: 0,Latitude,Longitude,Visits,Grocery_Stores_1mile,Food_Outlets_1mile,Educational_Institutions_1mile,Residential_Housing_1mile,Commercial_Housing_1mile,Fuel_Stations_Car_Wash_1mile,Hospitals_3miles,Apparels_3miles,Tourist_Destinations_5miles,Hotels_5miles,Airports_5miles,Nearest Motorway Entrance Lat,Nearest Motorway Entrance Lon
0,42.996667,-85.594391,54697,5,35,11,8,1,4,17,0,4,3,7,42.988574,-85.604723
1,42.242442,-85.587246,38199,8,24,5,24,4,6,7,9,4,2,7,42.237712,-85.589592
2,42.082033,-86.421991,30639,10,3,1,3,1,7,2,1,0,5,4,42.078756,-86.418161
3,42.254734,-84.3596,35253,4,5,0,0,1,2,10,3,0,0,4,42.255053,-84.356934
4,41.631215,-87.927783,24456,7,28,1,4,1,6,4,1,2,1,1,41.552,-87.909939


<hr style="border:2px solid #000"/>

# Distance & Duration to Nearest Highway Ramp Entry/Exit

In [12]:
def get_distance_and_duration(origin, destination, api_key):
    gmaps = googlemaps.Client(key=api_key)
    try:
        # Request directions
        directions_result = gmaps.directions(origin, destination, mode="driving", departure_time=datetime.now())
        
        # Extract distance and duration
        if directions_result:
            distance = directions_result[0]['legs'][0]['distance']['text']
            duration = directions_result[0]['legs'][0]['duration']['text']
            return distance, duration
        else:
            return None, None
    except Exception as e:
        print(f"Error in Google Maps API request: {e}")
        return None, None

# Example usage
api_key = 'AIzaSyDGIOjh0Iat430IjsQ9Y4aIdlRcPcOs3fk'  # Replace with your actual API key

# Assuming df is your DataFrame with 'latitude', 'longitude', 'Nearest Motorway Entrance Lat', and 'Nearest Motorway Entrance Lon' columns
df['Driving Distance to Nearest Motorway'] = None
df['Duration to Nearest Motorway'] = None

for index, row in df.iterrows():
    origin = f"{row['Latitude']}, {row['Longitude']}"
    if pd.notna(row['Nearest Motorway Entrance Lat']) and pd.notna(row['Nearest Motorway Entrance Lon']):
        destination = f"{row['Nearest Motorway Entrance Lat']}, {row['Nearest Motorway Entrance Lon']}"
        distance, duration = get_distance_and_duration(origin, destination, api_key)
        df.at[index, 'Driving Distance to Nearest Motorway'] = distance
        df.at[index, 'Duration to Nearest Motorway'] = duration

In [13]:
df.head()

Unnamed: 0,Latitude,Longitude,Visits,Grocery_Stores_1mile,Food_Outlets_1mile,Educational_Institutions_1mile,Residential_Housing_1mile,Commercial_Housing_1mile,Fuel_Stations_Car_Wash_1mile,Hospitals_3miles,Apparels_3miles,Tourist_Destinations_5miles,Hotels_5miles,Airports_5miles,Nearest Motorway Entrance Lat,Nearest Motorway Entrance Lon,Driving Distance to Nearest Motorway,Duration to Nearest Motorway
0,42.996667,-85.594391,54697,5,35,11,8,1,4,17,0,4,3,7,42.988574,-85.604723,1.8 mi,4 mins
1,42.242442,-85.587246,38199,8,24,5,24,4,6,7,9,4,2,7,42.237712,-85.589592,0.6 mi,3 mins
2,42.082033,-86.421991,30639,10,3,1,3,1,7,2,1,0,5,4,42.078756,-86.418161,0.5 mi,2 mins
3,42.254734,-84.3596,35253,4,5,0,0,1,2,10,3,0,0,4,42.255053,-84.356934,0.4 mi,2 mins
4,41.631215,-87.927783,24456,7,28,1,4,1,6,4,1,2,1,1,41.552,-87.909939,14.7 mi,18 mins


<hr style="border:2px solid #000"/>

# Traffic (AADT)

In [14]:
# Add new column for traffic data
df['Traffic (AADT)'] = None

def find_nearest_traffic_data(store_lat, store_lon, traffic_df):
    nearest_distance = float('inf')
    nearest_traffic_data = None

    for _, traffic_row in traffic_df.iterrows():
        traffic_lat = traffic_row['Latitude']
        traffic_lon = traffic_row['Longitude']
        distance = great_circle((store_lat, store_lon), (traffic_lat, traffic_lon)).meters

        if distance < nearest_distance:
            nearest_distance = distance
            nearest_traffic_data = traffic_row['Traffic']

    return nearest_traffic_data

# Iterate over each store in df
for index, row in df.iterrows():
    store_lat = row['Latitude']
    store_lon = row['Longitude']
    nearest_traffic = find_nearest_traffic_data(store_lat, store_lon, traffic_df)
    df.at[index, 'Traffic (AADT)'] = nearest_traffic

# Print or save the updated DataFrame
df.to_csv('Data Files/scraped_df_sample.csv', index=False)

In [15]:
df.head()

Unnamed: 0,Latitude,Longitude,Visits,Grocery_Stores_1mile,Food_Outlets_1mile,Educational_Institutions_1mile,Residential_Housing_1mile,Commercial_Housing_1mile,Fuel_Stations_Car_Wash_1mile,Hospitals_3miles,Apparels_3miles,Tourist_Destinations_5miles,Hotels_5miles,Airports_5miles,Nearest Motorway Entrance Lat,Nearest Motorway Entrance Lon,Driving Distance to Nearest Motorway,Duration to Nearest Motorway,Traffic (AADT)
0,42.996667,-85.594391,54697,5,35,11,8,1,4,17,0,4,3,7,42.988574,-85.604723,1.8 mi,4 mins,13114.0
1,42.242442,-85.587246,38199,8,24,5,24,4,6,7,9,4,2,7,42.237712,-85.589592,0.6 mi,3 mins,32726.0
2,42.082033,-86.421991,30639,10,3,1,3,1,7,2,1,0,5,4,42.078756,-86.418161,0.5 mi,2 mins,8477.0
3,42.254734,-84.3596,35253,4,5,0,0,1,2,10,3,0,0,4,42.255053,-84.356934,0.4 mi,2 mins,5913.0
4,41.631215,-87.927783,24456,7,28,1,4,1,6,4,1,2,1,1,41.552,-87.909939,14.7 mi,18 mins,1183.0
