In [None]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
import json
from time import sleep
import os
import random
from datetime import datetime, timedelta
import geopandas as gpd
from shapely.geometry import Point
import time
import numpy as np
from matplotlib.colors import TwoSlopeNorm, Normalize
from matplotlib.cm import ScalarMappable
import matplotlib.colors as mcolors
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pickle
from collections import Counter
from geopy.distance import geodesic

## Load POI data that are located within a 400 m radius of any EVCS

In [None]:
relativechange400mfinal = pd.read_csv('Sample_TotalPOIsData_within400m.csv')
relativechange400mfinal

In [None]:
def filter_pois_by_city_and_group(df):
    """Split the POI table into one sub-table per (city, group) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        POI table; must contain 'city', 'group' and the other columns
        listed in ``kept_columns`` below.

    Returns
    -------
    dict
        Maps the string ``"<city>_<group>"`` to the rows of that pair,
        restricted to ``kept_columns`` (row index labels are preserved).
    """
    kept_columns = [
        'city', 'group', 'apiId', 'avg_pre_open', 'avg_post_open',
        'Zscore', 'station_name', 'open_date', 'poi_zipcode',
    ]

    return {
        f"{city}_{grp}": rows[kept_columns]
        for (city, grp), rows in df.groupby(['city', 'group'])
        if not rows.empty
    }

In [None]:
city_grouped_pois = filter_pois_by_city_and_group(relativechange400mfinal)

In [None]:
final_tables = {}

In [None]:
# Collect every non-empty (city, group) table into final_tables; a key that
# is already present gets the new rows appended with a fresh 0..n-1 index.
for key, df in city_grouped_pois.items():
    if df.empty:
        continue
    if key in final_tables:
        final_tables[key] = pd.concat([final_tables[key], df], ignore_index=True)
    else:
        final_tables[key] = df

In [None]:
for key, df in final_tables.items():
    print(f"DataFrame for {key}:")
    print(df)
    print()

## Selection of potential control candidate POI group

In [None]:
def find_pois_with_different_station_name(final_tables):
    """For every POI, list same-table POIs that can serve as control candidates.

    A row of the same table qualifies as a "similar POI" for an original POI
    when all of the following hold:

    * its ``apiId`` differs from the original's,
    * its station name differs (compared after stripping whitespace and
      lower-casing), and
    * its ``open_date`` is at least three calendar months after the
      original's ``open_date``.

    Parameters
    ----------
    final_tables : dict
        Maps a key to a DataFrame with the columns 'apiId', 'city', 'group',
        'station_name', 'open_date', 'avg_pre_open', 'avg_post_open',
        'Zscore' and 'poi_zipcode'. NOTE: each table's 'open_date' column is
        converted to datetime in place.

    Returns
    -------
    list of dict
        One entry per original POI row that has at least one similar POI.
        ``'similar_pois'`` holds the candidates (de-duplicated on ``apiId``,
        first occurrence kept) as a list of record dicts.
    """
    similar_columns = [
        'apiId', 'poi_zipcode', 'group', 'station_name', 'open_date',
        'avg_pre_open', 'avg_post_open', 'Zscore', 'city'
    ]
    poi_results = []

    for df in final_tables.values():
        df['open_date'] = pd.to_datetime(df['open_date'])

        # Normalise the station names once per table; the original code redid
        # this whole-column string work for every single row of the table.
        normalized_stations = df['station_name'].str.strip().str.lower()

        for _, poi_row in df.iterrows():
            original_poi_id = poi_row['apiId']
            original_city = poi_row['city']
            original_group = poi_row['group']
            original_station_name = poi_row['station_name'].strip().lower()
            original_open_date = poi_row['open_date']
            original_avg_pre_open = poi_row['avg_pre_open']
            original_avg_post_open = poi_row['avg_post_open']
            original_Zscore_change = poi_row['Zscore']

            # Candidates must open >= 3 months after the original POI's station.
            earliest_candidate_open = original_open_date + pd.DateOffset(months=3)

            similar_pois = df[
                (df['apiId'] != original_poi_id) &  # Exclude the original POI
                (normalized_stations != original_station_name) &
                (df['open_date'] >= earliest_candidate_open)
            ]

            if similar_pois.empty:
                continue

            similar_pois = similar_pois.drop_duplicates(subset=['apiId'])

            # Append the original POI together with its similar POIs
            poi_results.append({
                'original_poi_id': original_poi_id,
                'original_city': original_city,
                'original_group': original_group,
                'original_station_name': poi_row['station_name'],  # un-normalised name
                'original_open_date': original_open_date,
                'original_avg_pre_open': original_avg_pre_open,
                'original_avg_post_open': original_avg_post_open,
                'original_Zscore_change': original_Zscore_change,
                # Convert the similar POIs to a list of dictionaries
                'similar_pois': similar_pois[similar_columns].to_dict(orient='records')
            })

    return poi_results

In [None]:
poi_results = find_pois_with_different_station_name(final_tables)
poi_results

In [None]:
pickle_filename = 'poi_results.pkl'

In [None]:
with open(pickle_filename, 'wb') as file:
    pickle.dump(poi_results, file)
print(f"Results have been saved to {pickle_filename}")

In [None]:
for result in poi_results:
    print(f"Original POI ID: {result['original_poi_id']} (City: {result['original_city']}, Group: {result['original_group']}, Station: {result['original_station_name']}, Open Date: {result['original_open_date']}, Avg Pre Open: {result['original_avg_pre_open']}, Avg Post Open: {result['original_avg_post_open']}, Zscore Change: {result['original_Zscore_change']})")
    
    print("Similar POIs:")
    for poi in result['similar_pois']:
        print(f"  - POI ID: {poi['apiId']}, City: {poi['city']}, Group: {poi['group']}, Station: {poi['station_name']}, Open Date: {poi['open_date']}, Avg Pre Open: {poi['avg_pre_open']}, Avg Post Open: {poi['avg_post_open']}, Zscore Change: {poi['Zscore']}")
    print()

In [None]:
original_groups = [result['original_group'] for result in poi_results]

In [None]:
group_counts = Counter(original_groups)

In [None]:
print("Original POI Groups and their counts:")
for group, count in group_counts.items():
    print(f"Group: {group}, Count: {count}")

In [None]:
poi_dataframes = {}

In [None]:
# Build, for every original POI, a table of its similar POIs annotated with
# the original POI's details and the +/- 3 month window around its open date.
for result in poi_results:
    # Details of the original POI
    original_poi_id = result['original_poi_id']
    original_open_date = result['original_open_date']
    original_station_name = result['original_station_name']
    original_city = result['original_city']
    original_group = result['original_group']

    # Window bounds: 3 calendar months either side of the original open date
    three_months_before = original_open_date - pd.DateOffset(months=3)
    three_months_after = original_open_date + pd.DateOffset(months=3)

    # Columns that are identical for every similar POI of this original POI
    original_columns = {
        'Original POI Open Date': original_open_date,
        'Original Station Name': original_station_name,
        'Original POI City': original_city,
        'Original POI Group': original_group,
        '3 Months Before Original POI Open Date': three_months_before,
        '3 Months After Original POI Open Date': three_months_after,
    }

    similar_pois_with_dates = [
        {
            'POI ID': poi['apiId'],
            'City': poi['city'],
            'Group': poi['group'],
            'Station': poi['station_name'],
            'Open Date': poi['open_date'],
            'Avg Pre Open': poi['avg_pre_open'],
            'Avg Post Open': poi['avg_post_open'],
            'Zscore Change': poi['Zscore'],
            **original_columns,
        }
        for poi in result['similar_pois']
    ]

    poi_dataframes[original_poi_id] = pd.DataFrame(similar_pois_with_dates)

In [None]:
for poi_id, df in poi_dataframes.items():
    print(f"DataFrame for Original POI ID {poi_id}:")
    print(df)
    print("\n")

In [None]:
#Load daily visits dataframe
final_places_visits = pd.read_csv("DailyvisitsbyPOI.csv")

In [None]:
final_places_visits['Date'] = pd.to_datetime(final_places_visits['Date'])

In [None]:
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

In [None]:
def _mean_visits_in_windows(visits, six_months_before, three_months_before, open_date, three_months_after):
    """Return the (pre-pre, pre, post) mean of 'Visits' for one POI's rows.

    Windows on the 'Date' column:
      pre-pre: [six_months_before, three_months_before)
      pre:     [three_months_before, open_date)
      post:    (open_date, three_months_after]   -- the open date itself is excluded
    A window without any rows yields None instead of a mean.
    """
    dates = visits['Date']
    window_masks = (
        (dates >= six_months_before) & (dates < three_months_before),
        (dates >= three_months_before) & (dates < open_date),
        (dates > open_date) & (dates <= three_months_after),
    )

    means = []
    for mask in window_masks:
        window = visits[mask]
        means.append(window['Visits'].mean() if not window.empty else None)
    return tuple(means)


def process_poi_batches(poi_results, batch_size=500, output_file='poi_dataframes_with_means.pkl3'):
    """Compute window means for each original POI and its similar POIs, batch by batch.

    Reads daily visits from the module-level ``final_places_visits`` frame
    (columns 'apiId', 'Date', 'Visits'). For every entry of ``poi_results``
    the mean visits of the original POI and of each of its similar POIs are
    computed over three windows anchored on the ORIGINAL POI's open date (see
    ``_mean_visits_in_windows``).

    Parameters
    ----------
    poi_results : list of dict
        Entries with 'original_poi_id', 'original_open_date',
        'original_station_name', 'original_city' and 'similar_pois'.
    batch_size : int
        Number of ``poi_results`` entries per pickled batch.
    output_file : str
        Path that is (over)written. One dict per batch is pickled
        back-to-back into this file, keyed by original POI id.

    NOTE(review): within a batch the dict is keyed by 'original_poi_id' only,
    so if two entries of the same batch share an id the later one replaces
    the earlier one -- confirm this is intended.
    """
    # Row positions of every POI, computed once. The original code rescanned
    # the whole visits table with a boolean mask for each POI lookup.
    row_positions = final_places_visits.groupby('apiId').indices
    no_visits = final_places_visits.iloc[0:0]

    def visits_for(poi_id):
        """Rows of ``final_places_visits`` for ``poi_id`` (empty frame if unknown)."""
        positions = row_positions.get(poi_id)
        if positions is None:
            return no_visits
        return final_places_visits.iloc[positions]

    with open(output_file, 'wb') as file:
        for batch_start in range(0, len(poi_results), batch_size):
            batch = poi_results[batch_start:batch_start + batch_size]
            poi_dataframes_with_means = {}

            for result in batch:
                original_poi_id = result['original_poi_id']
                original_open_date = result['original_open_date']
                original_station_name = result['original_station_name']
                original_city = result['original_city']

                three_months_before = original_open_date - pd.DateOffset(months=3)
                six_months_before = original_open_date - pd.DateOffset(months=6)
                three_months_after = original_open_date + pd.DateOffset(months=3)

                (original_avg_pre_pre_open,
                 original_avg_pre_open,
                 original_avg_post_open) = _mean_visits_in_windows(
                    visits_for(original_poi_id),
                    six_months_before, three_months_before,
                    original_open_date, three_months_after,
                )

                similar_pois_with_dates_list = []
                for poi in result['similar_pois']:
                    # Same windows as the original POI: they are anchored on the
                    # original's open date, not on the similar POI's own date.
                    (similar_avg_pre_pre_open,
                     similar_avg_pre_open,
                     similar_avg_post_open) = _mean_visits_in_windows(
                        visits_for(poi['apiId']),
                        six_months_before, three_months_before,
                        original_open_date, three_months_after,
                    )

                    similar_pois_with_dates_list.append({
                        'POI ID': poi['apiId'],
                        'Zip Code': poi['poi_zipcode'],
                        'Group': poi['group'],
                        'Station': poi['station_name'],
                        'Open Date': poi['open_date'],
                        'City': poi['city'],
                        'Avg Pre Pre Open': similar_avg_pre_pre_open,
                        'Avg Pre Open': similar_avg_pre_open,
                        'Avg Post Open': similar_avg_post_open,
                        'Original POI Open Date': original_open_date,
                        'Original Station Name': original_station_name,
                        'Original City': original_city,
                        '6 Months Before Original POI Open Date': six_months_before,
                        '3 Months Before Original POI Open Date': three_months_before,
                        '3 Months After Original POI Open Date': three_months_after
                    })

                poi_dataframes_with_means[original_poi_id] = {
                    'similar_pois_data': similar_pois_with_dates_list,  # similar POIs with their window means
                    'original_avg_pre_pre_open': original_avg_pre_pre_open,
                    'original_avg_pre_open': original_avg_pre_open,
                    'original_avg_post_open': original_avg_post_open
                }

            # Each batch is its own pickle object appended to the same file,
            # so only one batch of results is held in memory at a time.
            pickle.dump(poi_dataframes_with_means, file)
            print(f'Batch {batch_start // batch_size + 1} processed and saved.')

process_poi_batches(poi_results, batch_size=500)

In [None]:
def load_pickle_data(file_path):
    """Read every object pickled back-to-back in ``file_path``.

    Returns a list with one element per pickled object, in file order
    (an empty list for an empty file). No DataFrame is built here.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on
    files produced by a trusted source.
    """
    batches = []

    with open(file_path, 'rb') as handle:
        try:
            while True:
                batches.append(pickle.load(handle))
        except EOFError:
            # Raised by pickle.load once the end of the file is reached.
            pass

    return batches

In [None]:
def create_dataframe_from_poi_data(all_data):
    """Flatten the loaded batches into one row per (original POI, similar POI) pair.

    ``all_data`` is a list of batch dicts keyed by original POI id; each
    value carries the original POI's three window means and a
    'similar_pois_data' list of per-candidate dicts. A pair that was already
    seen (in this or an earlier batch) is skipped, so the first occurrence
    wins. Returns a pandas DataFrame (empty when there are no pairs).
    """
    seen_pairs = set()
    records = []

    for batch in all_data:
        for origin_id, origin_info in batch.items():
            for candidate in origin_info['similar_pois_data']:
                pair = (origin_id, candidate['POI ID'])
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    # Original POI columns first, then the candidate's own fields.
                    record = {
                        'Original POI ID': origin_id,
                        'Original Avg Pre Pre Open': origin_info['original_avg_pre_pre_open'],
                        'Original Avg Pre Open': origin_info['original_avg_pre_open'],
                        'Original Avg Post Open': origin_info['original_avg_post_open'],
                    }
                    record.update(candidate)
                    records.append(record)

    return pd.DataFrame(records)

file_path = 'poi_dataframes_with_means.pkl3'
all_data = load_pickle_data(file_path)

poi_df = create_dataframe_from_poi_data(all_data)

In [None]:
poi_df

In [None]:
poi_df.to_csv("Citylevelmatchedsynthetic.csv", index=False)

In [None]:
size_thresholds = [0, 50, 100, 200, 500, 1000, 2500, 5000, 10000, 25000, np.inf]
size_labels = ['0-50', '50-100', '100-200', '200-500', '500-1000', '1000-2500','2500-5000', '5000-10000', '10000-25000', '25000+']

In [None]:
# Bucket each original POI by its pre-opening mean visits.
# NOTE(review): pd.cut is called with its defaults (right-closed bins, lowest
# edge not included), so with a first edge of 0 a value of exactly 0 gets no
# label (NaN), as do missing values. filter_by_size_group skips rows whose
# size_group is not a string, so such rows are dropped later -- confirm that
# excluding zero-visit POIs is intended.
poi_df['size_group'] = pd.cut(poi_df['Original Avg Pre Open'], bins=size_thresholds, labels=size_labels)

In [None]:
def _parse_size_label(label):
    """Turn a size label into ``(low, high)``; return None if it cannot be parsed.

    ``'50-100'`` -> ``(50, 100)``; a label without '-' such as ``'25000+'``
    -> ``(25000, inf)`` (its last character is dropped before parsing).
    """
    if not isinstance(label, str):
        return None
    try:
        if '-' in label:
            low, high = [int(part) for part in label.split('-')]
        else:
            low, high = int(label[:-1]), float('inf')
    except ValueError:
        return None
    return low, high


def filter_by_size_group(df):
    """Keep the rows whose 'Avg Pre Open' lies inside the row's 'size_group' range.

    The range is parsed from the 'size_group' label and both ends are
    inclusive. Rows with a missing or unparsable label, or with a missing
    'Avg Pre Open', are dropped. The result is de-duplicated on
    ('Original POI ID', 'POI ID'), keeping the first occurrence.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'size_group', 'Avg Pre Open', 'Original POI ID'
        and 'POI ID'.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels, columns and
        dtypes. When nothing survives an empty frame with the same columns is
        returned (the previous row-by-row version raised KeyError there).
    """
    labels = df['size_group'].astype(object)

    # Parse each distinct label once instead of once per row.
    parsed = {
        label: _parse_size_label(label)
        for label in labels.unique()
        if isinstance(label, str)
    }

    nan_pair = (float('nan'), float('nan'))
    bounds = pd.DataFrame(
        [
            (parsed.get(label) or nan_pair) if isinstance(label, str) else nan_pair
            for label in labels
        ],
        columns=['low', 'high'],
        dtype=float,
    )

    values = pd.to_numeric(df['Avg Pre Open'], errors='coerce').astype(float).to_numpy()

    # Comparisons against NaN are False, so rows without usable bounds or
    # without a value fall out of the mask automatically.
    in_range = (values >= bounds['low'].to_numpy()) & (values <= bounds['high'].to_numpy())

    filtered_df = df[in_range]

    filtered_df = filtered_df.drop_duplicates(subset=['Original POI ID', 'POI ID'], keep='first')

    return filtered_df

In [None]:
filtered_poi_df2 = filter_by_size_group(poi_df)
filtered_poi_df2.reset_index(drop=True, inplace=True)

In [None]:
filtered_poi_df2 = filtered_poi_df2.drop_duplicates(subset=['Original POI ID', 'POI ID'], keep='first')

In [None]:
filtered_poi_df2.reset_index(drop=True, inplace=True)

In [None]:
filtered_poi_df2.to_csv("Citylevelmatchedsynthetic2.csv", index=False)

In [None]:
filtered_poi_df2['Original POI Open Date'] = pd.to_datetime(filtered_poi_df2['Original POI Open Date'])
filtered_poi_df2['Original POI Open Year'] = filtered_poi_df2['Original POI Open Date'].dt.year

In [None]:
# Difference-in-differences around the original POI's open date.
# 'Original ...' columns belong to the original POI; the unprefixed
# 'Avg ...' columns belong to the paired similar (control candidate) POI.
# DiD1: original POI's change in mean visits, post window minus pre window.
filtered_poi_df2['DiD1'] = filtered_poi_df2['Original Avg Post Open'] - filtered_poi_df2['Original Avg Pre Open']
# DiD2: the same change for the similar POI over the same windows.
filtered_poi_df2['DiD2'] = filtered_poi_df2['Avg Post Open'] - filtered_poi_df2['Avg Pre Open']
# DiD: original POI's change net of the similar POI's change.
filtered_poi_df2['DiD'] = filtered_poi_df2['DiD1'] - filtered_poi_df2['DiD2']

In [None]:
# Same construction one period earlier (pre window minus pre-pre window),
# i.e. before the original POI's open date.
# DiD3: original POI's change from the pre-pre window to the pre window.
filtered_poi_df2['DiD3'] = filtered_poi_df2['Original Avg Pre Open'] - filtered_poi_df2['Original Avg Pre Pre Open']
# DiD4: the same change for the similar POI.
filtered_poi_df2['DiD4'] = filtered_poi_df2['Avg Pre Open'] - filtered_poi_df2['Avg Pre Pre Open']
# DiD5: gap between the two pre-period changes; find_best_match picks, per
# original POI, the similar POI with the smallest |DiD3 - DiD4|.
filtered_poi_df2['DiD5'] = filtered_poi_df2['DiD3'] - filtered_poi_df2['DiD4']

## Approach to find the best match using pre-treatment difference

In [None]:
def find_best_match(df):
    """Pick, for every original POI, the similar POI with the closest pre-period trend.

    For each 'Original POI ID' the candidate row with the smallest
    ``|DiD3 - DiD4|`` is selected (first one on ties; NaN differences are
    ignored). An original POI whose candidates all have a NaN difference
    gets no match instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Original POI ID', 'POI ID', 'DiD3' and 'DiD4'.
        It is not modified.

    Returns
    -------
    matched_df : pandas.DataFrame
        One row per matched original POI: the selected row of ``df`` plus a
        'difference' column holding ``|DiD3 - DiD4|``.
    not_matched_similar_df : pandas.DataFrame
        One row per original POI that has candidates which were not selected
        as best match by ANY original POI, with columns 'Original POI ID'
        and 'Dropped Similar POIs' (a list).
    """
    matched_rows = []
    all_similar_pois = {}
    used_similar_pois = set()

    # sort=False keeps the originals in order of first appearance; rows with a
    # missing 'Original POI ID' are not part of any group.
    for original_poi_id, subset in df.groupby('Original POI ID', sort=False):
        all_similar_pois[original_poi_id] = set(subset['POI ID'])

        difference = (subset['DiD3'] - subset['DiD4']).abs()
        if not difference.notna().any():
            # No candidate has a comparable pre-period trend.
            continue

        # assign() builds a new frame, so nothing is written into a slice of df.
        scored = subset.assign(difference=difference)
        # Positional lookup stays correct even if index labels repeat.
        best_position = difference.reset_index(drop=True).idxmin()
        best_match = scored.iloc[best_position]

        matched_rows.append(best_match)
        used_similar_pois.add(best_match['POI ID'])

    if matched_rows:
        matched_df = pd.DataFrame(matched_rows)
    else:
        # Keep the expected columns so the result is usable when empty.
        empty_columns = list(df.columns)
        if 'difference' not in empty_columns:
            empty_columns.append('difference')
        matched_df = pd.DataFrame(columns=empty_columns)

    not_matched_similar_pois = []

    for original_poi_id, similar_pois in all_similar_pois.items():
        # A candidate counts as "used" if any original POI selected it.
        dropped_similar_pois = similar_pois - used_similar_pois
        if dropped_similar_pois:
            not_matched_similar_pois.append({
                'Original POI ID': original_poi_id,
                'Dropped Similar POIs': list(dropped_similar_pois)
            })

    not_matched_similar_df = pd.DataFrame(not_matched_similar_pois)

    num_unique_original_pois = matched_df['Original POI ID'].nunique()
    print(f'Number of unique Original POI IDs in the matched dataframe: {num_unique_original_pois}')
    # NOTE: this is the number of rows of not_matched_similar_df, i.e. the
    # number of original POIs that have at least one dropped similar POI.
    print(f'Number of similar POIs dropped: {not_matched_similar_df.shape[0]}')

    return matched_df, not_matched_similar_df

In [None]:
matched_df, not_matched_similar_df = find_best_match(filtered_poi_df2)

In [None]:
filtered_df2=matched_df
filtered_df2

In [None]:
filtered_df2.to_csv("CityMatchedPOIsCleanedsynthetic.csv", index=False)

In [None]:
filtered_df2 = pd.read_csv('CityMatchedPOIsCleanedsynthetic.csv')

In [None]:
def calculate_distance(row):
    """Geodesic distance in metres between a row's ``orig_*`` and ``ev_*`` coordinates.

    ``row`` must provide 'orig_latitude', 'orig_longitude', 'ev_latitude'
    and 'ev_longitude'; each pair is passed to geopy as (latitude, longitude).
    """
    origin_point = (row['orig_latitude'], row['orig_longitude'])
    ev_point = (row['ev_latitude'], row['ev_longitude'])

    separation = geodesic(origin_point, ev_point)
    return separation.meters

# Row-wise geodesic distance in metres (see calculate_distance).
# NOTE(review): calculate_distance reads 'orig_latitude', 'orig_longitude',
# 'ev_latitude' and 'ev_longitude'; no step visible in this notebook adds
# these columns before 'CityMatchedPOIsCleanedsynthetic.csv' is written, so
# presumably the CSV read back above was enriched elsewhere -- verify,
# otherwise this line raises KeyError on a fresh run.
filtered_df2['Distance(m)'] = filtered_df2.apply(calculate_distance, axis=1)

In [None]:
# Bin edges (metres) and labels for the 'Distance(m)' column.
# NOTE(review): the last bin is open-ended (300 to infinity) but labelled
# '300-400m', so any distance above 400 m would carry that label too --
# presumably all pairs are within 400 m; confirm. With pd.cut's default
# right-closed bins a distance of exactly 0 gets no label (NaN).
thresholds = [0, 100, 200, 300, np.inf]
labels = ['0-100m', '100-200m', '200-300m', '300-400m']

In [None]:
filtered_df2['distance_group'] = pd.cut(filtered_df2['Distance(m)'], bins=thresholds, labels=labels)

In [None]:
filtered_df2.to_csv("CityMatchedPOIsCleanedforDID.csv", index=False)