In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd
import os
from timezonefinder import TimezoneFinder
import pytz

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
import random

from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly

Load the Data

In [3]:
# Load the model features (X) and the raw parking-occupancy records (y).
X = pd.read_csv('../results/csvs/training_data_2022-12-20_to_2022-12-29.csv')
y = pd.read_csv('../data/IrvineSpectrumCenter_parking_2022-11_to_2023-03.csv')
lot_ids = list(X['pk_lot'].unique())
# Keep only the occupancy rows whose lot also appears in X. The explicit
# .copy() makes y an independent frame, so the column assignments and the
# in-place drop applied to it further down do not operate on a filtered
# selection (the pattern that triggers pandas' SettingWithCopyWarning).
y = y[y['pk_lot'].isin(lot_ids)].copy()

# Parse the feature timestamps. Going through UTC lets rows that carry
# different UTC offsets (e.g. -08:00 and -07:00 around a DST change) land in a
# single tz-aware datetime column; without utc=True, mixed offsets do not yield
# a datetime64 column (object dtype or an error, depending on the pandas
# version) and the .dt accessors below fail.
# The values are then expressed in US/Pacific, the same zone the occupancy
# timestamps are converted to, so hour / day_of_week are local wall-clock values.
# NOTE(review): assumes every lot in X is located in the US/Pacific zone — confirm.
X['timestamp'] = pd.to_datetime(X['timestamp'], utc=True).dt.tz_convert('US/Pacific')
X['hour'] = X['timestamp'].dt.hour
X['day_of_week'] = X['timestamp'].dt.dayofweek
# Time window covered by the features; used to restrict y to the same range.
min_time = X['timestamp'].min()
max_time = X['timestamp'].max()

# Build the occupancy timestamp: parse the interval start, truncate it to whole
# seconds, and express it in US/Pacific.
# .dt.floor('s') is the vectorised replacement for calling
# Timestamp.replace(microsecond=0) row by row; it is applied before the zone
# conversion, in the same order as before.
# .assign returns a new frame, so nothing is written onto a filtered selection.
y = y.assign(
    timestamp=pd.to_datetime(y['dt_start_date']).dt.floor('s').dt.tz_convert('US/Pacific')
)
# filter to be in the same time range as X
y = y[(y['timestamp'] >= min_time) & (y['timestamp'] <= max_time)]
# Keep exactly these five columns, in this order. The explicit selection also
# discards dt_start_date, dt_end_date, pk_lot_alias and f_occupancy_rank, so a
# separate in-place drop is not needed.
y = y[['timestamp', 'pk_lot', 'f_pct_occ', 'i_avail', 'i_occ']]

# Order both frames chronologically, then show the first row and the shape of each.
X, y = (frame.sort_values(by=['timestamp']) for frame in (X, y))

for frame in (X, y):
    display(frame.head(1))

for label, frame in (('X shape:', X), ('y shape:', y)):
    print(label, frame.shape)

# Pair every feature row with every occupancy record of the same lot, then
# keep, for each (pk_lot, timestamp) of X, the record that is closest in time.
# NOTE(review): the per-lot join materialises all row pairs before filtering;
# pd.merge_asof(..., by='pk_lot', direction='nearest') may be a lighter
# alternative — verify it reproduces the same rows before switching.
merged_df = X.merge(y, on='pk_lot', suffixes=('', '_y'))

# Absolute gap, in seconds, between the feature time and the occupancy time.
gap = merged_df['timestamp'] - merged_df['timestamp_y']
merged_df['time_diff'] = gap.dt.total_seconds().abs()

# Once sorted, the first row of each (pk_lot, timestamp) group has the smallest gap.
closest_first = merged_df.sort_values(by=['pk_lot', 'timestamp', 'time_diff'])
df = closest_first.drop_duplicates(subset=['pk_lot', 'timestamp'], keep='first')
df = df.drop(columns=['timestamp_y', 'time_diff'])

display(df.head(2))
print('df shape:', df.shape)

Unnamed: 0,short_hotspot,short_wasserstein,short_log_prob,short_hunting_time,short_in_out_ratio,medium_hotspot,medium_wasserstein,medium_log_prob,medium_hunting_time,medium_in_out_ratio,long_hotspot,long_wasserstein,long_log_prob,long_hunting_time,long_in_out_ratio,pk_lot,timestamp,hour,day_of_week
0,0.54124,2.536337,11.084475,85.0,0.112782,0.54124,2.256224,11.084475,85.0,0.02322,0.54124,2.420517,11.084475,85.0,0.010388,387459,2022-12-19 16:17:25-08:00,16,0


Unnamed: 0,timestamp,pk_lot,f_pct_occ,i_avail,i_occ
17550,2022-12-19 16:21:56-08:00,381380,86.0,27,


X shape: (4396, 19)
y shape: (3174, 5)


Unnamed: 0,short_hotspot,short_wasserstein,short_log_prob,short_hunting_time,short_in_out_ratio,medium_hotspot,medium_wasserstein,medium_log_prob,medium_hunting_time,medium_in_out_ratio,...,long_log_prob,long_hunting_time,long_in_out_ratio,pk_lot,timestamp,hour,day_of_week,f_pct_occ,i_avail,i_occ
1587,0.264443,3.019982,10.364312,143.0,0.197368,0.264443,2.640357,10.364312,143.0,0.032895,...,10.364312,143.0,0.01548,329825,2022-12-19 16:23:59-08:00,16,0,79.0,51,
13755,0.244699,1.811102,11.950688,86.0,0.394737,0.254571,2.399649,11.1575,114.5,0.075188,...,11.1575,114.5,0.030364,329825,2022-12-19 16:54:55-08:00,16,0,79.0,51,


df shape: (4388, 22)
