<a href="https://colab.research.google.com/github/JM0099/notebooks/blob/main/POC_Kaggle_Boston_Accidents_Time_Series.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Get Basic Host Information

In [None]:
import os
import multiprocessing

mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')  # e.g. 4015976448
mem_gib = mem_bytes/(1024.**3)  # e.g. 3.74

print(f"Total number of CPUS: {multiprocessing.cpu_count()}")
print(f"Total available memory: {mem_gib}")

Total number of CPUS: 2
Total available memory: 12.678386688232422


### Get a data set from Kaggle
This dataset, part of the Vision Zero Boston program, contains records of the date, time, location, and type of crash for incidents requiring public safety response. The data begins in January 2015 and is updated monthly after a manual confirmation. Exact updating schedules may vary, but this script will pull new data on the 15th of each month.


https://www.kaggle.com/code/patrickdrew/boston-crash-records

In [None]:
!pip install -q kaggle

In [None]:
import json
kaggle = {"username":"mydrpepper","key":"59655bff5220e23b1a6bd1f3bbfcd83e"}

with open("kaggle.json", "w") as fh:
    fh.write(json.dumps(kaggle))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle kernels output patrickdrew/boston-crash-records

Output file downloaded to /content/boston_crash_records.csv
Output file downloaded to /content/boston_crash_records.parquet
Kernel log downloaded to /content/boston-crash-records.log 


In [None]:
#url="https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"
#c=pd.read_csv(url)



Choose a CSV file to upload: 



### Prune and graph the data

In [None]:
import pandas as pd

# Read in the data.
df = pd.read_csv('/content/boston_crash_records.csv')

# Drop columns we are not interested in.
df.drop(columns=['_id', 'location_type', 'street', 'xstreet1', 'xstreet2', 'x_cord', 'y_cord', 'lat', 'long'], inplace=True)

# Get only the data on bikes.
df = df[df.mode_type == 'bike']

# Rename a few columns.
df.rename(columns={'dispatch_ts':'date', 'mode_type': 'bike_accidents'}, inplace=True)

# Convert the date field to a python datetime type.
df['date'] = pd.to_datetime(df['date'])

# Change the word "bike" to the number 1.
df = df.replace('bike', 1)

# Display the number of records (in daily format).
print(f"Number of records: {len(df)}")
df.head()

Number of records: 2868


Unnamed: 0,date,bike_accidents
3,2015-01-01 18:23:57+00:00,1
12,2015-01-02 22:27:44+00:00,1
26,2015-01-05 08:08:13+00:00,1
30,2015-01-05 12:00:18+00:00,1
104,2015-01-12 13:28:03+00:00,1


In [None]:
# Create a new dataframe with data summed for the month.
df_new = (df.assign(Day=df['date']).set_index('Day').resample('D').sum())
df_new = pd.DataFrame(df_new)
df_new.tail()



The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0_level_0,bike_accidents
Day,Unnamed: 1_level_1
2022-11-29 00:00:00+00:00,0
2022-11-30 00:00:00+00:00,1
2022-12-01 00:00:00+00:00,3
2022-12-02 00:00:00+00:00,0
2022-12-03 00:00:00+00:00,1


In [None]:
# Plot the number of bike accidents.
import plotly.express as px
fig = px.line(df_new)
fig.show()
print(f"Number of records: {len(df_new)}")

Number of records: 2894


### Create a model and try to predict further bike accidents

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
pd.options.mode.chained_assignment = None
tf.random.set_seed(0)

In [None]:
df_new = df_new.reset_index()
print( df_new.head() )

                        Day  bike_accidents
0 2015-01-01 00:00:00+00:00               1
1 2015-01-02 00:00:00+00:00               1
2 2015-01-03 00:00:00+00:00               0
3 2015-01-04 00:00:00+00:00               0
4 2015-01-05 00:00:00+00:00               2


In [None]:
df_1 = df_new['Day'].fillna(method='ffill')
df_1 = df_1.values.reshape(-1, 1)

scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(df_1)
df_1 = scaler.transform(df_1)

In [None]:

X = []
Y = []

#>>>  generate the input and output sequences <<<
n_lookback = 90  # length of input sequences (lookback period)
n_forecast = 30  # length of output sequences (forecast period)

for i in range(n_lookback, len(df_1) - n_forecast + 1):
    X.append(df_1[i - n_lookback: i])
    Y.append(df_1[i: i + n_forecast])

X = np.array(X)
Y = np.array(Y)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.33,random_state=42)

model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(n_lookback, 1)))
model.add(LSTM(units=50))
model.add(Dense(n_forecast))

model.compile(loss='mean_squared_error', optimizer='adam', metrics=[tf.keras.metrics.Accuracy()])
print(model.summary())
model.fit(X, Y, epochs=5, batch_size=128, verbose=1, validation_data=(X_test,y_test))

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_32 (LSTM)              (None, 90, 50)            10400     
                                                                 
 lstm_33 (LSTM)              (None, 50)                20200     
                                                                 
 dense_16 (Dense)            (None, 30)                1530      
                                                                 
Total params: 32,130
Trainable params: 32,130
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7d1a518e65f0>

In [None]:
X_ = df_1[- n_lookback:]  # last available input sequence

# The model is in a 3D format, so the data needs to be
# modified to reflect this.
X_ = X_.reshape(1, n_lookback, 1)

# The model is going to return an output of whatever
# n_forecast was when the model was trained.
Y_ = model.predict(X_).reshape(-1, 1)

Y_ = scaler.inverse_transform(Y_)
print( Y_.flatten() )


[1.6702919e+18 1.6700745e+18 1.6696345e+18 1.6701701e+18 1.6701140e+18
 1.6703173e+18 1.6697985e+18 1.6709040e+18 1.6709579e+18 1.6704669e+18
 1.6695181e+18 1.6708111e+18 1.6703888e+18 1.6705172e+18 1.6715768e+18
 1.6718473e+18 1.6716116e+18 1.6709901e+18 1.6709115e+18 1.6707956e+18
 1.6717178e+18 1.6718551e+18 1.6712357e+18 1.6720588e+18 1.6707606e+18
 1.6714445e+18 1.6720475e+18 1.6720427e+18 1.6724164e+18 1.6719579e+18]


In [None]:
print( df_new )
df_past = df_new[['bike_accidents']].reset_index()

df_past['Day'] = pd.to_datetime(df_new['Day'])
df_past['Forecast'] = np.nan
df_past['Forecast'].iloc[-1] = df_past['bike_accidents'].iloc[-1]


                           Day  bike_accidents
0    2015-01-01 00:00:00+00:00               1
1    2015-01-02 00:00:00+00:00               1
2    2015-01-03 00:00:00+00:00               0
3    2015-01-04 00:00:00+00:00               0
4    2015-01-05 00:00:00+00:00               2
...                        ...             ...
2889 2022-11-29 00:00:00+00:00               0
2890 2022-11-30 00:00:00+00:00               1
2891 2022-12-01 00:00:00+00:00               3
2892 2022-12-02 00:00:00+00:00               0
2893 2022-12-03 00:00:00+00:00               1

[2894 rows x 2 columns]


In [None]:
from pprint import pprint

df_future = pd.DataFrame(columns=['Day', 'bike_accidents', 'Forecast'])
df_future['Day'] = pd.date_range(start=df_past['Day'].iloc[-1] + pd.Timedelta(days=1), periods=n_forecast)
df_future['Forecast'] = Y_.flatten()

results = df_past.append(df_future)
results = results.set_index('Day')
results.drop(columns=['index'], inplace=True)

pprint(results)
print()

# Verify all the columns are of the same type.
print(results.columns)
for c in results.columns:
    print(f"{c}: {type(results[c].iloc[-1])}")

# Convert to numeric so that it can be graphed.
results['bike_accidents'] = pd.to_numeric(results['bike_accidents'], errors='coerce')

                          bike_accidents      Forecast
Day                                                   
2015-01-01 00:00:00+00:00              1           NaN
2015-01-02 00:00:00+00:00              1           NaN
2015-01-03 00:00:00+00:00              0           NaN
2015-01-04 00:00:00+00:00              0           NaN
2015-01-05 00:00:00+00:00              2           NaN
...                                  ...           ...
2022-12-29 00:00:00+00:00            NaN  1.671444e+18
2022-12-30 00:00:00+00:00            NaN  1.672048e+18
2022-12-31 00:00:00+00:00            NaN  1.672043e+18
2023-01-01 00:00:00+00:00            NaN  1.672416e+18
2023-01-02 00:00:00+00:00            NaN  1.671958e+18

[2924 rows x 2 columns]

Index(['bike_accidents', 'Forecast'], dtype='object')
bike_accidents: <class 'float'>
Forecast: <class 'numpy.float64'>



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [None]:
fig = px.line(results)
fig.show()