In [13]:
import tensorflow as tf
import pandas as pd
from datetime import datetime

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers


import warnings
warnings.filterwarnings("ignore")

## Utils

In [14]:
def read_arrests():
    # Loading Arrests dataset
    arrests_df = pd.read_csv('../data/Pulled/Opioid_Seizures_and_Arrests_CY_2013_-_Current_Quarterly_County_State_Police.csv')

    # removing the index column and removing any duplicates in the initial dataset
    arrests_df.drop('Unnamed: 0', axis=1, inplace=True)
    arrests_df.drop_duplicates(inplace=True)

    keep_columns = ['County Name', 'Year', 'Qtr', 'Drug', 'Incident Count', 'Drug Quantity', 'Arrests']
    arrests_df = arrests_df[keep_columns]
    
    arrests_df = arrests_df.groupby(['County Name', 'Year', 'Qtr', 'Drug']).sum().reset_index()
    arrests_df.rename(columns={'County Name': 'County',
                               'Incident Count': 'Count-seized',
                               'Qtr': 'Quarter'
                               },
                      inplace=True)
    
    arrests_df = arrests_df.pivot(index=['County', 'Year', 'Quarter'],
                                  columns='Drug',
                                  values=['Count-seized', 'Arrests', 'Drug Quantity']).reset_index()

    arrests_df.columns = arrests_df.columns.map('-'.join).str.strip('-')
    arrests_df.columns = arrests_df.columns = [s.replace('-', ' - ') for s in arrests_df.columns]

    arrests_df = arrests_df.fillna(0)
    
    return arrests_df

In [15]:
def create_timeseries_data(county = "Philadelphia"):
    incidents_df = pd.read_csv('../data/Aggregated/incidents.csv')
    
    incidents_df = incidents_df[incidents_df['Incident County Name'] == county]
    
    incidents_df['Quarter'] = incidents_df['Incident Date'].apply(lambda x: "Q" + str(pd.Timestamp(datetime.strptime(x,"%m/%d/%Y")).quarter))
    
    incidents_df["Fentanyl"] = incidents_df["All Drugs"].apply(lambda x: 1 if "FENTANYL" in x else 0)
    
    incidents_df["Heroin"] = incidents_df["All Drugs"].apply(lambda x: 1 if "HEROIN" in x else 0)
    
    incidents_df["Year"] = incidents_df["Incident Date"].apply(lambda x: datetime.strptime(x,"%m/%d/%Y").year)
    
    incidents_df['Count'] = 1

    return incidents_df
    

In [24]:
test = read_arrests()
time_series_df = create_timeseries_data()

In [25]:
ls = ["Incident County Name", "Naloxone Administered", "All Drugs", "Quarter", "Year", "Fentanyl", "Heroin", "Count"]
time_series_df = time_series_df[ls]
time_series_df = time_series_df.groupby(['Incident County Name',
                                         'Year',
                                         'Quarter']).sum().reset_index()


# Sanity Check
test[(test['County'] == 'Philadelphia') & (test['Year'] > 2017)]

Unnamed: 0,County,Year,Quarter,Count - seized - Fentanyl,Count - seized - Heroin,Count - seized - Opium,Arrests - Fentanyl,Arrests - Heroin,Arrests - Opium,Drug Quantity - Fentanyl,Drug Quantity - Heroin,Drug Quantity - Opium
1690,Philadelphia,2018,Q1,3.0,23.0,0.0,1.0,24.0,0.0,0.0082,4.943441,0.0
1691,Philadelphia,2018,Q2,4.0,38.0,1.0,4.0,29.0,1.0,6.345,0.614977,1e-06
1692,Philadelphia,2018,Q3,2.0,43.0,0.0,4.0,44.0,0.0,3.0023,5.531291,0.0
1693,Philadelphia,2018,Q4,2.0,26.0,0.0,1.0,32.0,0.0,5.001,0.966192,0.0
1694,Philadelphia,2019,Q1,5.0,51.0,0.0,5.0,60.0,0.0,0.74392,1.655758,0.0
1695,Philadelphia,2019,Q2,17.0,73.0,0.0,15.0,62.0,0.0,6.25695,11.601448,0.0
1696,Philadelphia,2019,Q3,10.0,53.0,0.0,8.0,48.0,0.0,4.22928,7.77491,0.0
1697,Philadelphia,2019,Q4,12.0,47.0,0.0,3.0,56.0,0.0,2.05995,0.193859,0.0
1698,Philadelphia,2020,Q1,25.0,31.0,0.0,7.0,28.0,0.0,4.65836,0.52653,0.0
1699,Philadelphia,2020,Q2,8.0,13.0,0.0,5.0,8.0,0.0,2.38036,0.35948,0.0


In [26]:
time_series_df.rename(columns={'Incident County Name': 'County'}, inplace=True)
joined_df = time_series_df.merge(test, right_on=['County', 'Year', 'Quarter'], left_on=['County', 'Year', 'Quarter'])

joined_df

Unnamed: 0,County,Year,Quarter,Naloxone Administered,Fentanyl,Heroin,Count,Count - seized - Fentanyl,Count - seized - Heroin,Count - seized - Opium,Arrests - Fentanyl,Arrests - Heroin,Arrests - Opium,Drug Quantity - Fentanyl,Drug Quantity - Heroin,Drug Quantity - Opium
0,Philadelphia,2018,Q1,8,0,7,9,3.0,23.0,0.0,1.0,24.0,0.0,0.0082,4.943441,0.0
1,Philadelphia,2018,Q2,9,1,9,9,4.0,38.0,1.0,4.0,29.0,1.0,6.345,0.614977,1e-06
2,Philadelphia,2018,Q3,19,6,15,20,2.0,43.0,0.0,4.0,44.0,0.0,3.0023,5.531291,0.0
3,Philadelphia,2018,Q4,3,0,3,3,2.0,26.0,0.0,1.0,32.0,0.0,5.001,0.966192,0.0
4,Philadelphia,2019,Q1,9,3,10,10,5.0,51.0,0.0,5.0,60.0,0.0,0.74392,1.655758,0.0
5,Philadelphia,2019,Q2,9,2,9,10,17.0,73.0,0.0,15.0,62.0,0.0,6.25695,11.601448,0.0
6,Philadelphia,2019,Q3,15,6,16,16,10.0,53.0,0.0,8.0,48.0,0.0,4.22928,7.77491,0.0
7,Philadelphia,2019,Q4,3,1,3,4,12.0,47.0,0.0,3.0,56.0,0.0,2.05995,0.193859,0.0
8,Philadelphia,2020,Q1,3,2,3,4,25.0,31.0,0.0,7.0,28.0,0.0,4.65836,0.52653,0.0
9,Philadelphia,2020,Q2,9,3,8,9,8.0,13.0,0.0,5.0,8.0,0.0,2.38036,0.35948,0.0


# Nerual Netwrok

In [9]:
y = joined_df["Count"]
joined_df.drop(["Count"], axis = 1, inplace = True)
ls = joined_df.columns.tolist()
ls = [i for i in ls if i not in ("County", "Year", "Quarter")]

In [10]:
def split_test(features_df: pd.DataFrame, target_series: pd.DataFrame, split_percentage: float):
    train_len = int(features_df.shape[0] * split_percentage) + 1
    
    return features_df.values[:train_len,:], target_series.values[:train_len], features_df.values[train_len:, :], target_series.values[train_len:]

In [13]:
x_train, y_train, x_test, y_test = split_test(joined_df[ls],y, 0.8)

In [20]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# test_df = joined_df.copy()
scalers = []
x_train_scaled = []

for i, col in enumerate(ls):
    _ = scaler.fit(x_train[:, i].reshape(-1, 1))
    scalers.append(_)
    x_train_scaled.append(tf.convert_to_tensor(_.transform(x_train[:, i].reshape(-1, 1))))

y_scaler = scaler.fit(y_train.reshape(-1,1))
y_train_scaled = tf.convert_to_tensor(y_scaler.transform(y_train.reshape(-1, 1)))

In [85]:
"""
Thanks to Nicolas Grevais for the code below:
https://stackoverflow.com/a/65567559
"""
inputs, outputs = zip(*tf.keras.preprocessing.timeseries_dataset_from_array(x_train_scaled, y_train_scaled,
                                                          sequence_length=3,
                                                          batch_size=2))
inputs = tf.squeeze(inputs).numpy()
outputs = tf.squeeze(outputs).numpy()
# # Sanity Check
# for present_values, next_value in ds.take(5):
#     print(tf.squeeze(present_values).numpy(), '-->', next_value.numpy())

In [121]:
# Steps below were taken from Keras documentation
model = keras.Sequential()
# Add an Embedding layer expecting input vocab of size 1000, and
# # output embedding dimension of size 64.
# model.add(layers.Embedding(input_dim=12, output_dim=1))

# Add a LSTM layer with 128 internal units.
model.add(layers.LSTM(units = 1, batch_input_shape = (2, 3, 17), input_shape=(2, 3, 17), stateful=True ))

# Add a Dense layer with 1 units.
model.add(layers.Dense(1))

model.summary()


Model: "sequential_31"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_29 (LSTM)              (2, 1)                    76        
                                                                 
 dense_14 (Dense)            (2, 1)                    2         
                                                                 
Total params: 78
Trainable params: 78
Non-trainable params: 0
_________________________________________________________________


In [124]:
model.compile(loss = "mean_absolute_percentage_error",
              optimizer = tf.keras.optimizers.experimental.SGD(learning_rate=0.001), 
              metrics = ['MSE'])


model.fit(
    inputs, outputs, epochs=100
)

Epoch 1/100


ValueError: in user code:

    File "/usr/local/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/site-packages/keras/engine/training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "/usr/local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.10/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_31" is incompatible with the layer: expected shape=(None, 3, 17), found shape=(None, 2, 3, 17)


In [None]:
preds = model.predict(x_test)

In [43]:
preds, y_test

(array([[3.9912987],
        [3.9912987],
        [7.994218 ],
        [3.9912987],
        [3.9912987],
        [3.9912987],
        [4.6682606],
        [4.293782 ],
        [3.991299 ]], dtype=float32),
 array([  4,  10,  42, 100,  95,  96,  68, 124,  15]))