# ML pipeline

##### This notebook ties together the stages of the pipeline that are implemented in the external script files.
 The pipeline runs as follows: 
 - Data loading
 - Feature engineering 
 - Training and validation 
 - Testing 
 - Plotting the results 
 


### Data loading 

In [None]:
# file name of the raw 911-calls CSV export that the pipeline starts from
raw_data_name = "Seattle_Real_Time_Fire_911_Calls.csv"

### Feature engineering

In [None]:
!python features_engineering.py {raw_data_name}

### Training and validation 

In [None]:
import tensorflow_decision_forests as tfdf
import pandas as pd
import shutil 
import numpy as np
import pickle

In [None]:
# load the cleaned dataset from disk, then list its columns as the cell output
dataset_full = pd.read_csv(filepath_or_buffer="clean_data.csv")
dataset_full.columns

In [None]:
# restrict the frame to the calendar columns plus calls_per_hour,
# which is the column later used as the regression label
cols = [
    'hour',
    'day',
    'month',
    'year',
    'season',
    'calls_per_hour',
]

dataset = dataset_full.loc[:, cols]

In [None]:
# training set: every row whose year is one of the five years before 2022 (2017-2021)
# NOTE(review): reset_index() without drop=True keeps the previous row labels
# as an extra "index" column, and that column is pickled together with the
# frame below — confirm that train.py does not consume it as a feature.
train_years = range(2017, 2022)
in_train_window = dataset['year'].isin(train_years)
train_data = dataset[in_train_window].reset_index()
train_data.head(4)

In [None]:
# hold-out set: the rows of the current year (2022) serve as test data
# NOTE(review): reset_index() without drop=True keeps the previous row labels
# as an extra "index" column in this frame — confirm that downstream code
# (test.py, the prediction cell) is meant to see it.
is_2022 = dataset['year'].eq(2022)
test_data = dataset.loc[is_2022].reset_index()
test_data.head(4)

In [None]:
# write both splits to disk as pickles so the external train/test scripts can load them
for split_frame, pickle_path in (
    (train_data, "train_data.pkl"),
    (test_data, "test_data.pkl"),
):
    split_frame.to_pickle(pickle_path)

In [None]:
# file name of the pickled training split, handed to the training script
name_train = "train_data.pkl"

In [None]:
#train the model
!python train.py {name_train}

### Testing

In [None]:
# Test and evaluate the model by running the external test script on the pickled test split.
# NOTE(review): which metrics test.py reports is not visible from this notebook — see test.py.
!python test.py test_data.pkl

### Plotting the results 

In [None]:
# Load the trained model from disk.
# The context manager closes the file handle as soon as loading finishes; the
# previous bare open(...) call left the handle open.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load a model.sav that was produced by this pipeline.
with open('./model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# wrap the test frame as a TF-DF regression dataset with calls_per_hour as the label
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_data,
    label='calls_per_hour',
    task=tfdf.keras.Task.REGRESSION,
)
# predict, cast to int (for float predictions this truncates toward zero) and
# keep the first value of every prediction row as a flat array
# NOTE(review): assumes model.predict returns a 2-D array, one row per sample — confirm
raw_predictions = model.predict(test_ds)
np_result = np.array(raw_predictions.astype(int)[:, 0])

In [None]:
# Create the result frame as a copy of the test data so the predictions can be
# attached to it without touching test_data.
# drop=True returns a new frame with a clean 0..n-1 index and does not insert
# the previous index as yet another (redundant) column.
result = test_data.reset_index(drop=True)

In [None]:
# Add the predicted values as a new column.
# The array is assigned directly (row i of the frame gets prediction i). This
# avoids the array -> list -> DataFrame round trip, and a length mismatch now
# raises instead of being silently padded with NaN by index alignment.
result['predicted'] = np_result

In [None]:
# Summary used for visualisation: one row per (year, season, month, day) with
# the summed real and predicted call counts.
# The value columns are selected with a list; selecting them with a bare tuple
# (['a', 'b'] written as 'a', 'b') is rejected by pandas >= 2.0.
summary = (
    result
    .groupby(['year', 'season', 'month', 'day'])[['calls_per_hour', 'predicted']]
    .sum()
    .reset_index()
)

In [None]:
# Week-of-month bucket (1-4) used to plot call volume per week per month:
# days 1-7 -> 1, 8-14 -> 2, 15-21 -> 3, 22 and later -> 4.
# The previous formula int(day/7) % 4 + 1 put day 7 into week 2 and wrapped
# days 28-31 back into week 1, mixing month-end days into the first week.
# NOTE(review): assumes `day` is the day of the month (1-31) — confirm against
# features_engineering.py.
summary['week'] = (((summary['day'] - 1) // 7).clip(upper=3) + 1).astype(int)

In [None]:
# show the summary frame (grouping keys, summed real/predicted calls, week bucket) as the cell output
summary

In [None]:
# Data to plot: real vs. predicted call totals per (month, week).
# The value columns are selected with a list; selecting them with a bare tuple
# (['a', 'b'] written as 'a', 'b') is rejected by pandas >= 2.0.
data_plot = (
    summary
    .groupby(['month', 'week'])[['calls_per_hour', 'predicted']]
    .sum()
    .reset_index()
)

In [None]:
# fold month and week into one number so every week can be shown next to its
# month on a single axis: week goes in the units digit, month in front of it
# (e.g. 13 = January, week 3)
data_plot['m_w'] = data_plot['week'] + 10 * data_plot['month']

In [None]:
# turn the month_week codes into strings so that every code is displayed in full on the x axis
data_plot = data_plot.astype({'m_w': str})

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the real and the predicted weekly call volume on one shared Axes.
# Each line carries its own label, so the legend is built from the artists
# themselves; plt.legend(labels=[...]) only matches labels to artists by draw
# order, which matplotlib's documentation discourages.
fig, ax = plt.subplots()
sns.lineplot(x='m_w', y='calls_per_hour', data=data_plot, label='calls_per_hour', ax=ax)
sns.lineplot(x='m_w', y='predicted', data=data_plot, label='predicted', ax=ax)
p = ax  # keep the original name bound to the Axes for any later cell that uses it

ax.set_xlabel("weeks of 2022 (13 => 3rd week of January)")
ax.set_ylabel("calls volume")
ax.set_title('Comparison between the real and predicted 911 calls volume per week for Seattle in 2022')
ax.legend();