In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr

In [None]:
# path to dataframes
path_to_dfs = "forecasted_counts/block_0106/"

In [None]:
# os.listdir(path_to_dfs)

In [None]:
averaged_forecasts_csvs = []
for file in os.listdir(path_to_dfs):
    if file[:8] =='averaged':
        averaged_forecasts_csvs.append(file)

In [None]:
averaged_forecasts_csvs.sort()
averaged_forecasts_csvs

In [None]:
# add the 10, 11 at the end
csv_files_10_11 = ['averaged_forecasts_sub_10.csv', 'averaged_forecasts_sub_11.csv']

In [None]:
other_files = [i for i in averaged_forecasts_csvs if i not in csv_files_10_11]

In [None]:
ordered_csv_files = other_files + csv_files_10_11

In [None]:
ordered_csv_files

In [None]:
all_dfs = []
for file in ordered_csv_files:
    read_df = pd.read_csv(path_to_dfs + '/' + file)
    all_dfs.append(read_df)

In [None]:
# make a single dataframe out of the many
combined_df = pd.concat(all_dfs, axis = 1)

In [None]:
combined_df.head()

In [None]:
combined_df.shape

In [None]:
# get the true values
True_values_df = combined_df[['True_value']]

In [None]:
True_values_df

In [None]:
# sum the true values - since the sub images are not overlapping, we can get the sum across columns in the above dataframe
sum_true_values = True_values_df.sum(axis = 1)

In [None]:
sum_true_values

In [None]:
# get the forecasted values
Forecasted_values_df = combined_df[['Forecasted_value']]

In [None]:
Forecasted_values_df

In [None]:
Forecasted_values_df.shape

In [None]:
sum_forecasted_values = Forecasted_values_df.sum(axis = 1)

In [None]:
sum_forecasted_values

In [None]:
# concatenate the sum dataframes
block_0106_true_and_forecasted_values_df = pd.concat((sum_true_values, sum_forecasted_values), axis = 1)

In [None]:
block_0106_true_and_forecasted_values_df.columns = ["True_count", "Forecasted_count"]

In [None]:
block_0106_true_and_forecasted_values_df

In [None]:
block_0106_true_and_forecasted_values_df.to_csv("final_forecasted_counts/block_0106_finals_forecasts.csv", index = False)

In [None]:
# # well we need to verify that the above true values are indeed correct
# from preprocess script we have the following counts
# [43, 49, 53, 59, 45, 42, 34, 39, 37, 43, 41, 39, 43, 40, 39, 41, 31, 32, 40, 29] # the last seven numbers match with what we have.

In [None]:
# blockwise mean squared error
rmse = np.sqrt(mean_squared_error(sum_true_values, sum_forecasted_values))
rmse

In [None]:
# blockwise mean absolute error
mae = mean_absolute_error(sum_true_values, sum_forecasted_values)
mae

In [None]:
corr = pearsonr(sum_true_values, sum_forecasted_values)
corr

In [None]:
# Maybe define a function for this, so that it is easier to get the forecasted dataframes for all blocks

In [None]:
# we need first the path to dfs per block
def get_final_forecasted_dfs(path_to_dfs_in_block, block_name, true_values_col_name, forecasted_values_col_name, forecast_path):
    """Sum the per-sub-image averaged forecasts of one block and score the totals.

    Every file in ``path_to_dfs_in_block`` whose name starts with 'averaged' and
    ends with '.csv' is read, the frames are placed side by side, and the columns
    named ``true_values_col_name`` / ``forecasted_values_col_name`` are summed
    row-wise. The two totals are written to ``<forecast_path>/<block_name>.csv``
    with the columns "True_count" and "Forecasted_count".

    Parameters
    ----------
    path_to_dfs_in_block : str
        Directory containing the averaged forecast CSV files of the block.
    block_name : str
        Stem of the output CSV file.
    true_values_col_name : str
        Name of the column holding the true counts in each CSV.
    forecasted_values_col_name : str
        Name of the column holding the forecasted counts in each CSV.
    forecast_path : str
        Output directory; it is created when it does not exist yet.

    Returns
    -------
    tuple
        ``(true_and_forecasted_values_df, rmse, mae, ordered_csv_files)`` where
        ``rmse`` / ``mae`` compare the summed true and forecasted counts and
        ``ordered_csv_files`` lists the file names in the order they were read.

    Raises
    ------
    FileNotFoundError
        If the directory contains no matching CSV file.
    ValueError
        If the CSV files do not all have the same number of rows.
    """
    def _sub_image_order(file_name):
        # Sort by the integer after the last '_' so that 'sub_10' follows 'sub_9'
        # (a plain string sort would put it right after 'sub_1'). Names without
        # a numeric suffix are placed last, in alphabetical order.
        suffix = os.path.splitext(file_name)[0].rsplit('_', 1)[-1]
        if suffix.isdecimal():
            return (0, int(suffix), file_name)
        return (1, 0, file_name)

    # collect the averaged-forecast CSVs that are actually present, in sub-image order
    ordered_csv_files = sorted(
        (name for name in os.listdir(path_to_dfs_in_block)
         if name.startswith('averaged') and name.endswith('.csv')),
        key = _sub_image_order,
    )
    if not ordered_csv_files:
        raise FileNotFoundError("no 'averaged*.csv' files found in " + str(path_to_dfs_in_block))
    # print the ordered list of csv files
    print(ordered_csv_files)
    # read the dataframes in that order
    all_dfs = [pd.read_csv(os.path.join(path_to_dfs_in_block, name)) for name in ordered_csv_files]
    # concat(axis=1) would pad shorter frames with NaN and the sums below would
    # silently skip those cells, so unequal lengths are rejected up front
    if len({len(one_df) for one_df in all_dfs}) > 1:
        raise ValueError("averaged forecast CSVs in " + str(path_to_dfs_in_block) + " differ in number of rows")
    # combine all these dfs together
    combined_df = pd.concat(all_dfs, axis = 1)
    print(combined_df.shape)
    # extract the true value columns only across the sub-images
    True_counts_df = combined_df[[true_values_col_name]]
    # sum the true values dfs
    total_true_values = True_counts_df.sum(axis = 1)
    # print these true values for later comparisons
    print(total_true_values)
    # extract the forecasted value columns across the sub-images
    Forecasted_counts_df = combined_df[[forecasted_values_col_name]]
    # sum the forecasted values
    total_forecasted_values = Forecasted_counts_df.sum(axis = 1)
    # concatenate the sum dataframes
    true_and_forecasted_values_df = pd.concat((total_true_values, total_forecasted_values), axis = 1)
    # add column titles to the df
    true_and_forecasted_values_df.columns = ["True_count", "Forecasted_count"]
    # save this file, creating the output directory first if needed
    if forecast_path:
        os.makedirs(forecast_path, exist_ok = True)
    file_name = os.path.join(forecast_path, block_name + '.csv')
    true_and_forecasted_values_df.to_csv(file_name, index = False)
    # blockwise root mean squared error
    rmse = np.sqrt(mean_squared_error(total_true_values, total_forecasted_values))
    # blockwise mae
    mae = mean_absolute_error(total_true_values, total_forecasted_values)

    return(true_and_forecasted_values_df, rmse, mae, ordered_csv_files)

In [None]:
# see if the function works
df, rmse_0106, mae_0106, ordered_files_0106 = get_final_forecasted_dfs(path_to_dfs, 'block_0106', 'True_value', 'Forecasted_value', 'final_forecasted_counts')

In [None]:
df

In [None]:
rmse_0106

In [None]:
mae_0106

In [None]:
ordered_files_0106

In [None]:
# Getting the coverages and the widths for the forecasted values - This can be done with the saved all forecasts npy files
# location for the forecast files (all forecasts)
location_all_forecasts = 'forecasted_counts/block_0106'

# contents at this location
all_contents = os.listdir(location_all_forecasts)
all_contents.sort()

In [None]:
# we need the npy files for all forecasts
all_forecast_files = [file for file in all_contents if file[:3] == 'all']

In [None]:
# arange the files in order
later_npy_files = ['all_forecasts_sub_10.npy', 'all_forecasts_sub_11.npy']
first_files = [file for file in all_forecast_files if file not in later_npy_files]

In [None]:
final_all_forecast_files = first_files + later_npy_files

In [None]:
loaded_npy_files = []
for file in final_all_forecast_files:
    joined_path = os.path.join(location_all_forecasts, file)
    load_file = np.load(joined_path)
    loaded_npy_files.append(load_file)

In [None]:
len(loaded_npy_files)

In [None]:
output = sum(loaded_npy_files)

In [None]:
output.shape

In [None]:
final_array = output.reshape(4000,7)

In [None]:
final_array

In [None]:
li_train = np.percentile(final_array, axis = 0, q = (2.5, 97.5))[0,:].reshape(-1,1)    
ui_train = np.percentile(final_array, axis = 0, q = (2.5, 97.5))[1,:].reshape(-1,1)

In [None]:
li_train

In [None]:
ui_train

In [None]:
width_train = ui_train - li_train
avg_width_train = width_train.mean(0)[0]

In [None]:
avg_width_train

In [None]:
y_traina = block_0106_true_and_forecasted_values_df[["True_count"]].values

In [None]:
ind_train = (y_traina >= li_train) & (y_traina <= ui_train)
coverage_train= ind_train.mean(0)[0]

In [None]:
coverage_train

Verify that the true counts we have for the test data are indeed correct.

In [None]:
# Verify the true counts - from np arrays - location Block_0103/sub_images_and_counts

sub_count_loc = 'all_preprocessed_data/Block_0106/sub_images_and_counts'

In [None]:
sub_density_maps = [i for i in os.listdir(sub_count_loc) if i.split(".")[0][-7:] == 'density']
sub_density_maps.sort()

In [None]:
# get the test dates
test_time_periods = ['2020_08_26', '2020_08_27', '2020_08_28' ,'2020_08_31', '2020_09_02', '2020_09_07', '2020_09_16']

In [None]:
test_time_periods = ['Block0106_' + i for i in test_time_periods]

In [None]:
# print the test dates
test_time_periods

In [None]:
# get the density maps for these days only for the computation of the total true counts of the test images
test_density_maps = [i for i in sub_density_maps if i[:20] in test_time_periods]
test_density_maps.sort()

In [None]:
%%time
# get the true counts
true_counts_in_order = []
step = 0
# we have only seven time points
for u in range(7):
    catch_counts = []
    # for each time steps we have 12 images, and since the sub windows are not overlapping we can add the values straightaway eben without sorting
    for j in range(step, step + 12):
        total_count = np.sum(np.load(os.path.join(sub_count_loc, test_density_maps[j])))
        catch_counts.append(total_count)
    true_counts_in_order.append(np.sum(catch_counts))
    step = step + 12

In [None]:
true_counts_in_order

In [None]:
df[['True_count']].values.flatten()

In [None]:
# see if the two values we get match 

In [None]:
np.mean(np.round(true_counts_in_order, 0) == np.round(df[['True_count']].values.flatten(), 0))