In [1]:
# Goal: apply the proper back-transformations and collect all the predictions at the subwindow scale. Later, as a proof of concept, we will separate out the sequences whose densities are all zero and see what the predicted densities look like.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr

In [3]:
# Directory (relative to the notebook's working directory) that holds the forecast outputs;
# both the averaged-forecast CSVs and the all_forecasts .npy files are read from here.
path_to_dfs = "suggestion_2/forecasted_counts/block_0103"

In [4]:
# total number of entries in the output directory, regardless of file type
len(os.listdir(path_to_dfs))

6370

In [5]:
# files per subwindow: 910 is the number of subwindows (the same count is used in range(910)
# when the ordered CSV names are built further down)
len(os.listdir(path_to_dfs)) / 910

7.0

In [6]:
# keep only the averaged-forecast files, i.e. directory entries whose name begins with 'averaged'
averaged_forecasts_csvs = [
    fname for fname in os.listdir(path_to_dfs) if fname.startswith('averaged')
]

In [7]:
# how many averaged-forecast csv files do we have? (one per subwindow is expected)
len(averaged_forecasts_csvs)

910

In [8]:
# look at one file name to learn the naming pattern; os.listdir returns entries in
# arbitrary order, so the first element is not necessarily subwindow 0
averaged_forecasts_csvs[0]

'averaged_forecasts_sub_653.csv'

In [9]:
# build the CSV names explicitly so that list position i corresponds to subwindow i
# (the directory listing itself comes back in arbitrary order)
ordered_csv_files = ['averaged_forecasts_sub_{}.csv'.format(idx) for idx in range(910)]

In [11]:
# ordered_csv_files

In [12]:
%%time
all_dfs = []
for file in ordered_csv_files:
    read_df = pd.read_csv(path_to_dfs + '/' + file)
    all_dfs.append(read_df)

CPU times: user 1.59 s, sys: 297 ms, total: 1.88 s
Wall time: 23.9 s


In [13]:
# place the per-subwindow frames side by side in one wide frame;
# the column labels repeat because every CSV carries the same headers
combined_df = pd.concat(all_dfs, axis="columns")

In [14]:
# preview the first rows of the side-by-side frame
combined_df.head()

Unnamed: 0,True_value,Forecasted_value,True_value.1,Forecasted_value.1,True_value.2,Forecasted_value.2,True_value.3,Forecasted_value.3,True_value.4,Forecasted_value.4,...,True_value.5,Forecasted_value.5,True_value.6,Forecasted_value.6,True_value.7,Forecasted_value.7,True_value.8,Forecasted_value.8,True_value.9,Forecasted_value.9
0,0.0,-0.008622,0.0,-0.00402,0.0,-0.0136,0.0,-0.016808,5e-06,-0.003117,...,0.0,-0.007105,0.0,-0.012972,0.0,-0.008735,0.0,0.00262,0.0,0.005498
1,0.0,-0.003352,0.0,-0.001749,0.0,-0.005783,0.0,0.007102,0.064341,0.000372,...,0.0,3.3e-05,0.0,-0.001411,0.0,0.000213,0.0,0.000874,0.0,0.001592
2,0.0,-0.003265,0.0,-0.0038,0.0,-0.001575,0.0,0.001654,0.0,-0.000544,...,0.0,-0.004479,0.0,-0.006052,0.0,-0.002207,0.0,-0.006147,0.0,-0.003249
3,0.0,-0.002662,0.0,-0.003518,0.0,0.004136,0.0,0.011356,0.0,0.001134,...,0.0,-0.001618,0.0,0.006355,0.0,0.001497,0.0,-0.000994,0.0,0.003673
4,0.0,-0.003706,0.0,-0.004616,0.0,-0.00109,0.0,-0.006067,0.0,0.001924,...,0.0,-0.002414,0.0,0.002233,0.0,0.00259,0.0,-0.004778,0.0,0.003327


In [15]:
# (rows, columns): each of the 910 CSVs contributes a True_value and a Forecasted_value column
combined_df.shape

(7, 1820)

In [16]:
# pull out the true values: the 'True_value' label is repeated once per subwindow,
# so selecting it returns all of those columns rather than a single one
True_values_df = combined_df.loc[:, ['True_value']]

In [17]:
# preview the first rows of the true values (one column per subwindow)
True_values_df.head()

Unnamed: 0,True_value,True_value.1,True_value.2,True_value.3,True_value.4,True_value.5,True_value.6,True_value.7,True_value.8,True_value.9,...,True_value.10,True_value.11,True_value.12,True_value.13,True_value.14,True_value.15,True_value.16,True_value.17,True_value.18,True_value.19
0,0.0,0.0,0.0,0.0,5e-06,0.000668,1.044816e-07,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.064341,0.087708,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.042056,0.01144163,0.0,0.0,0.0,...,0.0,7.683623e-10,5e-06,4.013955e-08,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.402785e-05,0.000649,8.526243e-09,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000726898,0.003864,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# (rows, columns): the column count should equal the number of subwindow CSVs
True_values_df.shape

(7, 910)

In [19]:
# Back-transform the stored true values with exp(x) - 1 to get the proper counts.
# NOTE(review): this presumes the CSV values were written as log(1 + count) -- confirm
# against the script that produced them.
# np.expm1 evaluates exp(x) - 1 without the cancellation error that the literal
# np.exp(x) - 1 suffers when |x| << 1; many entries here are of order 1e-10 .. 1e-5.
exp_true = np.expm1(True_values_df)

In [20]:
# show the back-transformed true values (only a handful of rows, so the whole frame is displayed)
exp_true

Unnamed: 0,True_value,True_value.1,True_value.2,True_value.3,True_value.4,True_value.5,True_value.6,True_value.7,True_value.8,True_value.9,...,True_value.10,True_value.11,True_value.12,True_value.13,True_value.14,True_value.15,True_value.16,True_value.17,True_value.18,True_value.19
0,0.0,0.0,0.0,0.0,5.458158e-06,0.000668,1.044816e-07,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.06645597,0.091669,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.042952,0.01150733,0.0,0.0,0.0,...,0.0,7.683623e-10,5e-06,4.013955e-08,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.402814e-05,0.00065,8.526243e-09,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0007271623,0.003871,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,6.876971e-07,0.022043,0.0005086,0.0,0.0,0.0,...,0.0,0.002217396,0.005884,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.022043,9.133409e-06,0.05401,0.0004412074,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# row-wise total across all subwindow columns
# (it is unclear whether this aggregation is needed at this stage; it is computed anyway)
sum_true_values = exp_true.sum(axis="columns")

In [22]:
# one aggregated true total per row of the frame
sum_true_values

0    40.000661
1    39.000002
2    41.000000
3    31.000000
4    32.000000
5    40.002086
6    27.000176
dtype: float64

In [23]:
# Now what about the predicted values? They are stored as .npy files, so let's work out how to recover the predicted values from them.

In [25]:
# Open question: it is not clear how the forecasted values were computed; hopefully by the end of this script we will have figured out exactly what we need.

In [26]:
# keep only the raw forecast arrays, i.e. directory entries whose name begins with 'all_forecasts'
all_forecasts_npy_files = [
    fname for fname in os.listdir(path_to_dfs) if fname.startswith('all_forecasts')
]

In [27]:
# number of forecast arrays found (one per subwindow is expected)
len(all_forecasts_npy_files)

910

In [28]:
# look at one file name to see the naming pattern (os.listdir order is arbitrary)
all_forecasts_npy_files[0]

'all_forecasts_sub_273.npy'

In [29]:
# Load the forecasts of a single subwindow (273) for a first look.
# The file is named explicitly rather than taken as all_forecasts_npy_files[0]:
# os.listdir returns names in arbitrary order, so index 0 is not guaranteed to be
# subwindow 273 on another run or machine, and the variable name would then no
# longer match its contents.
sub_273_forecasts = np.load(os.path.join(path_to_dfs, 'all_forecasts_sub_273.npy'))

In [30]:
# dimensions of the loaded forecast array
# NOTE(review): the meaning of each axis (values, chains, test time points?) is not established here -- confirm
sub_273_forecasts.shape

(1000, 4, 7)

In [31]:
# Okay, so for subwindow 273 this file seems to hold all 1000 values from each of the 4 chains, for every test time point in the sequence.

In [32]:
# Back-transform the forecasts with exp(x) - 1; np.expm1 is the numerically stable
# form of np.exp(x) - 1 for values close to zero.
# NOTE(review): this presumes the forecasts live on a log(1 + y) scale -- confirm.
# Caution: the cell overwrites its own input, so running it a second time applies the
# transform twice; reload the array before re-executing.
sub_273_forecasts = np.expm1(sub_273_forecasts)

In [33]:
# range after the back-transform; a negative minimum means some raw forecasts were below zero
sub_273_forecasts.min(), sub_273_forecasts.max()

(-0.5817418, 1.3388767)

In [34]:
# clamp negative forecasts to zero, writing the result back into the same array
# (whether clamping is the right treatment is still an open question)
sub_273_forecasts[...] = sub_273_forecasts.clip(min=0)

In [35]:
# confirm the clamp: the minimum should now be 0 while the maximum is unchanged
sub_273_forecasts.min(), sub_273_forecasts.max()

(0.0, 1.3388767)

In [36]:
# Cool - but what do these values mean?