In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from avaml.aggregatedata import ForecastDataset, LabeledData, REG_ENG, CsvMissingError

# Read in data

In [2]:
model_prefix = ''
days = 7
regobs_types = list(REG_ENG.keys())
labeled_data = None
try:
    print("Reading csv")
    labeled_data = LabeledData.from_csv(days=days, regobs_types=regobs_types, with_varsom=True)
except CsvMissingError:
    print("Csv missing. Fetching online data. (This takes a long time.)")
    labeled_data = ForecastDataset(regobs_types=regobs_types).label(days=days, with_varsom=True)
    labeled_data.to_csv()

Reading csv


# EDA

In [3]:
labeled_data

<avaml.aggregatedata.LabeledData at 0x7f61d0091a90>

In [4]:
vars(labeled_data).keys()

dict_keys(['data', 'row_weight', 'label', 'pred', 'days', 'with_varsom', 'regobs_types', 'single', 'seasons'])

In [5]:
vars(labeled_data)

{'data':                 region_id_3001 region_id_3002 region_id_3003 region_id_3004  \
                              0              0              0              0   
 2017-12-07 3003            0.0            0.0            1.0            0.0   
            3007            0.0            0.0            0.0            0.0   
            3009            0.0            0.0            0.0            0.0   
            3010            0.0            0.0            0.0            0.0   
            3011            0.0            0.0            0.0            0.0   
 ...                        ...            ...            ...            ...   
 2021-01-12 3031            0.0            0.0            0.0            0.0   
            3032            0.0            0.0            0.0            0.0   
            3034            0.0            0.0            0.0            0.0   
            3035            0.0            0.0            0.0            0.0   
            3037            0.0 

In [6]:
labeled_data.label

Unnamed: 0_level_0,Unnamed: 1_level_0,CLASS,CLASS,CLASS,CLASS,CLASS,CLASS,CLASS,CLASS,CLASS,CLASS,...,REAL,REAL,REAL,REAL,REAL,REAL,REAL,REAL,REAL,REAL
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,drift-slab,drift-slab,drift-slab,drift-slab,...,new-loose,new-loose,new-slab,new-slab,pwl-slab,pwl-slab,wet-loose,wet-loose,wet-slab,wet-slab
Unnamed: 0_level_2,Unnamed: 1_level_2,danger_level,emergency_warning,problem_1,problem_2,problem_3,problem_amount,cause,dist,dsize,lev_fill,...,lev_max,lev_min,lev_max,lev_min,lev_max,lev_min,lev_max,lev_min,lev_max,lev_min
2017-12-07,3003,2,Ikke gitt,drift-slab,,,1,snowdrift,1,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-12-07,3007,2,Ikke gitt,pwl-slab,drift-slab,,2,new-snow,2,2,1,...,0.0,0.0,0.0,0.0,700.0,200.0,0.0,0.0,0.0,0.0
2017-12-07,3009,2,Ikke gitt,pwl-slab,drift-slab,,2,new-snow,2,2,1,...,0.0,0.0,0.0,0.0,700.0,200.0,0.0,0.0,0.0,0.0
2017-12-07,3010,3,Ikke gitt,drift-slab,pwl-slab,,2,new-snow,3,2,1,...,0.0,0.0,0.0,0.0,700.0,200.0,0.0,0.0,0.0,0.0
2017-12-07,3011,3,Ikke gitt,drift-slab,pwl-slab,,2,new-snow,3,2,1,...,0.0,0.0,0.0,0.0,700.0,200.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-12,3031,3,Ikke gitt,pwl-slab,,,1,0,0,0,0,...,0.0,0.0,0.0,0.0,600.0,600.0,0.0,0.0,0.0,0.0
2021-01-12,3032,3,Ikke gitt,pwl-slab,,,1,0,0,0,0,...,0.0,0.0,0.0,0.0,800.0,800.0,0.0,0.0,0.0,0.0
2021-01-12,3034,3,Ikke gitt,pwl-slab,,,1,0,0,0,0,...,0.0,0.0,0.0,0.0,600.0,600.0,0.0,0.0,0.0,0.0
2021-01-12,3035,3,Ikke gitt,pwl-slab,,,1,0,0,0,0,...,0.0,0.0,0.0,0.0,800.0,800.0,0.0,0.0,0.0,0.0


In [7]:
labeled_data.data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,region_id_3001,region_id_3002,region_id_3003,region_id_3004,region_id_3005,region_id_3006,region_id_3007,region_id_3008,region_id_3009,region_id_3010,...,regobs_snowprofile_t_min_4,regobs_snowprofile_t_min_4,regobs_snowprofile_t_min_4,regobs_snowprofile_t_min_4,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,0,0,0,0,0,0,0,0,0,0,...,4,5,6,7,2,3,4,5,6,7
2017-12-07,3003,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-12-07,3007,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-12-07,3009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-12-07,3010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-12-07,3011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
labeled_data.data.shape

(12651, 6562)

In [9]:
labeled_data.data.index

MultiIndex([('2017-12-07', 3003),
            ('2017-12-07', 3007),
            ('2017-12-07', 3009),
            ('2017-12-07', 3010),
            ('2017-12-07', 3011),
            ('2017-12-07', 3012),
            ('2017-12-07', 3013),
            ('2017-12-07', 3014),
            ('2017-12-07', 3015),
            ('2017-12-07', 3016),
            ...
            ('2021-01-12', 3023),
            ('2021-01-12', 3024),
            ('2021-01-12', 3027),
            ('2021-01-12', 3028),
            ('2021-01-12', 3029),
            ('2021-01-12', 3031),
            ('2021-01-12', 3032),
            ('2021-01-12', 3034),
            ('2021-01-12', 3035),
            ('2021-01-12', 3037)],
           length=12651)

In [10]:
# get first level index, date
labeled_data.data.index.get_level_values(0)

Index(['2017-12-07', '2017-12-07', '2017-12-07', '2017-12-07', '2017-12-07',
       '2017-12-07', '2017-12-07', '2017-12-07', '2017-12-07', '2017-12-07',
       ...
       '2021-01-12', '2021-01-12', '2021-01-12', '2021-01-12', '2021-01-12',
       '2021-01-12', '2021-01-12', '2021-01-12', '2021-01-12', '2021-01-12'],
      dtype='object', length=12651)

In [11]:
# get second level index, region
labeled_data.data.index.get_level_values(1)

Int64Index([3003, 3007, 3009, 3010, 3011, 3012, 3013, 3014, 3015, 3016,
            ...
            3023, 3024, 3027, 3028, 3029, 3031, 3032, 3034, 3035, 3037],
           dtype='int64', length=12651)

### Clustering data by region and time

I am interested in understanding how certain input variables are correlated to the output variables of interest. This includes how different snow pack conditions vary with avlanche problems and danger level warnings, but also how these input and output variables change with time, across different regions, etc.

The data is formatted as a MultiIndex in Pandas. This means we can first subset by time, then by region ID.

In [12]:
# look at df from one day
labeled_data.data.loc['2017-12-07'].shape

(21, 6562)

In [13]:
labeled_data.data.loc['2017-12-07']

Unnamed: 0_level_0,region_id_3001,region_id_3002,region_id_3003,region_id_3004,region_id_3005,region_id_3006,region_id_3007,region_id_3008,region_id_3009,region_id_3010,...,regobs_snowprofile_t_min_4,regobs_snowprofile_t_min_4,regobs_snowprofile_t_min_4,regobs_snowprofile_t_min_4,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy
Unnamed: 0_level_1,0,0,0,0,0,0,0,0,0,0,...,4,5,6,7,2,3,4,5,6,7
3003,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3007,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# now subset by day and region
labeled_data.data.loc['2017-12-07', 3003]

region_id_3001  0    0.0
region_id_3002  0    0.0
region_id_3003  0    1.0
region_id_3004  0    0.0
region_id_3005  0    0.0
                    ... 
accuracy        3    0.0
                4    0.0
                5    0.0
                6    0.0
                7    0.0
Name: (2017-12-07, 3003), Length: 6562, dtype: float64

**Does the `LabeledData` class have a method that will drop extra region columns (`region_id_3001`, for example) and just get the columns of that pertain to the region of interest?**

In [15]:
# first, rename indices
labeled_data.data = labeled_data.data.rename_axis(['date','region'])

In [16]:
labeled_data.data

Unnamed: 0_level_0,Unnamed: 1_level_0,region_id_3001,region_id_3002,region_id_3003,region_id_3004,region_id_3005,region_id_3006,region_id_3007,region_id_3008,region_id_3009,region_id_3010,...,regobs_snowprofile_t_min_4,regobs_snowprofile_t_min_4,regobs_snowprofile_t_min_4,regobs_snowprofile_t_min_4,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,0,0,0,0,0,0,0,0,0,0,...,4,5,6,7,2,3,4,5,6,7
date,region,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2017-12-07,3003,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-12-07,3007,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-12-07,3009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-12-07,3010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-12-07,3011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-12,3031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-12,3032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-12,3034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-12,3035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# try to groupby hierarchical index
labeled_data.data.groupby(['region']).groups

{3003: MultiIndex([('2017-12-07', 3003),
             ('2017-12-08', 3003),
             ('2017-12-09', 3003),
             ('2017-12-10', 3003),
             ('2017-12-11', 3003),
             ('2017-12-12', 3003),
             ('2017-12-13', 3003),
             ('2017-12-14', 3003),
             ('2017-12-15', 3003),
             ('2017-12-16', 3003),
             ...
             ('2021-01-03', 3003),
             ('2021-01-04', 3003),
             ('2021-01-05', 3003),
             ('2021-01-06', 3003),
             ('2021-01-07', 3003),
             ('2021-01-08', 3003),
             ('2021-01-09', 3003),
             ('2021-01-10', 3003),
             ('2021-01-11', 3003),
             ('2021-01-12', 3003)],
            names=['date', 'region'], length=568),
 3006: MultiIndex([('2018-04-01', 3006),
             ('2020-02-13', 3006),
             ('2020-02-14', 3006),
             ('2020-02-15', 3006),
             ('2020-02-16', 3006),
             ('2020-02-17', 3006),
         

In [18]:
labeled_data.data.groupby(['region']).groups[3003]

MultiIndex([('2017-12-07', 3003),
            ('2017-12-08', 3003),
            ('2017-12-09', 3003),
            ('2017-12-10', 3003),
            ('2017-12-11', 3003),
            ('2017-12-12', 3003),
            ('2017-12-13', 3003),
            ('2017-12-14', 3003),
            ('2017-12-15', 3003),
            ('2017-12-16', 3003),
            ...
            ('2021-01-03', 3003),
            ('2021-01-04', 3003),
            ('2021-01-05', 3003),
            ('2021-01-06', 3003),
            ('2021-01-07', 3003),
            ('2021-01-08', 3003),
            ('2021-01-09', 3003),
            ('2021-01-10', 3003),
            ('2021-01-11', 3003),
            ('2021-01-12', 3003)],
           names=['date', 'region'], length=568)

In [19]:
labeled_data.data.groupby(['region']).groups[3003].get_level_values(0)

Index(['2017-12-07', '2017-12-08', '2017-12-09', '2017-12-10', '2017-12-11',
       '2017-12-12', '2017-12-13', '2017-12-14', '2017-12-15', '2017-12-16',
       ...
       '2021-01-03', '2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07',
       '2021-01-08', '2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12'],
      dtype='object', name='date', length=568)

In [20]:
labeled_data.data.groupby(['region']).groups[3003]

MultiIndex([('2017-12-07', 3003),
            ('2017-12-08', 3003),
            ('2017-12-09', 3003),
            ('2017-12-10', 3003),
            ('2017-12-11', 3003),
            ('2017-12-12', 3003),
            ('2017-12-13', 3003),
            ('2017-12-14', 3003),
            ('2017-12-15', 3003),
            ('2017-12-16', 3003),
            ...
            ('2021-01-03', 3003),
            ('2021-01-04', 3003),
            ('2021-01-05', 3003),
            ('2021-01-06', 3003),
            ('2021-01-07', 3003),
            ('2021-01-08', 3003),
            ('2021-01-09', 3003),
            ('2021-01-10', 3003),
            ('2021-01-11', 3003),
            ('2021-01-12', 3003)],
           names=['date', 'region'], length=568)

In [21]:
labeled_data.data.groupby(['region']).groups[3003].loc['2017-12-07']

AttributeError: 'MultiIndex' object has no attribute 'loc'

**The above seems to be working now, although I don't see any of the other column names. Has the other data now disappeared because I'm not somehow aggregating over the new groups (.sum(), .mean(), etc.)? Also, I can't seem to be able to access this new Multiindex object.**

# Make plots

I may need to normalize the input variables before I make the plots.