In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from avaml import download
from avaml import Error, varsomdata, setenvironment as se, _NONE, CSV_VERSION, REGIONS, merge
from avaml.get_text_data import TextDataset
from avaml.aggregatedata import ForecastDataset, LabeledData, REG_ENG, CsvMissingError

The varsomdata API represents each forecast as a class, `AvalancheWarning`, whose main message is a free-text field. See the source:
https://github.com/NVE/varsomdata/blob/471a0e368e3f69352f27d73cb19ce8efa82d9467/varsomdata/getforecastapi.py

In [2]:
# Configuration for building the labelled text dataset.
model_prefix = ''
days = 1

# Request every regobs type that REG_ENG has a key for.
regobs_types = list(REG_ENG.keys())

# Build the dataset and label it in one chain; `with_varsom=True` is passed
# through to `label` unchanged.
text_data = (
    TextDataset(regobs_types=regobs_types)
    .label(days, with_varsom=True)
)

Fetching online data. (This may take a long time.)
    Getting data for season: 2017-18
    Getting data for season: 2018-19
    Getting data for season: 2019-20
Done!

Creating labeled dataset.
Done!


In [4]:
# Tidy the labelled dataset: name the index levels, flatten the MultiIndex
# columns, make the label columns numeric, and put `region` first in the index.
#
# NOTE: this cell rebinds and mutates `text_data`, so it is not idempotent.
# Running it a second time would re-join already-flattened column names and
# swap the index levels back. Re-run the loading cell before re-running this one.

# first, drop regions
text_data = text_data.drop_regions()

# then, rename indices
text_data.data = text_data.data.rename_axis(['date', 'region'])
text_data.label = text_data.label.rename_axis(['date', 'region'])
text_data.main_text = text_data.main_text.rename_axis(['date', 'region'])

# flatten the hierarchy of columns to 1D: join the levels of each column
# label with underscores
text_data.data.columns = [' '.join(col).strip().replace(' ', '_') for col in text_data.data.columns.values]
text_data.label.columns = [' '.join(col).strip().replace(' ', '_') for col in text_data.label.columns.values]

# replace double underscores (left behind by empty levels) with single underscores
text_data.data.columns = [col.replace('__', '_') for col in text_data.data.columns.values]
text_data.label.columns = [col.replace('__', '_') for col in text_data.label.columns.values]

# convert some columns in labels to type int for averaging
text_data.label['CLASS_problem_amount'] = text_data.label['CLASS_problem_amount'].astype(int)
text_data.label['CLASS_danger_level'] = text_data.label['CLASS_danger_level'].astype(int)

for column in text_data.label.columns:
    if column.endswith(('_dist', '_dsize', '_lev_fill', '_prob', '_trig')):
        text_data.label[column] = text_data.label[column].astype(int)

# below, we can try to make categorical variables in the labels numeric
# first for the emergency warning column
warning_dict = {'Ikke gitt':0,
                'Naturlig utløste skred':1}

text_data.label['CLASS_emergency_warning'] = text_data.label['CLASS_emergency_warning'].replace(warning_dict)

# and now for the class problems
problem1 = list(np.unique(text_data.label.loc[:, 'CLASS_problem_1'].values))
problem2 = list(np.unique(text_data.label.loc[:, 'CLASS_problem_2'].values))
problem3 = list(np.unique(text_data.label.loc[:, 'CLASS_problem_3'].values))

# `list_of_problems` is only for inspection: the mapping below is hard-coded,
# and any value missing from it is left unchanged by `.replace`.
list_of_problems = sorted(list(np.unique(problem1 + problem2 + problem3)))
problems_dict = {'':0, 'drift-slab':1, 'glide':2, 'new-loose':3,
                 'new-slab':4, 'pwl-slab':5, 'wet-loose':6, 'wet-slab':7}

text_data.label['CLASS_problem_1'] = text_data.label['CLASS_problem_1'].replace(problems_dict)
text_data.label['CLASS_problem_2'] = text_data.label['CLASS_problem_2'].replace(problems_dict)
text_data.label['CLASS_problem_3'] = text_data.label['CLASS_problem_3'].replace(problems_dict)

# swap the index levels from (date, region) to (region, date) on all three
# tables. Each table must be reordered from itself: the label was previously
# rebuilt from `text_data.data`, which discarded the cleaned labels.
text_data.data = text_data.data.reorder_levels([1, 0])
text_data.label = text_data.label.reorder_levels([1, 0])
text_data.main_text = text_data.main_text.reorder_levels([1, 0])

In [5]:
# Display the feature table; after the level reordering in the preprocessing
# cell its index is (region, date).
text_data.data

Unnamed: 0_level_0,Unnamed: 1_level_0,danger_level_1,emergency_warning_1,problem_amount_1,problem_new-loose_1,problem_wet-loose_1,problem_new-slab_1,problem_drift-slab_1,problem_pwl-slab_1,problem_wet-slab_1,problem_glide_1,...,wind_chg_dir_W_0,wind_chg_dir_NW_0,wind_chg_start_0_0,wind_chg_start_6_0,wind_chg_start_12_0,wind_chg_start_18_0,temp_fl_start_0_0,temp_fl_start_6_0,temp_fl_start_12_0,temp_fl_start_18_0
region,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3031,2017-11-23,4.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3034,2017-11-23,4.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3003,2017-12-02,1.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3007,2017-12-02,2.0,1.0,2.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3009,2017-12-02,2.0,1.0,2.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3031,2021-01-12,3.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3032,2021-01-12,3.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3034,2021-01-12,3.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3035,2021-01-12,3.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Display the `main_text` table; after the level reordering in the
# preprocessing cell its index is (region, date).
text_data.main_text

Unnamed: 0_level_0,Unnamed: 1_level_0,main_text
region,date,Unnamed: 2_level_1
3031,2017-11-22,Kraftig vindøkning og mye nedbør vil føre til...
3034,2017-11-22,Kraftig vindøkning og mye nedbør vil føre til...
3031,2017-11-23,Kraftig vind og mye nedbør vil føre til stor ...
3034,2017-11-23,Kraftig vind og mye nedbør vil føre til stor ...
3007,2017-11-29,Polart lavtrykk gir kraftig vind og lokalt sto...
...,...,...
3031,2021-01-12,Vinden snur og skred kan løses ut i alle himme...
3032,2021-01-12,Vinden snur og det kan løses ut skred i alle h...
3034,2021-01-12,Vinden snur og skred kan løses ut i alle himme...
3035,2021-01-12,Vinden snur og det kan løses ut skred i alle h...
