In [1]:
# installing Python libraries for dealing with NetCDF files
!pip install netCDF4
!pip install xarray



In [2]:
# import of standard Python libraries for data analysis
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# import of libraries for handling netCDF files and file paths
import netCDF4 as nc
from netCDF4 import Dataset
import glob
import xarray as xr
import pathlib

In [3]:
# show every column when a dataframe is displayed (no truncation of wide frames)
pd.options.display.max_columns = None

In [4]:
# build the list of all files where snow data are stored
# The pattern is assembled with pathlib so it does not depend on Windows-style
# separators, and the matches are sorted because glob returns them in arbitrary order.
all_snow_files = sorted(glob.glob(str(pathlib.Path("snow_vars") / "*PRO*.nc")))

In [5]:
# display the list of the files to confirm which yearly files were found
all_snow_files

['.\\snow_vars\\PRO_2010080106_2011080106.nc',
 '.\\snow_vars\\PRO_2011080106_2012080106.nc',
 '.\\snow_vars\\PRO_2012080106_2013080106.nc',
 '.\\snow_vars\\PRO_2013080106_2014080106.nc',
 '.\\snow_vars\\PRO_2014080106_2015080106.nc',
 '.\\snow_vars\\PRO_2015080106_2016080106.nc',
 '.\\snow_vars\\PRO_2016080106_2017080106.nc',
 '.\\snow_vars\\PRO_2017080106_2018080106.nc',
 '.\\snow_vars\\PRO_2018080106_2019080106.nc']

In [6]:
# store path to the folder with the nc files in a variable
# A plain relative folder name is used: backslash separators are only understood
# on Windows, while pathlib inserts the right separator on every platform.
snow_files_path = pathlib.Path("snow_vars")

In [7]:
# display the variable holding the path to the folder with the nc files
snow_files_path

WindowsPath('snow_vars')

In [8]:
# create variable with the paths (as strings) of all nc files
# Path.glob() yields matches in arbitrary order; the file names start with the
# start date of the period, so sorting them also puts the years in chronological order.
all_snow_file_paths = sorted(str(snow_file_path) for snow_file_path in snow_files_path.glob("*PRO*.nc"))
all_snow_file_paths

['snow_vars\\PRO_2010080106_2011080106.nc',
 'snow_vars\\PRO_2011080106_2012080106.nc',
 'snow_vars\\PRO_2012080106_2013080106.nc',
 'snow_vars\\PRO_2013080106_2014080106.nc',
 'snow_vars\\PRO_2014080106_2015080106.nc',
 'snow_vars\\PRO_2015080106_2016080106.nc',
 'snow_vars\\PRO_2016080106_2017080106.nc',
 'snow_vars\\PRO_2017080106_2018080106.nc',
 'snow_vars\\PRO_2018080106_2019080106.nc']

In [9]:
# open the first nc file with netCDF4 so that its metadata can be inspected below
# NOTE(review): no matching snow_dataset.close() appears in this part of the notebook
snow_dataset = nc.Dataset(all_snow_file_paths[0])

In [10]:
# display general info about first nc dataset (attributes such as title and summary)
snow_dataset.__dict__

{'title': 'S2M reanalysis: snow variables',
 'summary': 'This file takes part from a 60-years reanalysis of meteorological and snow conditions in the French Alps, Pyrenees and Corsica from 1958 to 2018. The simulations are performed over relatively homogeneous units designed to represent the main drivers of the spatial variability observed in mountain ranges (elevation, slope and aspect). The meteorological reanalysis is performed by the SAFRAN system, which adjusts a guess from a Numerical Weather Prediction model (ERA-40 reanalysis from 1958 to 2002, ARPEGE from 2002 to 2018) with the best possible set of available in-situ meteorological observations. SAFRAN outputs are used to force the Crocus detailed snowpack model within the land surface scheme SURFEX/ISBA. This provides the evolution of the snowpack and the associated avalanche hazard accounting for the main physical processes involved in a multilayer snowpack. This yearly file provides the snowpack properties of the Crocus mode

In [11]:
# checking variables in first nc dataset (type, dimensions and attributes of each one)
snow_dataset.variables

{'Projection_Type': <class 'netCDF4._netCDF4.Variable'>
 int32 Projection_Type()
     grid_mapping_name: latitude_longitude
     earth_radius: 6371229.0
 unlimited dimensions: 
 current shape = ()
 filling on, default _FillValue of -2147483647 used,
 'ZS': <class 'netCDF4._netCDF4.Variable'>
 float64 ZS(Number_of_points)
     _FillValue: 1e+20
     long_name: elevation
     units: 
     standard_name: surface_altitude
 unlimited dimensions: 
 current shape = (263,)
 filling on,
 'aspect': <class 'netCDF4._netCDF4.Variable'>
 float64 aspect(Number_of_points)
     _FillValue: 1e+20
     long_name: aspect
     units: 
 unlimited dimensions: 
 current shape = (263,)
 filling on,
 'slope': <class 'netCDF4._netCDF4.Variable'>
 float64 slope(Number_of_points)
     _FillValue: 1e+20
     long_name: slope
     units: 
 unlimited dimensions: 
 current shape = (263,)
 filling on,
 'massif_num': <class 'netCDF4._netCDF4.Variable'>
 int32 massif_num(Number_of_points)
     _FillValue: 1000000000
   

In [12]:
# for loop to create final dataframe with data from all years (2010-2019)
if not all_snow_file_paths:
    # without this check pd.concat fails with "No objects to concatenate",
    # which does not point to the real cause
    raise FileNotFoundError("all_snow_file_paths is empty: no NetCDF files to load")

snow_frame = []

for snow_f_path in all_snow_file_paths:
    # the context manager closes the NetCDF file as soon as its content
    # has been converted into an in-memory dataframe
    with xr.open_dataset(snow_f_path) as ds_snow:
        snow_frame.append(ds_snow.to_dataframe())

# one frame per yearly file, stacked on top of each other
snow_final = pd.concat(snow_frame)
snow_final

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Projection_Type,ZS,aspect,slope,massif_num,longitude,latitude,TG1,TG4,WG1,WGI1,NAT_LEV,AVA_TYP,TALB_ISBA,RN_ISBA,H_ISBA,LE_ISBA,GFLUX_ISBA,EVAP_ISBA,SWD_ISBA,SWU_ISBA,LWD_ISBA,LWU_ISBA,DRAIN_ISBA,RUNOFF_ISBA,SNOMLT_ISBA,RAINF_ISBA,TS_ISBA,WSN_T_ISBA,DSN_T_ISBA,SD_1DY_ISBA,SD_3DY_ISBA,SD_5DY_ISBA,SD_7DY_ISBA,SWE_1DY_ISBA,SWE_3DY_ISBA,SWE_5DY_ISBA,SWE_7DY_ISBA,RAMSOND_ISBA,WET_TH_ISBA,REFRZTH_ISBA,DEP_HIG,DEP_MOD,ACC_LEV
Number_of_Patches,Number_of_points,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
0,0,2010-08-02 06:00:00,-2147483647,300.0,-1.0,0.0,1.0,6.64493,46.17685,290.529120,291.307455,0.300823,0.0,6.0,6.0,0.2,101.399256,7.965518,78.190870,15.242868,0.000031,227.403107,45.480621,348.411993,428.935223,0.000014,0.000006,0.0,0.000047,290.529120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.0
0,0,2010-08-03 06:00:00,-2147483647,300.0,-1.0,0.0,1.0,6.64493,46.17685,289.453549,290.374879,0.321829,0.0,6.0,6.0,0.2,39.763221,7.802390,41.476192,-9.515362,0.000048,100.790266,20.158053,372.711457,413.580448,0.000014,0.000023,0.0,0.000161,289.453549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.0
0,0,2010-08-04 06:00:00,-2147483647,300.0,-1.0,0.0,1.0,6.64493,46.17685,287.871498,289.817265,0.294828,0.0,6.0,6.0,0.2,91.666793,13.500873,75.886092,2.279828,0.000078,216.241260,43.248252,338.344360,419.670574,0.000014,0.000000,0.0,0.000000,287.871498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.0
0,0,2010-08-05 06:00:00,-2147483647,300.0,-1.0,0.0,1.0,6.64493,46.17685,288.033414,289.782613,0.327341,0.0,6.0,6.0,0.2,83.623522,10.688451,73.006614,-0.071543,0.000107,207.482878,41.496576,332.339757,414.702537,0.000014,0.000011,0.0,0.000073,288.033414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.0
0,0,2010-08-06 06:00:00,-2147483647,300.0,-1.0,0.0,1.0,6.64493,46.17685,286.046128,287.305681,0.328986,0.0,6.0,6.0,0.2,10.569247,5.531032,40.679780,-35.641566,0.000124,87.137610,17.427522,329.720714,388.861555,0.000014,0.000040,0.0,0.000222,286.046128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.0
0,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,262,2019-07-28 06:00:00,-2147483647,3000.0,-1.0,0.0,30.0,7.30250,46.39000,277.699656,279.459104,0.347131,0.0,6.0,6.0,0.2,60.952727,44.978618,39.765394,-23.791284,0.003366,121.981213,24.396243,317.183510,353.815753,0.000073,0.000031,0.0,0.000141,277.699656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.0
0,262,2019-07-29 06:00:00,-2147483647,3000.0,-1.0,0.0,30.0,7.30250,46.39000,277.287258,277.232167,0.324825,0.0,6.0,6.0,0.2,13.271584,20.579807,29.995796,-37.304019,0.003378,48.173300,9.634660,309.879349,335.146405,0.000071,0.000021,0.0,0.000093,277.287258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.0
0,262,2019-07-30 06:00:00,-2147483647,3000.0,-1.0,0.0,30.0,7.30250,46.39000,279.483238,279.907868,0.312860,0.0,6.0,6.0,0.2,141.494131,44.852793,70.593926,26.047412,0.003406,252.734246,50.546849,302.462307,363.155572,0.000068,0.000000,0.0,0.000002,279.483238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.0
0,262,2019-07-31 06:00:00,-2147483647,3000.0,-1.0,0.0,30.0,7.30250,46.39000,277.792313,279.773442,0.305518,0.0,6.0,6.0,0.2,183.244835,88.619458,79.311102,15.314275,0.003438,290.894194,58.178839,323.500583,372.971103,0.000066,0.000000,0.0,0.000000,277.792313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.0


In [13]:
# checking names of the variables (columns) in the combined dataframe
snow_final.columns

Index(['Projection_Type', 'ZS', 'aspect', 'slope', 'massif_num', 'longitude',
       'latitude', 'TG1', 'TG4', 'WG1', 'WGI1', 'NAT_LEV', 'AVA_TYP',
       'TALB_ISBA', 'RN_ISBA', 'H_ISBA', 'LE_ISBA', 'GFLUX_ISBA', 'EVAP_ISBA',
       'SWD_ISBA', 'SWU_ISBA', 'LWD_ISBA', 'LWU_ISBA', 'DRAIN_ISBA',
       'RUNOFF_ISBA', 'SNOMLT_ISBA', 'RAINF_ISBA', 'TS_ISBA', 'WSN_T_ISBA',
       'DSN_T_ISBA', 'SD_1DY_ISBA', 'SD_3DY_ISBA', 'SD_5DY_ISBA',
       'SD_7DY_ISBA', 'SWE_1DY_ISBA', 'SWE_3DY_ISBA', 'SWE_5DY_ISBA',
       'SWE_7DY_ISBA', 'RAMSOND_ISBA', 'WET_TH_ISBA', 'REFRZTH_ISBA',
       'DEP_HIG', 'DEP_MOD', 'ACC_LEV'],
      dtype='object')

In [14]:
# removing variables that are not important for avalanche prediction (at least in my opinion)
columns_to_discard = [
    'Projection_Type', 'aspect', 'slope', 'H_ISBA', 'LE_ISBA', 'GFLUX_ISBA', 'EVAP_ISBA',
    'SWD_ISBA', 'SWU_ISBA', 'LWD_ISBA', 'LWU_ISBA', 'DEP_HIG', 'DEP_MOD',
]
snow_final = snow_final.drop(columns=columns_to_discard)

In [15]:
# checking the variables that I decided to keep
snow_final.columns

Index(['ZS', 'massif_num', 'longitude', 'latitude', 'TG1', 'TG4', 'WG1',
       'WGI1', 'NAT_LEV', 'AVA_TYP', 'TALB_ISBA', 'RN_ISBA', 'DRAIN_ISBA',
       'RUNOFF_ISBA', 'SNOMLT_ISBA', 'RAINF_ISBA', 'TS_ISBA', 'WSN_T_ISBA',
       'DSN_T_ISBA', 'SD_1DY_ISBA', 'SD_3DY_ISBA', 'SD_5DY_ISBA',
       'SD_7DY_ISBA', 'SWE_1DY_ISBA', 'SWE_3DY_ISBA', 'SWE_5DY_ISBA',
       'SWE_7DY_ISBA', 'RAMSOND_ISBA', 'WET_TH_ISBA', 'REFRZTH_ISBA',
       'ACC_LEV'],
      dtype='object')

In [16]:
# renaming columns to clearer names
# An explicit old -> new mapping is used instead of a positional list of names,
# so a different column order in the source files cannot silently mislabel the data.
# NOTE(review): "acccidental_risk_index" (three c's) is kept as-is because the name
# ends up in the header of the exported csv; fix the spelling together with its consumers.
snow_column_names = {
    "ZS": "elevation",
    "massif_num": "massif_num",
    "longitude": "lon",
    "latitude": "lat",
    "TG1": "temp_soil_0.005_m",
    "TG4": "temp_soil_0.08_m",
    "WG1": "liquid_water_in_soil",
    "WGI1": "frozen_water_in_soil",
    "NAT_LEV": "risk_index",
    "AVA_TYP": "aval_type",
    "TALB_ISBA": "whiteness_albedo",
    "RN_ISBA": "net_radiation",
    "DRAIN_ISBA": "drainage",
    "RUNOFF_ISBA": "runoff",
    "SNOMLT_ISBA": "snow_melting_rate",
    "RAINF_ISBA": "rainfall_rate",
    "TS_ISBA": "surface_temperature",
    "WSN_T_ISBA": "surface_snow_amount",
    "DSN_T_ISBA": "thickness_of_snowfall",
    "SD_1DY_ISBA": "snow_thickness_1D",
    "SD_3DY_ISBA": "snow_thickness_3D",
    "SD_5DY_ISBA": "snow_thickness_5D",
    "SD_7DY_ISBA": "snow_thickness_7D",
    "SWE_1DY_ISBA": "snow_water_1D",
    "SWE_3DY_ISBA": "snow_water_3D",
    "SWE_5DY_ISBA": "snow_water_5D",
    "SWE_7DY_ISBA": "snow_water_7D",
    "RAMSOND_ISBA": "penetration_ram_resistance",
    "WET_TH_ISBA": "thickness_of_wet_snow_top_of_snowpack",
    "REFRZTH_ISBA": "thickness_of_frozen_snow_top_of_snowpack",
    "ACC_LEV": "acccidental_risk_index",
}
snow_final = snow_final.rename(columns=snow_column_names)

# rename() silently skips keys that are absent, so verify the result explicitly
missing_columns = set(snow_column_names.values()) - set(snow_final.columns)
if missing_columns:
    raise KeyError(f"columns missing after renaming: {sorted(missing_columns)}")

In [17]:
# display general info about dataset (index, columns, non-null counts and dtypes)
snow_final.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 864481 entries, (0, 0, Timestamp('2010-08-02 06:00:00')) to (0, 262, Timestamp('2019-08-01 06:00:00'))
Data columns (total 31 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   elevation                                 864481 non-null  float64
 1   massif_num                                864481 non-null  float64
 2   lon                                       864481 non-null  float64
 3   lat                                       864481 non-null  float64
 4   temp_soil_0.005_m                         864481 non-null  float64
 5   temp_soil_0.08_m                          864481 non-null  float64
 6   liquid_water_in_soil                      864481 non-null  float64
 7   frozen_water_in_soil                      864481 non-null  float64
 8   risk_index                                864481 non-null  float64
 9   aval_type        

In [18]:
# size of dataset as (rows, columns)
snow_final.shape

(864481, 31)

In [19]:
# checking which massif numbers occur in the dataset
snow_final["massif_num"].unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 30.])

In [20]:
# massif num 30 seemed odd because there should be just 23 massifs in the Alps
is_massif_30 = snow_final["massif_num"] == 30.0
snow_final[is_massif_30]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,elevation,massif_num,lon,lat,temp_soil_0.005_m,temp_soil_0.08_m,liquid_water_in_soil,frozen_water_in_soil,risk_index,aval_type,whiteness_albedo,net_radiation,drainage,runoff,snow_melting_rate,rainfall_rate,surface_temperature,surface_snow_amount,thickness_of_snowfall,snow_thickness_1D,snow_thickness_3D,snow_thickness_5D,snow_thickness_7D,snow_water_1D,snow_water_3D,snow_water_5D,snow_water_7D,penetration_ram_resistance,thickness_of_wet_snow_top_of_snowpack,thickness_of_frozen_snow_top_of_snowpack,acccidental_risk_index
Number_of_Patches,Number_of_points,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
0,253,2010-08-02 06:00:00,300.0,30.0,7.3025,46.39,290.366625,290.960193,0.324170,0.0,6.0,6.0,0.2,80.405456,0.000015,0.000015,0.0,0.000099,290.366625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
0,253,2010-08-03 06:00:00,300.0,30.0,7.3025,46.39,289.450542,290.372597,0.310142,0.0,6.0,6.0,0.2,15.361571,0.000015,0.000015,0.0,0.000087,289.450542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
0,253,2010-08-04 06:00:00,300.0,30.0,7.3025,46.39,288.189642,289.549208,0.301689,0.0,6.0,6.0,0.2,14.628353,0.000015,0.000000,0.0,0.000001,288.189642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
0,253,2010-08-05 06:00:00,300.0,30.0,7.3025,46.39,287.724849,289.112728,0.322362,0.0,6.0,6.0,0.2,27.349739,0.000015,0.000008,0.0,0.000049,287.724849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
0,253,2010-08-06 06:00:00,300.0,30.0,7.3025,46.39,286.255982,287.263701,0.328037,0.0,6.0,6.0,0.2,-10.399188,0.000015,0.000045,0.0,0.000217,286.255982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
0,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,262,2019-07-28 06:00:00,3000.0,30.0,7.3025,46.39,277.699656,279.459104,0.347131,0.0,6.0,6.0,0.2,60.952727,0.000073,0.000031,0.0,0.000141,277.699656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
0,262,2019-07-29 06:00:00,3000.0,30.0,7.3025,46.39,277.287258,277.232167,0.324825,0.0,6.0,6.0,0.2,13.271584,0.000071,0.000021,0.0,0.000093,277.287258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
0,262,2019-07-30 06:00:00,3000.0,30.0,7.3025,46.39,279.483238,279.907868,0.312860,0.0,6.0,6.0,0.2,141.494131,0.000068,0.000000,0.0,0.000002,279.483238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
0,262,2019-07-31 06:00:00,3000.0,30.0,7.3025,46.39,277.792313,279.773442,0.305518,0.0,6.0,6.0,0.2,183.244835,0.000066,0.000000,0.0,0.000000,277.792313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [21]:
# massif number 30 is most likely a mistake, because its latitude and longitude are outside France
# I decided to remove massif 30 from the dataset
below_massif_30 = snow_final["massif_num"] < 30.0
snow_final = snow_final[below_massif_30]
snow_final["massif_num"].nunique()

23

In [22]:
# checking which elevation levels occur in the dataset
snow_final["elevation"].unique()

array([ 300.,  600.,  900., 1200., 1500., 1800., 2100., 2400., 2700.,
       3000., 3300., 3600., 3900., 4200., 4500., 4800.,    0.])

In [23]:
# display index of dataset to see which index levels it has
snow_final.index

MultiIndex([(0,   0, '2010-08-02 06:00:00'),
            (0,   0, '2010-08-03 06:00:00'),
            (0,   0, '2010-08-04 06:00:00'),
            (0,   0, '2010-08-05 06:00:00'),
            (0,   0, '2010-08-06 06:00:00'),
            (0,   0, '2010-08-07 06:00:00'),
            (0,   0, '2010-08-08 06:00:00'),
            (0,   0, '2010-08-09 06:00:00'),
            (0,   0, '2010-08-10 06:00:00'),
            (0,   0, '2010-08-11 06:00:00'),
            ...
            (0, 252, '2019-07-23 06:00:00'),
            (0, 252, '2019-07-24 06:00:00'),
            (0, 252, '2019-07-25 06:00:00'),
            (0, 252, '2019-07-26 06:00:00'),
            (0, 252, '2019-07-27 06:00:00'),
            (0, 252, '2019-07-28 06:00:00'),
            (0, 252, '2019-07-29 06:00:00'),
            (0, 252, '2019-07-30 06:00:00'),
            (0, 252, '2019-07-31 06:00:00'),
            (0, 252, '2019-08-01 06:00:00')],
           names=['Number_of_Patches', 'Number_of_points', 'time'], length=831611)

In [24]:
# Number of patches and Number of points are not interesting for my analysis,
# therefore the first two index levels are removed and only the remaining one is kept
time_only_index = snow_final.index.droplevel(level=[0, 1])
snow_final.index = time_only_index

In [25]:
# checking size of dataset after dropping the index levels
snow_final.shape

(831611, 31)

In [26]:
# verifying changes to index of dataset by looking at the first rows
snow_final.head()

Unnamed: 0_level_0,elevation,massif_num,lon,lat,temp_soil_0.005_m,temp_soil_0.08_m,liquid_water_in_soil,frozen_water_in_soil,risk_index,aval_type,whiteness_albedo,net_radiation,drainage,runoff,snow_melting_rate,rainfall_rate,surface_temperature,surface_snow_amount,thickness_of_snowfall,snow_thickness_1D,snow_thickness_3D,snow_thickness_5D,snow_thickness_7D,snow_water_1D,snow_water_3D,snow_water_5D,snow_water_7D,penetration_ram_resistance,thickness_of_wet_snow_top_of_snowpack,thickness_of_frozen_snow_top_of_snowpack,acccidental_risk_index
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2010-08-02 06:00:00,300.0,1.0,6.64493,46.17685,290.52912,291.307455,0.300823,0.0,6.0,6.0,0.2,101.399256,1.4e-05,6e-06,0.0,4.7e-05,290.52912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2010-08-03 06:00:00,300.0,1.0,6.64493,46.17685,289.453549,290.374879,0.321829,0.0,6.0,6.0,0.2,39.763221,1.4e-05,2.3e-05,0.0,0.000161,289.453549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2010-08-04 06:00:00,300.0,1.0,6.64493,46.17685,287.871498,289.817265,0.294828,0.0,6.0,6.0,0.2,91.666793,1.4e-05,0.0,0.0,0.0,287.871498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2010-08-05 06:00:00,300.0,1.0,6.64493,46.17685,288.033414,289.782613,0.327341,0.0,6.0,6.0,0.2,83.623522,1.4e-05,1.1e-05,0.0,7.3e-05,288.033414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2010-08-06 06:00:00,300.0,1.0,6.64493,46.17685,286.046128,287.305681,0.328986,0.0,6.0,6.0,0.2,10.569247,1.4e-05,4e-05,0.0,0.000222,286.046128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [27]:
# I decided to analyze only data from October 2010 onwards,
# therefore data before October 2010 are removed.
# The comparison uses midnight of 1 October as an inclusive lower bound,
# so no timestamp from September can slip through.
analysis_start = pd.Timestamp("2010-10-01")
snow_final = snow_final[snow_final.index >= analysis_start]

In [28]:
# checking size of dataset after removing the data before October 2010
snow_final.shape

(816431, 31)

In [29]:
# viewing general info (index range, non-null counts, dtypes) about the filtered dataset
snow_final.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 816431 entries, 2010-10-01 06:00:00 to 2019-08-01 06:00:00
Data columns (total 31 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   elevation                                 816431 non-null  float64
 1   massif_num                                816431 non-null  float64
 2   lon                                       816431 non-null  float64
 3   lat                                       816431 non-null  float64
 4   temp_soil_0.005_m                         816431 non-null  float64
 5   temp_soil_0.08_m                          816431 non-null  float64
 6   liquid_water_in_soil                      816431 non-null  float64
 7   frozen_water_in_soil                      816431 non-null  float64
 8   risk_index                                816431 non-null  float64
 9   aval_type                                 816431 non-null 

In [30]:
# checking the number of distinct elevation levels
snow_final["elevation"].nunique()

17

In [31]:
# first glance at the distribution of elevation values (rows per elevation level)
snow_final["elevation"].value_counts()

1800.0    74221
1500.0    74221
2100.0    74221
900.0     74221
1200.0    74221
2400.0    70994
3000.0    64540
2700.0    64540
600.0     61313
3300.0    51632
300.0     45178
3600.0    29043
0.0       22589
3900.0    19362
4200.0     9681
4500.0     3227
4800.0     3227
Name: elevation, dtype: int64

In [32]:
# resetting the index of the dataframe because time should be a column, not an index
snow_final = snow_final.reset_index(drop=False)
snow_final

Unnamed: 0,time,elevation,massif_num,lon,lat,temp_soil_0.005_m,temp_soil_0.08_m,liquid_water_in_soil,frozen_water_in_soil,risk_index,aval_type,whiteness_albedo,net_radiation,drainage,runoff,snow_melting_rate,rainfall_rate,surface_temperature,surface_snow_amount,thickness_of_snowfall,snow_thickness_1D,snow_thickness_3D,snow_thickness_5D,snow_thickness_7D,snow_water_1D,snow_water_3D,snow_water_5D,snow_water_7D,penetration_ram_resistance,thickness_of_wet_snow_top_of_snowpack,thickness_of_frozen_snow_top_of_snowpack,acccidental_risk_index
0,2010-10-01 06:00:00,300.0,1.0,6.64493,46.17685,282.465062,283.764230,0.307530,0.0,6.0,6.0,0.2,-12.281614,0.000010,9.169673e-06,0.000000,0.000058,282.465062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1,2010-10-02 06:00:00,300.0,1.0,6.64493,46.17685,282.951518,284.366491,0.296670,0.0,6.0,6.0,0.2,54.306786,0.000010,1.627019e-07,0.000000,0.000001,282.951518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,2010-10-03 06:00:00,300.0,1.0,6.64493,46.17685,282.337074,284.298989,0.292069,0.0,6.0,6.0,0.2,33.969771,0.000010,0.000000e+00,0.000000,0.000000,282.337074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
3,2010-10-04 06:00:00,300.0,1.0,6.64493,46.17685,283.759313,285.011972,0.287222,0.0,6.0,6.0,0.2,64.971723,0.000010,0.000000e+00,0.000000,0.000000,283.759313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
4,2010-10-05 06:00:00,300.0,1.0,6.64493,46.17685,285.355731,286.147787,0.303026,0.0,6.0,6.0,0.2,10.364757,0.000010,3.799026e-06,0.000000,0.000028,285.355731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816426,2019-07-28 06:00:00,3300.0,23.0,7.31586,44.12649,275.347518,277.206484,0.351028,0.0,6.0,6.0,0.2,110.571972,0.000081,1.023487e-04,0.000212,0.000264,275.347518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
816427,2019-07-29 06:00:00,3300.0,23.0,7.31586,44.12649,279.126466,279.156961,0.322740,0.0,6.0,6.0,0.2,110.320142,0.000078,0.000000e+00,0.000000,0.000003,279.126466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
816428,2019-07-30 06:00:00,3300.0,23.0,7.31586,44.12649,279.914350,279.818644,0.312964,0.0,6.0,6.0,0.2,163.534030,0.000075,0.000000e+00,0.000000,0.000000,279.914350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
816429,2019-07-31 06:00:00,3300.0,23.0,7.31586,44.12649,279.045489,279.909958,0.304790,0.0,6.0,6.0,0.2,223.477537,0.000072,0.000000e+00,0.000000,0.000000,279.045489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [33]:
# checking data types of variables in dataset after resetting the index
snow_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816431 entries, 0 to 816430
Data columns (total 32 columns):
 #   Column                                    Non-Null Count   Dtype         
---  ------                                    --------------   -----         
 0   time                                      816431 non-null  datetime64[ns]
 1   elevation                                 816431 non-null  float64       
 2   massif_num                                816431 non-null  float64       
 3   lon                                       816431 non-null  float64       
 4   lat                                       816431 non-null  float64       
 5   temp_soil_0.005_m                         816431 non-null  float64       
 6   temp_soil_0.08_m                          816431 non-null  float64       
 7   liquid_water_in_soil                      816431 non-null  float64       
 8   frozen_water_in_soil                      816431 non-null  float64       
 9   risk_index     

In [34]:
# keep only the calendar day of each timestamp (the hour part is dropped) in a new column
snow_final["day"] = snow_final["time"].dt.date

In [35]:
# verifying the changes by looking at the first rows
snow_final.head()

Unnamed: 0,time,elevation,massif_num,lon,lat,temp_soil_0.005_m,temp_soil_0.08_m,liquid_water_in_soil,frozen_water_in_soil,risk_index,aval_type,whiteness_albedo,net_radiation,drainage,runoff,snow_melting_rate,rainfall_rate,surface_temperature,surface_snow_amount,thickness_of_snowfall,snow_thickness_1D,snow_thickness_3D,snow_thickness_5D,snow_thickness_7D,snow_water_1D,snow_water_3D,snow_water_5D,snow_water_7D,penetration_ram_resistance,thickness_of_wet_snow_top_of_snowpack,thickness_of_frozen_snow_top_of_snowpack,acccidental_risk_index,day
0,2010-10-01 06:00:00,300.0,1.0,6.64493,46.17685,282.465062,283.76423,0.30753,0.0,6.0,6.0,0.2,-12.281614,1e-05,9.169673e-06,0.0,5.8e-05,282.465062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-01
1,2010-10-02 06:00:00,300.0,1.0,6.64493,46.17685,282.951518,284.366491,0.29667,0.0,6.0,6.0,0.2,54.306786,1e-05,1.627019e-07,0.0,1e-06,282.951518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-02
2,2010-10-03 06:00:00,300.0,1.0,6.64493,46.17685,282.337074,284.298989,0.292069,0.0,6.0,6.0,0.2,33.969771,1e-05,0.0,0.0,0.0,282.337074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-03
3,2010-10-04 06:00:00,300.0,1.0,6.64493,46.17685,283.759313,285.011972,0.287222,0.0,6.0,6.0,0.2,64.971723,1e-05,0.0,0.0,0.0,283.759313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-04
4,2010-10-05 06:00:00,300.0,1.0,6.64493,46.17685,285.355731,286.147787,0.303026,0.0,6.0,6.0,0.2,10.364757,1e-05,3.799026e-06,0.0,2.8e-05,285.355731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-05


In [36]:
# the time column is now redundant (day holds the date), so remove it and check the result
snow_final = snow_final.drop('time', axis=1)
snow_final.head()

Unnamed: 0,elevation,massif_num,lon,lat,temp_soil_0.005_m,temp_soil_0.08_m,liquid_water_in_soil,frozen_water_in_soil,risk_index,aval_type,whiteness_albedo,net_radiation,drainage,runoff,snow_melting_rate,rainfall_rate,surface_temperature,surface_snow_amount,thickness_of_snowfall,snow_thickness_1D,snow_thickness_3D,snow_thickness_5D,snow_thickness_7D,snow_water_1D,snow_water_3D,snow_water_5D,snow_water_7D,penetration_ram_resistance,thickness_of_wet_snow_top_of_snowpack,thickness_of_frozen_snow_top_of_snowpack,acccidental_risk_index,day
0,300.0,1.0,6.64493,46.17685,282.465062,283.76423,0.30753,0.0,6.0,6.0,0.2,-12.281614,1e-05,9.169673e-06,0.0,5.8e-05,282.465062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-01
1,300.0,1.0,6.64493,46.17685,282.951518,284.366491,0.29667,0.0,6.0,6.0,0.2,54.306786,1e-05,1.627019e-07,0.0,1e-06,282.951518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-02
2,300.0,1.0,6.64493,46.17685,282.337074,284.298989,0.292069,0.0,6.0,6.0,0.2,33.969771,1e-05,0.0,0.0,0.0,282.337074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-03
3,300.0,1.0,6.64493,46.17685,283.759313,285.011972,0.287222,0.0,6.0,6.0,0.2,64.971723,1e-05,0.0,0.0,0.0,283.759313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-04
4,300.0,1.0,6.64493,46.17685,285.355731,286.147787,0.303026,0.0,6.0,6.0,0.2,10.364757,1e-05,3.799026e-06,0.0,2.8e-05,285.355731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010-10-05


In [37]:
# saving dataframe with the snow variables to a new csv file for further use
# to_csv() returns None when it is given a path, so its result must not be assigned
# back to snow_final (that would replace the dataframe with None).
# A plain relative file name writes to the working directory on every platform.
snow_final.to_csv("snow_final.csv", index=False)