# Raw Data visualisation and analysis

This notebook visualises and analyses the raw precipitation datasets (ERA5, GPM-IMERG and HadUK-Grid), which are read from netCDF files.

---

 - Author:          
                    Luis F Patino Velasquez - MA
 - Date:            
                    Jun 2020
 - Version:         
                    1.0
 - Notes:            
                    Files used in this notebook are in netCDF format
 - Jupyter version: 
                    jupyter core     : 4.7.1
                    jupyter-notebook : 6.4.0
                    qtconsole        : 5.1.1
                    ipython          : 7.25.0
                    ipykernel        : 6.0.3
                    jupyter client   : 6.1.12
                    jupyter lab      : 3.0.16
                    nbconvert        : 6.1.0
                    ipywidgets       : 7.6.3
                    nbformat         : 5.1.3
                    traitlets        : 5.0.5
 - Python version:  
                    3.8.5 

---

### Setting Python Modules

In [None]:
# Imports for xclim and xarray
import xclim as xc
import pandas as pd
import numpy as np
import xarray as xr
import functools
# from functools import reduce

# File handling libraries
import time
import tempfile
from pathlib import Path

# Geospatial libraries
import geopandas
import rioxarray
from shapely.geometry import mapping

# import plotting stuff
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.mlab as mlab
import seaborn as sns
# Plot styling: load the author's custom matplotlib style sheet.
# NOTE(review): this path points into one user's home directory, so the notebook
# depends on lfpv.mplstyle being installed there; switch to the default style
# below when it is not available.
# plt.style.use('default')
plt.style.use("~/.local/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/lfpv.mplstyle")

%matplotlib inline
# Default size (inches) and resolution for every figure created in this notebook
plt.rcParams['figure.figsize'] = (15, 11)
plt.rcParams['figure.dpi'] = 50

# Mapping libraries (pyplot is already imported above; the repeated import is harmless)
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

# Folder where exported figures are written (used with saving_image)
fldr_images = Path('/mnt/d/MRes_dataset/Images/Others')

# Visual separator printed between blocks of text output
sep = '-----------\n-----------'
print(sep)

In [None]:
def UK_clip(xarray_dataset, coord_lon_name, coord_lat_name, xarray_dataset_crs,
            boundary_path='/mnt/d/MRes_dataset/active_data/101_admin/uk_admin_boundary_py_nasa_pp_countryOutlineFromGiovanni.shp'):
    """
    Return the input data masked to the UK administrative boundary.

    NOTE: UK_clip is defined again in section 2.1 (Functions); that later
    definition is the one in effect when the analysis cells run.

    :xarray_dataset: xarray object exposing the rioxarray ``.rio`` accessor
    :coord_lon_name: string, name of the x (longitude) dimension
    :coord_lat_name: string, name of the y (latitude) dimension
    :xarray_dataset_crs: CRS of the data in a form accepted by ``rio.write_crs``, e.g. "epsg:4326"
    :boundary_path: path of the vector file holding the UK outline
                    (defaults to the shapefile this notebook has always used)
    :return: xarray object with the boundary mask applied
    """
    # Register the spatial dimensions and the CRS on the data.
    # Both calls are made in place, so the caller's object is modified as well.
    xarray_dataset.rio.set_spatial_dims(x_dim=coord_lon_name, y_dim=coord_lat_name, inplace=True)
    xarray_dataset.rio.write_crs(xarray_dataset_crs, inplace=True)

    # Read the boundary used as mask
    # NOTE(review): the crs keyword is forwarded to the file reader - confirm it is still needed
    uk_admn = geopandas.read_file(boundary_path, crs="epsg:4326")

    # drop=False is passed so that the clip does not crop the grid to the boundary extent
    uk_clipData = xarray_dataset.rio.clip(uk_admn.geometry.apply(mapping), uk_admn.crs, drop=False)

    return uk_clipData

### 1. Reading the raw data

#### 1.1. ERA5

In [None]:
# Directory holding the ERA5 daily precipitation netCDF files
fldr_src = Path('/mnt/d/MRes_dataset/search_data/era_copernicus_uk/')

# Create list with files: the recursive search is materialised as a sorted list of
# regular files, so the load order is reproducible and the list can be inspected
# or reused (a bare glob() generator is consumed after one pass)
fls_lst = sorted(p for p in fldr_src.glob('**/era5_copernicus_DAY_prcp_*') if p.is_file())
if not fls_lst:
    raise FileNotFoundError(f'No ERA5 files found under {fldr_src}')

# Load multiple NetCDFs into a single xarray.Dataset
dataset_ERA = xr.open_mfdataset(paths=fls_lst, combine='by_coords', parallel=True)
dataset_ERA

#### 1.2. GPM-IMERG

In [None]:
# Directory holding the GPM-IMERG netCDF files
fldr_src = Path('/mnt/d/MRes_dataset/search_data/gpm_imerg_nasa_uk/')

# Create list with files: '**/*' also matches sub-directories, which cannot be
# opened as netCDF, so only regular files are kept. Sorting makes the load order
# reproducible and the list reusable (a bare glob() generator is consumed after one pass)
fls_lst = sorted(p for p in fldr_src.glob('**/*') if p.is_file())
if not fls_lst:
    raise FileNotFoundError(f'No GPM-IMERG files found under {fldr_src}')

# Load multiple NetCDFs into a single xarray.Dataset
dataset_GPM = xr.open_mfdataset(paths=fls_lst, combine='by_coords', parallel=True)
dataset_GPM

#### 1.3. HadUK-Grid

In [None]:
# Directory holding the HadUK-Grid netCDF files
fldr_src = Path('/mnt/d/MRes_dataset/search_data/haduk_cedac_uk/')

# Create list with files: '**/*' also matches sub-directories, which cannot be
# opened as netCDF, so only regular files are kept. Sorting makes the load order
# reproducible and the list reusable (a bare glob() generator is consumed after one pass)
fls_lst = sorted(p for p in fldr_src.glob('**/*') if p.is_file())
if not fls_lst:
    raise FileNotFoundError(f'No HadUK-Grid files found under {fldr_src}')

# Load multiple NetCDFs into a single xarray.Dataset
dataset_HAD = xr.open_mfdataset(paths=fls_lst, combine='by_coords', parallel=True)
dataset_HAD

### 2. Data Analysis

#### 2.1. Functions

In [None]:
def UK_clip(xarray_dataset, coord_lon_name, coord_lat_name, xarray_dataset_crs,
            boundary_path='/mnt/d/MRes_dataset/active_data/101_admin/uk_admin_boundary_py_nasa_pp_countryOutlineFromGiovanni.shp'):
    """
    Return xarray with data for the UK only
    :xarray_dataset: xarray object exposing the rioxarray ``.rio`` accessor
    :coord_lon_name: string, name of the x (longitude) dimension
    :coord_lat_name: string, name of the y (latitude) dimension
    :xarray_dataset_crs: CRS of the data in a form accepted by ``rio.write_crs``, e.g. "epsg:4326"
    :boundary_path: path of the vector file holding the UK outline
                    (defaults to the shapefile this notebook has always used)
    :return: xarray object with the boundary mask applied
    """
    # Setting spatial dimension and CRS in nc data.
    # Both calls are made in place, so the caller's object is modified as well.
    xarray_dataset.rio.set_spatial_dims(x_dim=coord_lon_name, y_dim=coord_lat_name, inplace=True)
    xarray_dataset.rio.write_crs(xarray_dataset_crs, inplace=True)

    # Set mask based on boundary
    # NOTE(review): the crs keyword is forwarded to the file reader - confirm it is still needed
    uk_admn = geopandas.read_file(boundary_path, crs="epsg:4326")

    # Data for UK; drop=False is passed so that the clip does not crop the grid to the boundary extent
    uk_clipData = xarray_dataset.rio.clip(uk_admn.geometry.apply(mapping), uk_admn.crs, drop=False)

    return uk_clipData

def plot_setup(subplot_ref, data_source1, data_source2, years=range(2001, 2020)):
    """
    Apply the shared formatting of the yearly violin plots to one subplot
    :subplot_ref: matplotlib Axes to format (modified in place)
    :data_source1: string, reference dataset - currently not used, the first
                   label box always reads 'HadUK-Grid'
    :data_source2: string, 'ERA' adds the ERA5 label box, any other value adds
                   the GPM-IMERG one
    :years: iterable with one tick label per violin (default 2001 to 2019)
    :return: None
    """
    year_labels = list(years)
    # One tick per label at 1..N, which is where violinplot draws its N bodies by
    # default; computed here so the function no longer reads a notebook-global `x`
    tick_positions = list(range(1, len(year_labels) + 1))

    # x-axis ticks and labels
    subplot_ref.set_xticks(tick_positions)
    subplot_ref.set_xticklabels(year_labels)
    subplot_ref.xaxis.set_tick_params(labelsize='x-large')
    subplot_ref.yaxis.set_tick_params(labelsize='x-large')

    # Grid and axis titles. The on/off flag is passed positionally because its
    # keyword was renamed from `b` to `visible` between matplotlib releases.
    subplot_ref.grid(True, which='major', color='grey', linestyle='-', alpha=0.3)
    subplot_ref.set_ylabel('Precipitation (mm)', fontdict={'fontsize': 20, 'fontweight': 'normal'})
    subplot_ref.set_xlabel('Years', fontdict={'fontsize': 20, 'fontweight': 'normal'})

    # Label boxes: HadUK-Grid first, then the dataset it is compared with
    if data_source2 == 'ERA':
        second_label, second_colour = '     ERA5     ', '#377eb8'
    else:
        second_label, second_colour = 'GPM-IMERG', '#4daf4a'

    label_boxes = [(0.95, 'HadUK-Grid', '#a65628'), (0.92, second_label, second_colour)]
    for y_pos, label, colour in label_boxes:
        subplot_ref.text(0.95, y_pos, label, horizontalalignment='center', verticalalignment='top',
                         transform=subplot_ref.transAxes, fontsize='x-large', fontweight='bold',
                         bbox=dict(facecolor='none', edgecolor=colour, boxstyle='round', linewidth=5.0))

def violin_clr(figure, colour):
    """
    Paint every part of a violin plot with a single colour
    :figure: dictionary returned by Axes.violinplot
    :colour: matplotlib colour used for the bodies and for the lines
    """
    # Filled bodies of the violins
    for body in figure['bodies']:
        body.set_facecolor(colour)
    # Line collections: only the ones that were requested when the plot was created
    # are present in the dictionary, so missing parts are skipped instead of raising
    for partname in ('cbars', 'cmins', 'cmaxes', 'cmeans', 'cmedians'):
        part = figure.get(partname)
        if part is None:
            continue
        part.set_edgecolor(colour)
        part.set_linewidth(1)

def saving_image(subplot_ref, fldr_plot, file_name):
    """
    Save the area of the figure occupied by one subplot as an image
    :subplot_ref: matplotlib Axes to export
    :fldr_plot: pathlib folder path (a plain string is accepted as well)
    :file_name: string
    """
    # Use the figure that owns the subplot instead of a notebook-global `fig`,
    # so the function works whichever figure was created last
    figure = subplot_ref.figure
    extent = subplot_ref.get_window_extent().transformed(figure.dpi_scale_trans.inverted())
    # Pad the saved area by 10% in the x-direction and 20% in the y-direction.
    # Only this padded version is written: an un-padded save to the same path
    # would just be overwritten by it.
    figure.savefig(Path(fldr_plot) / file_name, bbox_inches=extent.expanded(1.1, 1.2))

#### 2.2.  Yearly Average Analysis
Here we are plotting the mean yearly value for each of the datasets for the whole UK

In [None]:
# Get annual value from daily data
arr_yearPrcp_ERA = dataset_ERA.groupby('time.year').sum(dim='time')
arr_yearPrcp_GPM = dataset_GPM.groupby('time.year').sum(dim='time')
arr_yearPrcp_HAD = dataset_HAD.groupby('time.year').sum(dim='time')

# only use mainland UK data
arr_yearPrcp_ERAUK = UK_clip(arr_yearPrcp_ERA, 'longitude', 'latitude', "epsg:4326")
arr_yearPrcp_GPMUK = UK_clip(arr_yearPrcp_GPM, 'lon', 'lat', "epsg:4326")

# Convert data to pandas dataframe.
# ERA5 and GPM-IMERG use the UK-clipped arrays: the un-clipped ones were converted
# before, so the clip above had no effect on the yearly means.
df_yearPrcp_ERA = arr_yearPrcp_ERAUK.to_dataframe().reset_index()
df_yearPrcp_GPM = arr_yearPrcp_GPMUK.to_dataframe().reset_index()
df_yearPrcp_HAD = arr_yearPrcp_HAD.to_dataframe().reset_index()

# TODO: add the function that joins the dataframes

# For HADGrid-UK replace zero for NaN to avoid using zero in the mean value
# (np.nan is used because the np.NaN alias no longer exists in NumPy 2)
df_yearPrcp_HAD = df_yearPrcp_HAD.replace(0, np.nan)

# Get the mean yearly value over all grid cells
df_MeanyearPrcp_ERA = df_yearPrcp_ERA.groupby('year', as_index=False)['tp'].mean()
df_MeanyearPrcp_GPM = df_yearPrcp_GPM.groupby('year', as_index=False)['precipitationCal'].mean()
df_MeanyearPrcp_HAD = df_yearPrcp_HAD.groupby('year', as_index=False)['rainfall'].mean()

# create dataframe with mean yearly value: one row per year present in all three datasets
dfs_lst = [df_MeanyearPrcp_ERA, df_MeanyearPrcp_GPM, df_MeanyearPrcp_HAD]
df_final = functools.reduce(lambda left, right: pd.merge(left, right, on='year'), dfs_lst)
df_final

* **Plotting the yearly average for the UK using all datasets**

In [None]:
# Create a renamed copy of the dataframe. rename() without inplace returns a new
# frame, so df_final keeps its original column names and this cell can be re-run.
df_plot = df_final.rename(columns={'tp': 'prcp_ERA5', 'precipitationCal': 'prcp_IMERG',
                                   'rainfall': 'prcp_HadGrid-UK'})

# change year column to date format
df_plot['year'] = pd.to_datetime(df_plot['year'], format='%Y')

# Plot data
ERA = df_plot['prcp_ERA5'].tolist()
GPM = df_plot['prcp_IMERG'].tolist()
HAD = df_plot['prcp_HadGrid-UK'].tolist()
yrs = df_plot['year'].tolist()

# Create plot
fig, axs = plt.subplots(figsize=(15, 11))
axs.plot(yrs, ERA, label='prcp ERA5', marker='D')
axs.plot(yrs, GPM, label='prcp GPM-IMERG', marker='v')
axs.plot(yrs, HAD, label='prcp HadGrid-UK', marker='o')

axs.xaxis.set_tick_params(labelsize='large')
axs.yaxis.set_tick_params(labelsize='large')
# Set title and axis. The grid on/off flag is passed positionally because its
# keyword was renamed from `b` to `visible` between matplotlib releases.
axs.grid(True, which='major', color='grey', linestyle='-', alpha=0.3)
axs.set_ylabel('precipitation (mm)', fontdict={'fontsize': 18, 'fontweight': 'normal'})
axs.set_xlabel('years', fontdict={'fontsize': 18, 'fontweight': 'normal'})
# Set legend
axs.legend(bbox_to_anchor=(0, 1, 1, 0), loc='best', fontsize='large', ncol=3)

plt.show()

* **Creating climatology map for all datasets**

In [None]:
# Dataset to map, together with the coordinate and variable names it uses.
# The names are the ones used for each dataset in the yearly average analysis;
# keeping them next to the dataset stops the two from getting out of step
# (the GPM-IMERG data was being read with the ERA5 names before).
climat_sources = {
    'ERA5': (dataset_ERA, 'latitude', 'longitude', 'tp'),
    'GPM-IMERG': (dataset_GPM, 'lat', 'lon', 'precipitationCal'),
}
climat_name = 'GPM-IMERG'
climat_dataset, lat_name, lon_name, prcp_name = climat_sources[climat_name]

# Sum data by year
year_dataset = climat_dataset.groupby('time.year').sum(dim='time')
# Change to pandas dataframe
df = year_dataset.to_dataframe().reset_index()
# Group by coordinate and average over the years
grouped_df = df.groupby([lat_name, lon_name]).mean()
grouped_df1 = grouped_df.reset_index()
grouped_prcp = grouped_df1.drop(['year'], axis=1)

# Pivot dataframe ready for the plot: rows are latitudes, columns are longitudes
val_pivot_df = grouped_prcp.pivot(index=lat_name, columns=lon_name, values=prcp_name)

# Plot
from mpl_toolkits.axes_grid1 import make_axes_locatable
fig, axs = plt.subplots(figsize=(8, 15))
mm = Basemap(resolution='i', projection='merc', ellps='WGS84', llcrnrlat=49, urcrnrlat=61,
             llcrnrlon=-9, urcrnrlon=2, lat_ts=20, ax=axs)
lons = val_pivot_df.columns.values
lats = val_pivot_df.index.values
data_values = val_pivot_df.values
# Cells without data are masked so that they are left blank on the map
masked_data = np.ma.masked_invalid(data_values)
lon, lat = np.meshgrid(lons, lats)
# Project the geographic coordinates to map coordinates
xi, yi = mm(lon, lat)
cs = mm.pcolor(xi, yi, masked_data, shading='auto')
fig.colorbar(cs, ax=axs, shrink=0.8, pad=0.15, label=f'{climat_name} mean annual precipitation (mm)')
# add shp file as coastline
# mm.readshapefile('/mnt/c/Users/C0060017/Documents/Taught_Material/MRes_Dissertation/Dissertation/MRes_dataset/active_data/101_admin/uk_admin_boundary_py_nasa_pp_countryOutlineFromGiovanni', 'uk_admin_boundary')
# Map properties set up
merid = mm.drawmeridians(
    np.arange(-180, 180, 2),
    labels=[False, False, False, True])
parall = mm.drawparallels(
    np.arange(0, 160),
    labels=[True, True, False, False])

plt.show()


# filterinfDataframe = df[(df['longitude'] == -9.0) & (df['latitude'] == 61.0) ]
# filterinfDataframe


#### 2.3. Data distribution

Here we plot the distribution of the mean daily precipitation across grid cells, first by season (ERA5 only) and then for each year (all datasets) - *the plotted data contain one mean daily value per grid cell for each season or year*

In [None]:
# Seasonal mean of the daily ERA5 values at every grid cell
ERA_season_mean = dataset_ERA.groupby('time.season').mean('time')

# Flatten to a dataframe with one row per season and grid cell
df_era_season = ERA_season_mean.to_dataframe().reset_index()

# One sub-frame per season, in the order used on the x axis
season_labels = ['DJF', 'MAM', 'JJA', 'SON']
test, test2, test3, test4 = [
    df_era_season[(df_era_season["season"] == season)] for season in season_labels
]
test_data = [season_frame['tp'] for season_frame in (test, test2, test3, test4)]
x = [1, 2, 3, 4]

# Number of rows in the seasonal dataframe
print(df_era_season.shape[0])

fig, axes = plt.subplots(figsize=(30, 15))
# One violin per season
axes.violinplot(
    [test['tp'], test2['tp'], test3['tp'], test4['tp']],
    showmeans=True,
    showmedians=False,
    showextrema=True,
    points=10000,
)

# x-axis labels
axes.set_xticks(x)
axes.set_xticklabels(season_labels)

plt.show()



# df = df_era_season.set_index(['season'])
# df
# grouped = df['tp'].groupby(level='season')
# grouped.boxplot(rot=45, fontsize=12, figsize=(8,10))

In [None]:
# Get average value by year (mean of the daily values within each year)
ERA_yearly_mean = dataset_ERA.groupby('time.year').mean('time')
GPM_yearly_mean = dataset_GPM.groupby('time.year').mean('time')
HAD_yearly_mean = dataset_HAD.groupby('time.year').mean('time')

# Change to dataframe
df_era_yearly = ERA_yearly_mean.to_dataframe().reset_index()
df_gpm_yearly = GPM_yearly_mean.to_dataframe().reset_index()
df_had_yearly = HAD_yearly_mean.to_dataframe().reset_index()

# For HadUK NaN values need to be removed
df_had_yearly_final = df_had_yearly.dropna(subset=['rainfall'], how='all')

# Years shown in the plots, defined once so the violins and the axis ticks agree
plot_years = list(range(2001, 2020))

# integer for x axis: one position per plotted year (violins are drawn at 1..N by default)
x = list(range(1, len(plot_years) + 1))

# Create graph datasets: one Series of grid-cell values per year and dataset
dataset_lst_ERA = [df_era_yearly.loc[df_era_yearly["year"] == yr, 'tp'] for yr in plot_years]
dataset_lst_GPM = [df_gpm_yearly.loc[df_gpm_yearly["year"] == yr, 'precipitationCal'] for yr in plot_years]
dataset_lst_HAD = [df_had_yearly_final.loc[df_had_yearly_final["year"] == yr, 'rainfall'] for yr in plot_years]

# Create plots
fig, axs = plt.subplots(2, 1, figsize=(50, 50))

# HadUK-Grid and ERA5
vp_era = axs[0].violinplot(dataset=dataset_lst_ERA, showmeans=True, showmedians=False, showextrema=True)
vp_had = axs[0].violinplot(dataset=dataset_lst_HAD, showmeans=True, showmedians=False, showextrema=True)
plot_setup(axs[0], 'HAD', 'ERA')
# change colour of violin to match other graphs
violin_clr(vp_had, '#a65628')
violin_clr(vp_era, '#377eb8')
# # saving image
# file_name = 'HADUK-ERA5_Year_Mean_Daily_Distribution.png'
# saving_image(axs[0], fldr_images, file_name)

# HadUK-Grid and GPM-IMERG
vp_gpm = axs[1].violinplot(dataset=dataset_lst_GPM, showmeans=True, showmedians=False, showextrema=True)
vp_had = axs[1].violinplot(dataset=dataset_lst_HAD, showmeans=True, showmedians=False, showextrema=True)

plot_setup(axs[1], 'HAD', 'GPM-IMERG')
# change colour of violin to match other graphs
violin_clr(vp_had, '#a65628')
violin_clr(vp_gpm, '#4daf4a')
# # saving image
# file_name = 'HADUK-GPM-IMERG_Year_Mean_Daily_Distribution.png'
# saving_image(axs[1], fldr_images, file_name)

# Make sure it shows a nice layout avoiding overlapping. This has to happen
# before the figure is shown: once plt.show() has rendered it, a later
# tight_layout() no longer changes what is displayed.
fig.tight_layout()
plt.show()

#### 2.3.1. Descriptive statistics

Here we build one table per dataset with the descriptive statistics (count, mean, standard deviation, minimum, quartiles and maximum) of the grid-cell values for each year.

In [None]:
# Create dataframes using the data for each year - these data were used in the violin plots.
# Convert to pandas dataframe: one row per year, one column per grid-cell value
ERA = pd.DataFrame(list(map(np.ravel, dataset_lst_ERA)))
GPM = pd.DataFrame(list(map(np.ravel, dataset_lst_GPM)))
HAD = pd.DataFrame(list(map(np.ravel, dataset_lst_HAD)))

# Get descriptive statistics for each year (row) and all datasets
ERA_stats = ERA.apply(pd.Series.describe, axis=1)
GPM_stats = GPM.apply(pd.Series.describe, axis=1)
HAD_stats = HAD.apply(pd.Series.describe, axis=1)

dfs = [ERA_stats, GPM_stats, HAD_stats]

# Label every statistics table with the year each row refers to, as first column.
# NOTE: the loop variable `df` replaces the `df` dataframe created in the climatology cell.
for df in dfs:
    # Add years as column; the rows follow the 2001-2019 order in which the
    # yearly lists were built for the violin plots
    df['years'] = [*range(2001,2020,1)]
    # Take the 'years' column out of the table (it was appended as the last column)
    first_column = df.pop('years')
    # ... and put it back at position 0 using insert(position, column_name, values)
    df.insert(0, 'years', first_column)