# Exploring the TCLV dataset

This notebook gives a cursory inspection of the TCLV data supplied by the Department of Environment and Science (DES) as part of the Severe Wind Hazard Assessment (SWHA) for Queensland project. 

The TCLV data has been extracted from a suite of 11 general circulation models, representing two representative concentration pathways (RCPs - RCP8.5 and RCP4.5; [ref]). There is an additional member of this suite, which is based on a dynamical downscaling of the ERA Interim reanalysis ([ref]). 

The TCLV data are provided in a common file format (described below), which includes location, date, time and intensity information. We can use this data as input for a TC hazard assessment for future climate scenarios. 

Firstly though, we want to understand what the TCLV data is suggesting about future tropical cyclone behaviour in the Queensland region. We do this by exploring the relative frequency and intensity distributions of the TCLV sets, split into separate periods (1981-2010 and 2081-2100). While the SWHA project will produce hazard information for 20-year periods centred on 2030, 2050, 2070 and 2090, we will here only look at the changes that are apparent by the end of the century.

We start this by importing the required libraries for the analysis. 

In [None]:
%matplotlib inline

import os
from os.path import join as pjoin
from matplotlib import pyplot as plt

import re
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import cartopy.crs as ccrs
import statsmodels.api as sm

from sklearn.cluster import KMeans

sns.set_context("poster")

Define a number of functions to manipulate the data

In [None]:
def load_track_file(filename):
    """
    Load a TCLV file into a :class:`pandas.DataFrame`, and add fields
    representing the age of each TCLV in hours and the pressure difference.

    The file is read as space-delimited text without a header row. The
    `year`, `month`, `day` and `hour` columns are kept, and are also joined
    (space separated) and parsed into a leading `datetime` column.

    :param str filename: Path to a TCLV data file

    :returns: :class:`pandas.DataFrame` holding the columns of the file plus
              `datetime`, `dt` (seconds since the previous record of the
              same TCLV; NaN for the first record), `age` (hours since the
              first record of the TCLV) and `pdiff` (`poci` minus `pmin`).
    """
    # This assumes the format of the TCLV files is identical
    columns = ['num', 'year', 'month', 'day', 'hour', 'lon', 'lat',
               'pmin', 'vorticity', 'vmax', 'tanomsum','tanomdiff',
               'pmslanom', 'poci', 'reff','ravg','asym']
    # The hour field is read as text so that it reaches the date parser
    # exactly as written in the file.
    df = pd.read_csv(filename, delimiter=' ', skipinitialspace=True,
                     names=columns,
                     dtype={'year': int, 'month': int, 'day': int,
                            'hour': str})

    # Build the timestamp explicitly rather than through the nested
    # `parse_dates` / `keep_date_col` options of `read_csv`, which are
    # deprecated in recent pandas releases. The parsed string is the same
    # space-joined "year month day hour" text.
    stamps = (df['year'].astype(str) + ' ' + df['month'].astype(str) + ' ' +
              df['day'].astype(str) + ' ' + df['hour'])
    df.insert(0, 'datetime', pd.to_datetime(stamps))

    # Time step within each TCLV. The vectorised groupby methods keep the
    # index of `df`, so the result can be assigned straight back as a column.
    df['dt'] = df.groupby('num')['datetime'].diff().dt.total_seconds()

    # Accumulate the time steps per TCLV; the first record has no previous
    # record (NaN step), so its age is set to zero.
    df['age'] = df.groupby('num')['dt'].cumsum().fillna(0) / 3600.
    # Throw in the pressure deficit for good measure:
    df['pdiff'] = df['poci'] - df['pmin']
    return df

def filter_tracks(df, start_year=1980, end_year=2010, zeta=0, age=36):
    """
    Keep only the TCLVs that satisfy a time period, a minimum lifetime and
    a vorticity threshold. Records are grouped by `num` and each group is
    kept or dropped as a whole.

    :param df: :class:`pandas.DataFrame` that holds the TCLV data
    :param int start_year: Starting year of the time period to filter
    :param int end_year: End year of the period to filter
    :param float zeta: Vorticity threshold to filter the TCLV data.
                       This can be a positive value, as the absolute value
                       of the minimum vorticity of each TCLV is compared
                       against it.
    :param int age: Minimum age of the TCLVs in hours

    :returns: :class:`pandas.DataFrame` containing the records of the
              TCLVs that pass all four tests.
    """
    def keep_track(track):
        # All records of the TCLV must lie within [start_year, end_year]
        years = track['datetime'].dt.year
        starts_in_period = years.min() >= start_year
        ends_in_period = years.max() <= end_year
        # The TCLV must reach the minimum age at some point
        old_enough = track['age'].max() >= age
        # Magnitude of the minimum vorticity must exceed the threshold
        strong_enough = np.abs(track['vorticity'].min()) > zeta
        return starts_in_period & ends_in_period & old_enough & strong_enough

    return df.groupby('num').filter(keep_track)

def calculate_frequency(df, start_year, end_year):
    """
    Calculate the average frequency of TCLVs between `start_year` and `end_year`

    The tracks are first restricted to the period with `filter_tracks`
    (using its default vorticity and age thresholds). The number of distinct
    TCLVs is then counted for each value of `year`, and those annual counts
    are averaged. Only years that appear in the filtered data contribute to
    the average.

    :param df: :class:`pandas.DataFrame` of TCLV data
    :param int start_year: Start year of the time period to calculate the frequency
    :param int end_year: End year of the time period to calculate the frequency.

    :returns: Mean frequency of TCLV occurrence
    """
    period_tracks = filter_tracks(df, start_year, end_year)
    # One row per year, with the number of unique TCLV ids in that year
    counts_by_year = period_tracks.groupby('year')['num'].nunique().reset_index()
    column_means = counts_by_year.mean()
    return column_means['num']

def quantiles(df, varname='pmin' ):
    """
    Calculate quantiles for a given variable in the dataframe

    :param df: :class:`pandas.DataFrame` holding the variable
    :param str varname: Name of the column to evaluate

    :returns: :class:`numpy.ndarray` of the 100 quantiles at levels
              0.00, 0.01, ..., 0.99. An array of 100 zeros is returned
              when `df` has no rows.
    """
    # Nothing to evaluate - hand back a placeholder of the same length
    if not len(df):
        return np.zeros(100)

    levels = np.arange(0, 1, 0.01)
    return np.quantile(df[varname], levels)

In [None]:
def plotsummary(df, title, filename=None):
    """
    Draw a four-panel summary figure for a set of TCLV records.

    The panels are: a histogram of the minimum `pmin` of each TCLV, a
    histogram of the maximum `vmax` of each TCLV (both with dashed lines at
    the 10th, 25th, 50th, 75th and 90th percentiles), the number of unique
    TCLVs per year with a regression line and the mean annual count, and a
    kernel density estimate of all record positions over a coastline map.

    :param df: :class:`pandas.DataFrame` of TCLV data
    :param str title: Title placed above the four panels
    :param str filename: Optional path; when given, the figure is saved there
    """
    marker_levels = [.1, .25, .5, .75, 0.9]

    fig = plt.figure(figsize=(10,10))
    fig.suptitle(title)

    # Top left: lifetime minimum pressure of each TCLV
    pressure_ax = plt.subplot(2,2,1)
    lowest_pressure = df.loc[df.groupby("num")["pmin"].idxmin()]
    sns.histplot(lowest_pressure['pmin'], ax=pressure_ax)
    for level in lowest_pressure['pmin'].quantile(marker_levels).values:
        pressure_ax.axvline(level,  ls='--',color='k')
    pressure_ax.set_xlabel("Minimum central pressure (hPa)")

    # Top right: lifetime maximum wind speed of each TCLV
    wind_ax = plt.subplot(2,2,2)
    highest_wind = df.loc[df.groupby("num")["vmax"].idxmax()]
    sns.histplot(highest_wind['vmax'], ax=wind_ax)
    for level in highest_wind['vmax'].quantile(marker_levels).values:
        wind_ax.axvline(level,  ls='--',color='k')
    wind_ax.set_xlabel('Wind speed (m/s)')

    # Bottom left: annual count of unique TCLVs, trend and mean
    count_ax = plt.subplot(2,2,3)
    annual_count = df.groupby('year').num.nunique().reset_index()
    sns.regplot(x='year',y='num',data=annual_count, ax=count_ax)
    mean_count = annual_count.mean()['num']
    count_ax.axhline(mean_count, ls='--')

    # Bottom right: spatial density of all records on a map
    map_ax = plt.subplot(2,2,4,projection=ccrs.PlateCarree())
    map_ax.coastlines()
    sns.kdeplot(x=df.lon,y=df.lat, ax=map_ax)
    map_ax.set_xlim((90,180))
    map_ax.set_ylim((-40, 0))

    plt.tight_layout()
    if filename:
        plt.savefig(filename, bbox_inches='tight')

In [None]:
# Load the ERA Interim based track file on its own. The model name and
# RCP label are pulled out of the file name with the same pattern that is
# used for the other track files.
path = "C:/WorkSpace/data/tclv/tracks/"
regex = r'all_tracks_(.+)_(rcp\d+)\.dat'
f = "all_tracks_ERAIntQ_rcp85.dat"
m = re.match(regex, f)
model, rcp = m.groups()
filename = os.path.join(path, f)
df = load_track_file(filename)


In [None]:
# Summary figure for the ERA Interim tracks, filtered with a zero vorticity
# threshold and a 36 hour minimum age.
era_tracks = filter_tracks(df, zeta=0, age=36)
plotsummary(era_tracks,
            title="ERA Interim",
            filename="C:/WorkSpace/data/tclv/figures/ERAInt.png")

In [None]:
# Load every model track file in the directory into a dictionary keyed by
# "<model> <RCP>". The ERA Interim file is handled separately and skipped.
path = "C:/WorkSpace/data/tclv/tracks/"
regex = r'all_tracks_(.+)_(rcp\d+)\.dat'
data = {}
# Sorted so that the models are always processed (and later plotted) in
# the same order, whatever order the file system returns them in.
files = sorted(f for f in os.listdir(path) if os.path.isfile(pjoin(path, f)))
for f in files:
    if f=="all_tracks_ERAIntQ_rcp85.dat":
        continue
    if f.endswith(".png"):
        continue

    m = re.match(regex, f)
    if m is None:
        # Not a track file: without this check the `m.group` call below
        # fails with an AttributeError on the first stray file.
        print("Skipping {0}: file name does not match the track file pattern".format(f))
        continue

    print(f)
    model, rcp = m.group(1, 2)
    filename = pjoin(path, f)
    df = load_track_file(filename)
    label = "{0} {1}".format(model, rcp.upper())
    data[label]=df
    


In [None]:
# Mean annual TCLV frequency of every simulation for the current
# (1980-2010) and future (2080-2100) periods. The rows are collected in a
# list and turned into a DataFrame in one step: `DataFrame.append` is not
# available in current pandas, and building the frame at once gives the
# frequency columns a numeric dtype.
freq_records = []
for m, df in data.items():
    current_freq = calculate_frequency(df, 1980, 2010)
    future_freq = calculate_frequency(df, 2080, 2100)
    model, rcp = m.split(' ')
    freq_records.append({'model':model, 'RCP':rcp, 'current_freq':current_freq,
                         'future_freq':future_freq})

    print("{0}: current: {1:.2f} | future: {2:.2f}".format(m, current_freq, future_freq))

freqdf = pd.DataFrame(freq_records,
                      columns=['model', 'RCP', 'current_freq', 'future_freq'])

# Split the simulations into two groups based on the pair of frequencies
kmeans = KMeans(n_clusters=2, random_state=0).fit(freqdf[['current_freq', 'future_freq']].values)
freqdf['cluster'] = kmeans.labels_


# Mean current/future frequency of each group ("group 1" is cluster label 1,
# "group 2" is cluster label 0)
g1cf = freqdf[freqdf['cluster']==1]['current_freq'].mean()
g1ff = freqdf[freqdf['cluster']==1]['future_freq'].mean()

g2cf = freqdf[freqdf['cluster']==0]['current_freq'].mean()
g2ff = freqdf[freqdf['cluster']==0]['future_freq'].mean()

In [None]:
# Current versus projected frequency for every simulation, with the two
# group means overlaid and a dashed 1-to-1 line for reference.
fig, ax = plt.subplots(figsize=(10,8))
# x and y are passed by keyword (recent seaborn releases do not accept them
# positionally) and the plot is bound to `ax` explicitly.
sns.scatterplot(x='current_freq', y='future_freq', data=freqdf,
                hue='model', style='RCP',
                palette=sns.color_palette("Paired",11), ax=ax)
ax.set_xlabel("Current frequency (TCLVs/year)")
ax.set_ylabel("Projected frequency (TCLVs/year)")
ax.set_xlim((0,25))
ax.set_ylim((0,25))
ax.plot(np.arange(0,25), np.arange(0,25), 'r--')
ax.scatter(g1cf, g1ff, label="Group 1", color='k', marker='*')
ax.scatter(g2cf, g2ff, label="Group 2", color='k', marker='+')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig("C:/WorkSpace/data/tclv/figures/frequency_projection.png", bbox_inches='tight')

There are two clusters of models here, one that starts around 20 TCs/year in current climate, and another that is around 12-14 TCs/year. Both clusters show (on average) a decline in TC frequency. Two models indicate a (small) increase in TC frequency, and two no change. 

Interestingly, there's no apparent relationship with the RCP (4.5 is shown with dots, 8.5 is with crosses). Maybe there's a tendency for a greater reduction in frequency under RCP 8.5, but it's unlikely to be statistically significant (more so because the number of simulations is too low to give this any significance).

In [None]:
# Simulations assigned to cluster label 0
freqdf.loc[freqdf['cluster'] == 0, ['model', 'RCP', 'current_freq', 'future_freq']]

In [None]:
# Simulations assigned to cluster label 1
freqdf.loc[freqdf['cluster'] == 1, ['model', 'RCP', 'current_freq', 'future_freq']]

The mean change in frequency of each cluster is shown below. For both clusters, the RCP 8.5 simulations project a greater reduction in TC frequency compared to the RCP 4.5.  

In [None]:
# For every (cluster, RCP) combination: one minus the mean ratio of future
# to current frequency, i.e. a positive value is a fractional reduction.
(
    freqdf
    .groupby(['cluster', 'RCP'])
    .apply(lambda grp: 1.0 - (grp['future_freq'] / grp['current_freq']).mean())
)

In [None]:
# Print the member simulations of every (cluster, RCP) combination
summary_columns = ['model', 'RCP', 'current_freq', 'future_freq']
clusters = freqdf.groupby(['cluster', 'RCP'])[summary_columns]
for key, item in clusters:
    members = clusters.get_group(key)
    print(key, "\n")
    print(members, "\n\n")

In [None]:
# Quantiles of minimum pressure for the current (1980-2010) and future
# (2080-2100) periods of every simulation, in long form: one row per
# simulation and quantile level.
# One frame is built per simulation and all are joined at the end;
# `DataFrame.append` is not available in current pandas, and appending row
# by row copied the whole table on every call.
quantile_levels = np.arange(0,1,0.01)
quantile_frames = []
for m, df in data.items():
    print(m)
    cq = quantiles(filter_tracks(df, 1980, 2010))
    fq = quantiles(filter_tracks(df, 2080, 2100))
    model, rcp = m.split(' ')
    quantile_frames.append(pd.DataFrame({'model':model, 'RCP':rcp,
                                         'q':quantile_levels,
                                         'cq':cq,
                                         'fq':fq}))

if quantile_frames:
    qdf = pd.concat(quantile_frames, ignore_index=True)
else:
    # No simulations loaded: keep the empty table with the expected columns
    qdf = pd.DataFrame(columns=['model', 'RCP', 'q', 'cq', 'fq'])


In [None]:
# Quantile-quantile plot of minimum pressure (current versus future) for
# the RCP85 simulations, with a dashed 1-to-1 reference line.
# x and y are passed by keyword: recent seaborn releases do not accept
# them as positional arguments.
g = sns.lmplot(x='cq', y='fq', data=qdf[qdf['RCP']=='RCP85'], hue='model',
               palette=sns.color_palette("Paired",11),
               fit_reg=False, scatter_kws={'s':5})

g.set_xlabels("Current climate")
g.set_ylabels("Future climate")
g.ax.set_xlim((940, 1010))
g.ax.set_ylim((940,1010))
# Draw the reference line on the facet's own axes rather than on whichever
# axes happen to be current.
g.ax.plot(np.arange(900,1010), np.arange(900,1010),c='r',ls='--',zorder=0)

A quantile-quantile plot of the minimum pressures for current and future climates suggests there may be a change in the distribution. The lowest quantiles (i.e. lower pressure values) appear to tend below the 1-to-1 line, indicating that the most intense TCLVs in the future are more intense than those in the current climate simulation.

Perhaps the same analysis for maximum winds will be more revealing.

In [None]:
# Quantiles of maximum wind speed for the current (1980-2010) and future
# (2080-2100) periods of every simulation, in long form: one row per
# simulation and quantile level.
# One frame is built per simulation and all are joined at the end;
# `DataFrame.append` is not available in current pandas, and appending row
# by row copied the whole table on every call.
wind_levels = np.arange(0,1,0.01)
wind_frames = []
for m, df in data.items():
    print(m)
    cq = quantiles(filter_tracks(df, 1980, 2010), 'vmax')
    fq = quantiles(filter_tracks(df, 2080, 2100), 'vmax')
    model, rcp = m.split(' ')
    wind_frames.append(pd.DataFrame({'model':model, 'RCP':rcp,
                                     'q':wind_levels,
                                     'cq':cq,
                                     'fq':fq}))

if wind_frames:
    vdf = pd.concat(wind_frames, ignore_index=True)
else:
    # No simulations loaded: keep the empty table with the expected columns
    vdf = pd.DataFrame(columns=['model', 'RCP', 'q', 'cq', 'fq'])

In [None]:
# Quantile-quantile plot of maximum wind speed (current versus future) for
# the RCP45 simulations, with a dashed 1-to-1 reference line.
# x and y are passed by keyword: recent seaborn releases do not accept
# them as positional arguments.
g = sns.lmplot(x='cq', y='fq', data=vdf[vdf['RCP']=='RCP45'], hue='model',
               palette=sns.color_palette("Paired",11),
               fit_reg=False, scatter_kws={'s':5})

g.ax.set_xlabel("Current climate")
g.ax.set_ylabel("Future climate")
g.ax.set_xlim((0, 100))
g.ax.set_ylim((0,100))
g.ax.plot(np.arange(0,100), np.arange(0,100), 'r--',zorder=0)

And it is. The figure is flipped here, but there is a clear indication that the distribution of the maximum wind speed of TCLVs in the future climate simulations is more intense than the current climate. Nearly all the models sit to the left of the 1-to-1 line, with a greater shift at higher wind speeds. 

This qualitatively matches the generalised statement about increasing proportion of the most intense cyclones. These QQ plots say nothing about the frequency (see above), and hence the overall likelihood of extreme TCs. 

In [None]:
# Pairwise relationships between the columns of the wind speed quantile
# table, coloured by RCP
g = sns.pairplot(data=vdf, hue='RCP')

In [None]:
# One summary figure per simulation for the 1981-2010 period
for m, df in data.items():
    figure_path = pjoin("C:/WorkSpace/data/tclv/figures/20190919/",
                        "{0}_1981_2010.png".format(m.replace(' ','_')))
    figure_title = "{0} ({1} - {2})".format(m, 1981, 2010)
    period_tracks = filter_tracks(df, start_year=1981, end_year=2010,
                                  zeta=0, age=36)
    plotsummary(period_tracks, title=figure_title, filename=figure_path)

In [None]:
# One summary figure per simulation for the 2081-2100 period
for m, df in data.items():
    figure_path = pjoin("C:/WorkSpace/data/tclv/figures/20190919/",
                        "{0}_2081_2100.png".format(m.replace(' ','_')))
    figure_title = "{0} ({1} - {2})".format(m, 2081, 2100)
    period_tracks = filter_tracks(df, start_year=2081, end_year=2100,
                                  zeta=0, age=36)
    plotsummary(period_tracks, title=figure_title, filename=figure_path)

In [None]:
# Ordinary least squares fit of the annual number of unique TCLVs against
# the year (with an intercept term).
# NOTE(review): `df` is not assigned in this cell - it is whatever an
# earlier cell left bound (e.g. the last simulation of a preceding loop),
# and it is not filtered here. Confirm this is the intended dataset.
annual_count = df.groupby('year')['num'].nunique().reset_index()
y = annual_count['num']
X = sm.add_constant(annual_count['year'])
results = sm.OLS(y, X).fit()
print(results.summary())

In [None]:
# Fitted coefficients of the OLS model above (the constant and the `year` term)
results.params