In [None]:
import gnssmapper as gm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import interpolate

In [None]:
# Survey sites and the height value assigned to each one (both lists share the same order).
names = ['ucl', 'goodchild', 'hermitage']
heights = [46, 47, 34]

# Read every site's CSV (first column becomes the index) and tag its rows with
# the site name and height, so the frames can be stacked and still told apart.
samples_list = []
for site, site_height in zip(names, heights):
    site_samples = pd.read_csv('../data/' + site + '_exact_samples.csv', index_col=0)
    site_samples['location'] = site
    site_samples['height'] = site_height
    samples_list.append(site_samples)

samples = pd.concat(samples_list, axis=0)


In [None]:
# Largest I_inf value recorded at each location.
samples.groupby('location').I_inf.max()

In [None]:
# Kernel density estimate of the I_inf column, drawn on a log-scaled x axis
# limited to 100 .. 10**6, then written to the figures folder.
density_ax = sns.kdeplot(samples['I_inf'])
density_ax.set_xscale('log')
density_ax.set_xlim(100, 10**6)
density_ax.set_xlabel('Sample Size')
plt.savefig('../figures/sample_size_distribution.png')


In [None]:
# Summary statistics for the combined samples frame.
samples.describe()


In [None]:
# Reshape to long form: the estimate columns, height and location stay as
# identifiers; every other column name goes to 'metric' and its value to 'size'.
id_columns = ['est_lb', 'est', 'est_ub', 'height', 'location']
samples_long = samples.melt(id_vars=id_columns, var_name="metric", value_name='size')
# Absolute difference between the estimate and the height assigned to the site.
samples_long['error'] = (samples_long['est'] - samples_long['height']).abs()
samples_long

Intersection metrics and window metrics have an inverse relationship.

In [None]:
# Scatter I_4 against the reciprocal of W_100 and save the figure.
i4_values = samples['I_4']
inverse_w100 = 1 / samples['W_100']
plt.scatter(i4_values, inverse_w100)
plt.xlabel("I_4 metric")
plt.ylabel("1/W_100 metric")
plt.savefig('../figures/metrics_inverse_relationship.png')


In [None]:
# Scatter I_4 against the reciprocal of W_1.
plt.scatter(samples.I_4,1/samples.W_1)
plt.xlabel("I_4 metric")
# The plotted y values are 1/W_1, so the label names the reciprocal, not the raw metric.
plt.ylabel("1/W_1 metric")

In [None]:
# Correlation heatmap of columns 0-5 as they are, together with the reciprocals
# of columns 6-9. Columns are picked by position — presumably the I_* and W_*
# metrics respectively; verify against the CSV column order.
direct_columns = samples.iloc[:, :6]
inverted_columns = 1 / samples.iloc[:, 6:10]
metric_corr = pd.concat((direct_columns, inverted_columns), axis=1).corr()
sns.heatmap(metric_corr)
plt.savefig('../figures/metrics_correlation.png')

W_100000 behaves quite differently at the data sizes we are working with, because of the level of `inf` observations.

In [None]:
# Root-mean-square error of the I_* metrics for every (location, metric, size) combination.
i_metric_rows = samples_long[samples_long['metric'].str.startswith('I')]
i_error_groups = i_metric_rows.groupby(['location', 'metric', 'size'], as_index=False)['error']
I = i_error_groups.agg(lambda errs: (np.sum(errs**2) / len(errs))**0.5)

In [None]:
# Fit a cubic least-squares spline to error vs. size for each (location, metric)
# group of I and store the fitted values back on I as 'smooth'.
smoothed=[]
for _,g in I.groupby(['location','metric']):
    # Interior knots are fixed at 10, 100 and 1000 for every group.
    b=interpolate.LSQUnivariateSpline(g['size'], g['error'],k=3,t=[10,100,1000])
    # nu=0 evaluates the spline itself (no derivative) at the group's own sizes.
    smoothed.extend(b(g['size'],nu=0))
# Assigned positionally: relies on the groups being yielded in the same row order
# as I itself (I comes from a groupby on location, metric, size — presumably
# already sorted that way; confirm if I is ever re-ordered).
I['smooth']=smoothed

In [None]:
# LOWESS-smooth the raw (un-aggregated) errors against size for each
# (location, metric) group of I_* rows, then keep one fitted value per distinct
# size so the result can be assigned positionally onto the rows of I.
# NOTE(review): this assumes lowess returns one row per input row in ascending
# size order; statsmodels' default handling of missing values may drop rows,
# which would shift the indices — confirm all sizes/errors are finite.
i_rows = samples_long[samples_long['metric'].str.startswith('I')]
lowess_values = []
for _, group in i_rows.groupby(['location', 'metric']):
    ordered = group.sort_values('size')
    sizes = ordered['size'].values
    fitted = sm.nonparametric.lowess(ordered['error'].values, sizes, frac=0.25)
    # Index of the first occurrence of every distinct size in the sorted group.
    _, first_idx = np.unique(sizes, return_index=True)
    lowess_values.extend(fitted[first_idx, 1])
I['lowess'] = np.array(lowess_values)

In [None]:
# I_long=I.melt(id_vars=['location','metric','size'],value_vars=['smooth','lowess'],var_name="smoother",value_name="smoothed_error")
# I_long

In [None]:
# g=sns.FacetGrid(I_long,col='metric',row='smoother')
# g.map_dataframe(sns.lineplot,x='size',y='smoothed_error',hue='location',legend='full')
# g.add_legend()
# plt.xscale('log')
# plt.ylim(0,20)

In [None]:
# One panel per I_* metric (wrapped three per row): the LOWESS curve per location
# with the per-size RMSE points overlaid, on a log x axis, saved to disk.
metric_panels = ['I_1', 'I_2', 'I_4', 'I_8', 'I_16', 'I_inf']
i_grid = sns.FacetGrid(I, col='metric', col_wrap=3, col_order=metric_panels)
shared_kwargs = dict(x='size', hue='location', legend='auto')
i_grid.map_dataframe(sns.lineplot, y='lowess', size=4, **shared_kwargs)
i_grid.map_dataframe(sns.scatterplot, y='error', alpha=0.1, size=0.1, **shared_kwargs)
i_grid.add_legend()
plt.xscale('log')
plt.ylim(0, 30)
plt.savefig('../figures/rmse_heights.png')

In [None]:
# Window metrics are analysed through their reciprocal: 'size' becomes 1/W
# before the RMSE is computed for every (location, metric, size) combination.
is_w_metric = samples_long['metric'].str.startswith('W')
W_ = samples_long[is_w_metric].assign(size=lambda rows: 1 / rows['size'])
W = (
    W_.groupby(['location', 'metric', 'size'], as_index=False)['error']
    .agg(lambda errs: (np.sum(errs**2) / len(errs))**0.5)
)

In [None]:
# Per-metric scatter of RMSE against 1/W for each location, on log-log axes.
w_grid = sns.FacetGrid(W, col='metric')
w_grid.map_dataframe(sns.scatterplot, x='size', y='error', hue='location', legend='full')
w_grid.add_legend()
current_ax = plt.gca()
current_ax.set_xscale('log')
current_ax.set_ylim(1, 1000)
current_ax.set_yscale('log')

In [None]:
# Number of distinct (metric, location, size) combinations present in W.
W[['metric','location','size']].value_counts().count()

In [None]:
# Show the per-(location, metric, size) RMSE table for the window metrics.
W

Dropping 

In [None]:
# Fit a cubic least-squares spline (single interior knot at the mean of the
# valid sizes) to error vs. size for each (location, metric) group of W, and
# store the fitted values on W as 'smooth'.
df=[]
for _,g in W.groupby(['location','metric']):
    a=g.sort_values('size')
    # masked_invalid hides NaN/inf so they do not affect the mean used as the knot.
    t=np.ma.masked_invalid(a['size'].values).mean()
    try:
        b=interpolate.LSQUnivariateSpline(a['size'], a['error'],k=3,t=[t])
        new_error=b(a['size'],nu=0)
        df.extend(new_error)
    except Exception:
        # Best-effort fallback when the spline cannot be fitted: use the group's
        # mean valid error for every row. Catching Exception instead of a bare
        # except lets KeyboardInterrupt/SystemExit propagate.
        df.extend(
            np.ones_like(a['size'].values) *
            np.ma.masked_invalid(a['error'].values).mean()
            )
W['smooth']=df

In [None]:
# LOWESS-smooth the raw window-metric errors against 1/W for each
# (location, metric) group, keeping one fitted value per distinct size so the
# result can be assigned positionally onto the rows of W.
# NOTE(review): this assumes lowess returns one row per input row in ascending
# size order; statsmodels' default handling of missing values may drop rows,
# which would shift the indices — confirm all sizes/errors are finite.
lowess_values = []
for _, group in W_.groupby(['location', 'metric']):
    ordered = group.sort_values('size')
    sizes = ordered['size'].values
    fitted = sm.nonparametric.lowess(ordered['error'].values, sizes, frac=0.5)
    # Index of the first occurrence of every distinct size in the sorted group.
    _, first_idx = np.unique(sizes, return_index=True)
    lowess_values.extend(fitted[first_idx, 1])
W['lowess'] = np.array(lowess_values)

In [None]:
# Show W again, now including the 'smooth' and 'lowess' columns.
W

In [None]:
# Stack the two smoothed columns so each row holds one smoother's value for a
# (location, metric, size) combination.
key_columns = ['location', 'metric', 'size']
smoother_columns = ['smooth', 'lowess']
W_long = W.melt(
    id_vars=key_columns,
    value_vars=smoother_columns,
    var_name="smoother",
    value_name="smoothed_error",
)
W_long

In [None]:
# Grid of line plots: one column per metric, one row per smoother, one line per location.
w_long_grid = sns.FacetGrid(W_long, col='metric', row='smoother')
w_long_grid.map_dataframe(
    sns.lineplot, x='size', y='smoothed_error', hue='location', legend='full'
)
w_long_grid.add_legend()
current_ax = plt.gca()
current_ax.set_xscale('log')
current_ax.set_ylim(0, 20)

In [None]:
# J_k is the ratio of each I metric to the previous one in the sequence
# 1, 2, 4, 8, 16, inf; the denominator is floored at 1 to avoid dividing by zero.
J = samples.copy()
for lower, upper in [('1', '2'), ('2', '4'), ('4', '8'), ('8', '16'), ('16', 'inf')]:
    J['J_' + upper] = J['I_' + upper] / np.maximum(J['I_' + lower], 1)

# Long form with the absolute estimate error, restricted to the J_* ratio rows.
J_all_long = J.melt(
    id_vars=['est_lb', 'est', 'est_ub', 'height', 'location'],
    var_name="metric",
    value_name='size',
)
J_all_long['error'] = (J_all_long['est'] - J_all_long['height']).abs()
J_long = J_all_long[J_all_long['metric'].str.startswith('J')]

In [None]:
# Summary statistics of the J ratio values.
J_long['size'].describe()

In [None]:
# Histogram of the J ratio values with bin edges at the integers 0 to 19.
plt.hist(J_long['size'],bins=range(20))

In [None]:
# Root-mean-square error of the J_* ratios for every (location, metric, size) combination.
R = (
    J_long.groupby(['location', 'metric', 'size'], as_index=False)['error']
    .agg(lambda errs: (np.sum(errs**2) / len(errs))**0.5)
)
R

In [None]:
# LOWESS-smooth the raw J-ratio errors against the ratio value for each
# (location, metric) group, keeping one fitted value per distinct size so the
# result can be assigned positionally onto the rows of R.
# NOTE(review): this assumes lowess returns one row per input row in ascending
# size order; statsmodels' default handling of missing values may drop rows,
# which would shift the indices — confirm all sizes/errors are finite.
lowess_values = []
for _, group in J_long.groupby(['location', 'metric']):
    ordered = group.sort_values('size')
    sizes = ordered['size'].values
    fitted = sm.nonparametric.lowess(ordered['error'].values, sizes, frac=0.45)
    # Index of the first occurrence of every distinct size in the sorted group.
    _, first_idx = np.unique(sizes, return_index=True)
    lowess_values.extend(fitted[first_idx, 1])
R['lowess'] = np.array(lowess_values)
R

In [None]:
# One panel per J_* ratio (wrapped three per row): the LOWESS curve per location
# with the per-size RMSE points overlaid, on log-log axes, saved to disk.
ratio_panels = ['J_2', 'J_4', 'J_8', 'J_16', 'J_inf']
r_grid = sns.FacetGrid(R, col='metric', col_wrap=3, col_order=ratio_panels)
shared_kwargs = dict(x='size', hue='location', legend='auto')
r_grid.map_dataframe(sns.lineplot, y='lowess', size=4, **shared_kwargs)
r_grid.map_dataframe(sns.scatterplot, y='error', alpha=0.1, size=0.1, **shared_kwargs)
r_grid.add_legend()
plt.xscale('log')
plt.ylim(1, 1000)
plt.yscale('log')
plt.savefig('../figures/metric_usefulness.png')

In [None]:
# Long form of the LOWESS column, in the same layout used for the window metrics
# (location, metric, size, smoother, smoothed_error).
R_long = R.melt(
    id_vars=['location', 'metric', 'size'],
    value_vars=['lowess'],
    var_name="smoother",
    value_name="smoothed_error",
)
R_long

In [None]:
# Grid of line plots: one column per J_* metric, one row per smoother, one line per location.
r_long_grid = sns.FacetGrid(R_long, col='metric', row='smoother')
r_long_grid.map_dataframe(
    sns.lineplot, x='size', y='smoothed_error', hue='location', legend='full'
)
r_long_grid.add_legend()
current_ax = plt.gca()
current_ax.set_xscale('log')
current_ax.set_ylim(0, 20)