In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import scipy.optimize as optimize

#### Load in and process

In [None]:
# Read the model table from disk. Empty-string cells are converted to NaN so
# that they count as missing; the number of missing 'compute' entries is
# printed before and after that conversion.
df = pd.read_excel('data/save_file.xlsx')
n_missing_before = df['compute'].isna().sum()
print(n_missing_before)
df = df.replace(to_replace='', value=np.nan)
n_missing_after = df['compute'].isna().sum()
print(n_missing_after)

In [None]:
##### Add columns and sort out 'poss 1e23/1e25'

# Rows that already have a compute value do not need the "possibly above
# 1e23 / 1e25" flags, so any flag set on them is cleared.
has_compute = ~df["compute"].isna()
df.loc[has_compute, "poss1e23"] = np.nan
df.loc[has_compute, "poss1e25"] = np.nan

# Set some temporary placeholder values
# TODO: revisit
# df.loc[(df["poss1e25"] == "checked"), "compute"] = 1.01e25  # placeholder
# df.loc[((df["poss1e23"] == "checked") & (df["poss1e25"] != "checked")), "compute"] = 1.01e23  # placeholder

# We want to handle these leading models manually via the above compute estimates.
assert df[(df["poss1e25"] == "checked") & (df["compute"].isna())].size == 0, \
    "rows flagged poss1e25 must have a compute value assigned manually"

# We sample 1e23-1e25 models with unknown compute from the existing empirical distribution.
# TODO: revisit
# NOTE(review): these comparisons run before 'compute' is coerced with
# pd.to_numeric further down; presumably the column is already numeric here - confirm.
poss1e23 = ((df["poss1e23"] == "checked") & (df["poss1e25"] != "checked"))
known_mid_compute = df.loc[(df["compute"] >= 1e23) & (df["compute"] < 1e25), "compute"]
n_to_fill = int(poss1e23.sum())
# Sampling without replacement raises ValueError when more rows need a value
# than there are known values in the pool. Replacement is enabled only in that
# case, so the draw is unchanged whenever the original call would have succeeded.
df.loc[poss1e23, "compute"] = known_mid_compute.sample(
    n_to_fill,
    replace=n_to_fill > len(known_mid_compute),
    random_state=0,
).values



##
df["date"] = pd.to_datetime(df["date"])

# Anything in 'compute' that cannot be parsed as a number becomes NaN.
df['compute'] = pd.to_numeric(df['compute'],errors='coerce')
df["log_compute"] = np.log10(df["compute"])

# NOTE(review): month/12 maps January to year + 1/12 and December to year + 1;
# (month - 1)/12 may have been intended - confirm before relying on date_float.
df["date_float"] = df["date"].dt.year + df["date"].dt.month/12

df = df.sort_values("date")

In [None]:
## exp counts fit

from scipy.stats import t

def exp_pred_counts(years,year_counts,future_years,alpha=0.10):
    """Fit an exponential trend to yearly counts and extrapolate it with a log-space band.

    The curve ``a*exp(b*x)`` is fitted by least squares to ``year_counts``
    against the positions ``0..n-1``. It is then evaluated at
    ``future_years - years[0]`` and truncated to integers. The band is
    ``exp(log(pred) +/- t*SEP)``, where ``SEP`` is the standard error of the
    log-count residuals with ``n-2`` degrees of freedom and ``t`` is the
    two-sided Student-t critical value for ``alpha``.

    Parameters
    ----------
    years : array of the observed years. Only ``years[0]`` is used as the
        origin for ``future_years``; this assumes the observed years are
        consecutive so that position ``i`` corresponds to ``years[0] + i``
        (TODO confirm with callers).
    year_counts : pandas Series with one count per observed year (its
        ``.values`` are used). Counts must be positive because their log is taken.
    future_years : numpy array of years to extrapolate to.
    alpha : two-sided significance level; ``0.10`` gives a 90% band.

    Returns
    -------
    years_all : ``years`` followed by ``future_years``.
    preds_all : fitted values for the observed years followed by the
        integer-truncated predictions for ``future_years``.
    pred_counts_UB, pred_counts_LB : upper and lower band for ``preds_all``.

    With fewer than three observations the degrees of freedom are not
    positive and the band is not meaningful.
    """
    mapped_years = np.arange(0,len(year_counts)).astype('float')
    obs_counts = year_counts.values.astype(float)

    def exp_fit(x,a,b):
        return a*np.exp(b*x)

    popt,_ = optimize.curve_fit(exp_fit,mapped_years,obs_counts)
    pred_counts = exp_fit(future_years-years[0],*popt).astype(int)

    # Confidence bounds, assuming log-normal uncertainty around the fit.
    pred_counts_fit = exp_fit(mapped_years,*popt)
    residuals = np.log(pred_counts_fit) - np.log(obs_counts) # residuals of the log counts

    # Two parameters are fitted, so the residual standard error divides by n-2.
    # (The previous len(year_counts-2) evaluated to n, not n-2.)
    dof = len(year_counts) - 2
    SEP = np.sqrt(np.sum(residuals**2)/dof)

    crit_t_value = stats.t.ppf(1-alpha/2,dof)
    pred_delta = crit_t_value*SEP

    years_all = np.concatenate([years,future_years])
    preds_all = np.concatenate([pred_counts_fit,pred_counts])
    log_preds_all = np.log(preds_all)
    pred_counts_UB = np.exp(log_preds_all+pred_delta)
    pred_counts_LB = np.exp(log_preds_all-pred_delta)

    return years_all,preds_all,pred_counts_UB,pred_counts_LB

#### Exponential counts fit

In [None]:
# Count models per calendar year from start_date onwards; these counts are the
# input of the exponential fit.
start_date = '2017-01-01'
print(len(df))
# .copy() gives an independent frame, so adding the 'year' column below does
# not write into a slice of df (which triggers SettingWithCopyWarning).
df_date_filtered = df[df['date']>start_date].copy()
print(len(df_date_filtered))

REMOVE_2024=True

df_date_filtered['year'] = df_date_filtered['date'].dt.year
year_counts = df_date_filtered['year'].value_counts().sort_index()
future_years = np.arange(2024,2029)

# Restrict the fit to the years 2017-2023 (label-based, inclusive slice).
if REMOVE_2024: year_counts = year_counts.loc[2017:2023]

years = year_counts.index.to_numpy(dtype=float)
# Positions 0..n-1 used as the x values of the fit.
mapped_years = np.arange(0,len(year_counts)).astype('float')



In [None]:
def exp_fit(x,a,b):
    """Exponential curve ``a*exp(b*x)``; ``x`` may be a scalar or a numpy array."""
    growth = np.exp(b*x)
    return a*growth

from scipy import optimize


# Least-squares fit of the exponential curve to the observed yearly counts,
# using the positions 0..n-1 as x values.
x_obs = mapped_years.astype(float)
y_obs = year_counts.values.astype(float)
popt,pcov = optimize.curve_fit(exp_fit,x_obs,y_obs)

# Evaluate the fitted curve at the future years (offset by the first observed
# year) and truncate the result to whole counts.
future_offsets = future_years-years[0]
pred_counts = exp_fit(future_offsets,*popt).astype(int)


In [None]:
# Observed yearly counts next to the extrapolated counts.
fig,ax = plt.subplots()
for _xs,_ys,_label,_color in (
    (years,year_counts,'obs','red'),
    (future_years,pred_counts,'pred','tab:blue'),
):
    ax.plot(_xs,_ys,label=_label,color=_color)

ax.grid(True,alpha=0.6)
ax.legend()

In [None]:
# Band around the exponential fit, assuming log-normal uncertainty:
# exp(log(pred) +/- t * SEP), with SEP the standard error of the log residuals.
pred_counts_fit = exp_fit(mapped_years,*popt)

log_pred_counts_fit = np.log(pred_counts_fit)
log_obs_counts = np.log(year_counts.values.astype(float))
residuals = log_pred_counts_fit - log_obs_counts # residuals of the log counts

alpha = 0.10 # 90% confidence interval
dof = len(year_counts) - 2 # two fitted parameters -> n-2 degrees of freedom
# Divide by the degrees of freedom; the previous len(year_counts-2) evaluated
# to n (length of the shifted Series), not n-2.
SEP = np.sqrt(np.sum(residuals**2)/dof)

from scipy.stats import t
crit_t_value = stats.t.ppf(1-alpha/2,dof)
pred_delta = crit_t_value*SEP


# Fitted values for the observed years followed by the extrapolated counts.
years_all = np.concatenate([years,future_years])
preds_all = np.concatenate([pred_counts_fit,pred_counts])
log_pred_UB = np.log(preds_all)+pred_delta
log_pred_LB = np.log(preds_all)-pred_delta
pred_UB = np.exp(log_pred_UB)
pred_LB = np.exp(log_pred_LB)

fig,ax=plt.subplots(figsize=(10,6))
ax.plot(years_all,preds_all,label='pred',color='tab:blue')
ax.plot(years,year_counts.values,label='obs',color='tab:red')
ax.fill_between(years_all,pred_LB,pred_UB,color='gray',alpha=0.2,label='90% CI')
ax.set_title('Predicted model counts with 90% CI')

ax.set_xticks(years_all)
ax.legend()
ax.grid(alpha=0.6)

In [None]:
from scipy.stats import t

def exp_pred_counts(years,year_counts,future_years,alpha=0.10):
    """Exponential fit of yearly counts with extrapolation and a log-normal band.

    Fits ``a*exp(b*x)`` to ``year_counts`` at the positions ``0..n-1``,
    evaluates the fit at ``future_years - years[0]`` (truncated to integers)
    and returns a band ``exp(log(pred) +/- t*SEP)``. ``SEP`` is the standard
    error of the log-count residuals on ``n-2`` degrees of freedom and ``t``
    is the two-sided Student-t critical value for ``alpha``.

    Parameters
    ----------
    years : array of observed years; ``years[0]`` is the origin used for
        ``future_years``. Assumes consecutive years so that position ``i``
        corresponds to ``years[0] + i`` (TODO confirm with callers).
    year_counts : pandas Series of positive counts, one per observed year.
    future_years : numpy array of years to extrapolate to.
    alpha : two-sided significance level. The default ``0.10`` gives the 90%
        band that was previously hard-coded in this definition.

    Returns
    -------
    years_all, preds_all, pred_counts_UB, pred_counts_LB
        ``preds_all`` holds the fitted values for the observed years followed
        by the predictions for ``future_years``; the bounds have the same layout.
    """
    n_obs = len(year_counts)
    mapped_years = np.arange(0,n_obs).astype('float')
    obs_counts = year_counts.values.astype(float)

    def exp_fit(x,a,b):
        return a*np.exp(b*x)

    popt,_ = optimize.curve_fit(exp_fit,mapped_years,obs_counts)
    pred_counts = exp_fit(future_years-years[0],*popt).astype(int)

    # Confidence bounds, assuming log-normal uncertainty.
    pred_counts_fit = exp_fit(mapped_years,*popt)
    residuals = np.log(pred_counts_fit) - np.log(obs_counts) # residuals of the log counts

    # Two fitted parameters -> n-2 degrees of freedom. The same value is the
    # divisor of the residual standard error (the previous
    # len(year_counts-2) evaluated to n).
    dof = n_obs - 2
    SEP = np.sqrt(np.sum(residuals**2)/dof)
    crit_t_value = stats.t.ppf(1-alpha/2,dof)
    pred_delta = crit_t_value*SEP

    years_all = np.concatenate([years,future_years])
    preds_all = np.concatenate([pred_counts_fit,pred_counts])
    log_preds_all = np.log(preds_all)
    pred_counts_UB = np.exp(log_preds_all+pred_delta)
    pred_counts_LB = np.exp(log_preds_all-pred_delta)

    return years_all,preds_all,pred_counts_UB,pred_counts_LB

#### Fitting KDEs and counting models

In [None]:
import seaborn as sns
from scipy.stats import gaussian_kde,norm,linregress
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression
import sys

In [None]:
# Module-level inputs read by trunc_norm_NLL; both are placeholders that must
# be assigned before the function is called.
mu_0 = None
data_fit = None
def trunc_norm_NLL(sigma):
    """Negative log-likelihood of ``data_fit`` under a lower-truncated normal.

    The normal has mean ``mu_0`` (module-level) and standard deviation
    ``sigma``; the truncation point is the smallest value in ``data_fit``.
    ``data_fit`` must provide ``.to_numpy()``.
    """
    samples = data_fit.to_numpy()
    lower_cut = samples.min()
    log_density = norm.logpdf(samples,mu_0,sigma)
    # log of the probability mass lying above the truncation point
    log_tail_mass = np.log(1-norm.cdf(lower_cut,mu_0,sigma))
    return -np.sum(log_density-log_tail_mass)

In [None]:
## scatter
# Strip plot of log10(compute) per year, plus a linear regression of the
# yearly mean log-compute on the calendar year (mean_log_compute_model).
REMOVE_2024=True

start_date = '2017-01-01'
end_date = '2024-01-01'

if REMOVE_2024:
    date_mask = (df['date']>start_date) & (df['date']<end_date)
else:
    date_mask = df['date']>start_date
# .copy() gives an independent frame, so adding the 'year' column below does
# not write into a slice of df (which triggers SettingWithCopyWarning).
df_date_filtered = df[date_mask].copy()

df_date_filtered['year'] = df_date_filtered['date'].dt.year
df_date_filtered = df_date_filtered[~df_date_filtered['log_compute'].isna()] #remove NaN compute rows

#linear regression of the yearly means
grouped_df_mean = df_date_filtered.groupby(['year'])['log_compute'].mean().reset_index()

X = grouped_df_mean['year'].values
y = grouped_df_mean['log_compute'].values

mean_log_compute_model = LinearRegression()
mean_log_compute_model.fit(X.reshape(-1,1),y)



#plot
fig_S,ax_S=plt.subplots()
#sns.regplot(data=df_date_filtered,x='date_float',y='log_compute',color='tab:blue')
sns.stripplot(x='year',y='log_compute',data=df_date_filtered,jitter=True,ax=ax_S)
# The regression line is drawn against the categorical codes of 'year' (0, 1, 2, ...)
# rather than the year values themselves.
sns.regplot(x=pd.Categorical(df_date_filtered['year']).codes,y='log_compute',data=df_date_filtered,scatter=False,ax=ax_S)
ymin,ymax = ax_S.get_yticks().min(),ax_S.get_yticks().max()
ax_S.set_yticks(np.arange(ymin,ymax,1))
#ax_S.axhline(y=23.7)

#2024 regression line at ~23.7

In [None]:
# Per-year normal fits of log10(compute) (one panel per year), followed by the
# predicted normal densities for the future years.
start_date = '2017-01-01'
print(len(df))
# .copy() gives an independent frame, so adding the 'year' column below does
# not write into a slice of df (which triggers SettingWithCopyWarning).
df_date_filtered = df[df['date']>start_date].copy()
print(len(df_date_filtered))
df_date_filtered['year'] = df_date_filtered['date'].dt.year

PLOT=True
FIT_2024_RT = True
# Mean log-compute for 2024 as predicted by the regression on yearly means.
mean_2024 = mean_log_compute_model.predict(np.array(2024).reshape(-1,1))[0]


start_year=2017
end_year = 2024

years = np.arange(start_year,end_year+1)
log_compute_min = 10
log_compute_max = 28
x = np.linspace(log_compute_min,log_compute_max,1000)


nrows = 4
ncols = 2
fig,axs = plt.subplots(nrows=nrows,ncols=ncols,figsize=(8,10))
axs_ravel = np.ravel(axs)
MEAN,VAR= [],[]



for idx,year in enumerate(years):
    # If there are more years than panels, keep drawing on the last panel
    # (explicit replacement for the former bare try/except around the lookup).
    ax = axs_ravel[min(idx,len(axs_ravel)-1)]


    year_filtered_df = df_date_filtered[df_date_filtered['year']==year]
    nan_frac =(year_filtered_df['log_compute'].isna().sum())/len(year_filtered_df)
    year_filtered_df = year_filtered_df[~year_filtered_df['log_compute'].isna()]
    log_compute_data = year_filtered_df['log_compute']

    if year==2024 and FIT_2024_RT:
        # Find sigma from the right tail: only the data above the predicted
        # mean is used and sigma is fitted by minimising trunc_norm_NLL, which
        # reads the module-level data_fit and mu_0 assigned here.
        RT_filtered_data = log_compute_data[log_compute_data>mean_2024]
        #print(len(RT_filtered_data)/len(log_compute_data)) #sanity check
        init_sigma = np.std(RT_filtered_data)
        data_fit = RT_filtered_data
        mu_0 = mean_2024
        sigma = (minimize(trunc_norm_NLL,[init_sigma])).x

        # abs() because the optimiser is unconstrained; the stored variance
        # sigma**2 is unaffected by the sign.
        mean,std = mu_0,np.abs(sigma[0])
    else:
        # Find mean and standard deviation in the standard way. This branch
        # also covers 2024 when FIT_2024_RT is off, so every year contributes
        # one entry to MEAN and VAR.
        mean,std = np.mean(log_compute_data),np.std(log_compute_data)
        kde = gaussian_kde(log_compute_data)

    MEAN.append(mean); VAR.append(std**2)
    # Recomputed for every year, so the 2024 panel shows its own fit instead
    # of the curve left over from the previous iteration.
    norm_pdf = norm.pdf(x,mean,std)

    sns.kdeplot(log_compute_data,fill=True,ax=ax,label='KDE')
    ax.plot(x,norm_pdf,label='norm fit')
    ax.set_title(f'{year},nan_frac={np.round(nan_frac,1)}')


##predicted pdfs
CONST_VAR=True
tmp_year = 2021

if CONST_VAR:
    # Constant variance: average of the yearly variances from tmp_year onwards.
    idx_tmp_year = np.where(years==tmp_year)[0][0]
    dist_var = np.mean(VAR[idx_tmp_year:])
else:
    dist_var = None #not yet implemented

future_years = np.arange(2024,2028+1)
pred_means = mean_log_compute_model.predict(future_years.reshape(-1,1))

fig,axs=plt.subplots(nrows=3,ncols=2,figsize=(8,10))
axs_ravel = np.ravel(axs)
x_ = np.linspace(15,35)

for idx,year in enumerate(future_years):
    ax=axs_ravel[idx]
    # scipy's scale argument is a standard deviation, so the variance is
    # square-rooted; the curve is plotted against x_ rather than array indices.
    norm_pdf = norm.pdf(x_,pred_means[idx],np.sqrt(dist_var))
    ax.plot(x_,norm_pdf)



fig.tight_layout()

#if not PLOT: plt.close()

##


In [None]:
###Full workflow
# Expected number of future models whose log10(compute) exceeds `threshold`:
# for every future year, predicted model count * P(log_compute >= threshold)
# under a normal with the regression-predicted mean and constant variance.


CONST_VAR = True #take variance from year y
tmp_year = 2021

years = np.arange(2017,2023+1)
future_years = np.arange(2024,2028+1)

#model number
start_date = '2017-01-01'
# .copy() gives an independent frame, so adding the 'year' column below does
# not write into a slice of df (which triggers SettingWithCopyWarning).
df_date_filtered = df[df['date']>start_date].copy()
df_date_filtered['year'] = df_date_filtered['date'].dt.year
year_counts = df_date_filtered['year'].value_counts().sort_index()
year_counts = year_counts.loc[2017:2023]

years_all,pred_counts,pred_counts_UB,pred_counts_LB = exp_pred_counts(years,year_counts,future_years)

if CONST_VAR:
    # VAR is the list of per-year variances filled by the per-year fitting cell.
    idx_tmp_year = np.where(years==tmp_year)[0][0]
    dist_var = np.mean(VAR[idx_tmp_year:])
else:
    dist_var = None #not yet implemented


threshold_count = 0
threshold = 25
x=np.linspace(22,stop=40,num=1000)
# exp_pred_counts returns the fitted values for the observed years followed by
# the predictions for future_years, so the future entries start after the
# observed part. Indexing with idx alone would read the fitted counts of the
# first observed years instead.
n_observed = len(pred_counts) - len(future_years)
for idx,year in enumerate(future_years):
    fmt_year = np.array(year).reshape(-1,1)
    mean,var = mean_log_compute_model.predict(fmt_year),dist_var
    # scipy's scale argument is a standard deviation, hence sqrt(var).
    threshold_cdf = 1-norm.cdf(threshold,mean,np.sqrt(var)) #probability of a model being in [threshold, +inf]
    n_models_threshold = pred_counts[n_observed+idx]*threshold_cdf
    threshold_count += n_models_threshold

