## Setup and preprocess df

In [None]:
import numpy as np
from scipy import stats, optimize
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import itertools
import copy,re
%matplotlib inline

In [None]:
# Load the systems table from a shared Google Drive CSV and reduce it to the
# six short-named columns used by the rest of the notebook.
# df = pd.read_csv("https://epochai.org/data/epochdb/notable_systems.csv")
url = 'https://drive.google.com/file/d/1RLLKPU3bEYK65wlQlU0p20u9M8cHkLMl/view?usp=sharing'
# The file id is the second-to-last path segment of the sharing link; the
# "uc?id=" form is the URL that is actually handed to read_csv.
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]

df = pd.read_csv(url)

# Keep only rows that have a notability criterion. The explicit copy makes the
# column assignments below operate on an independent frame instead of a slice.
df = df[df["Notability criteria"].notna()].copy()

df["compute"] = df["Training compute (FLOP)"]
df["date"] = df["Publication date"]
df["model"] = df["System"]
df["poss1e23"] = df["Possibly over 1e23 FLOP"]
df["poss1e25"] = df["Estimated over 1e25 FLOP"]
df["cost"] = df["Training compute cost (2023 USD)"]
# regex=False: "$" is a regex end-of-string anchor, so the literal replacement
# must not depend on the pandas-version-specific default of `regex`.
df["cost"] = (
    df["cost"]
    .str.replace(",", "", regex=False)
    .str.replace("$", "", regex=False)
    .astype(float)
)

df = df[["model", "compute", "date", "cost", "poss1e23", "poss1e25"]].copy()

In [None]:
# Drop the systems listed in `to_remove` from the table.
to_remove = ['AlphaGo Zero','AlphaZero']
is_excluded = df["model"].isin(to_remove)
df = df[~is_excluded]

In [None]:
# Rows to add by hand, in column order:
# [model, compute, date, cost, poss1e23, poss1e25]
to_append = [
  ["Claude 3.5 Sonnet", 4.3e25, "2024-06-21", np.nan, np.nan, np.nan],
  ["GPT-4o Mini", 1.2e25, "2024-07-18", np.nan, np.nan, np.nan],
]

for row in to_append:
  if row[0] not in df["model"].values:
    # df has been filtered, so its index has gaps and len(df) may already be
    # the label of an existing row. Use a label past the current maximum so
    # the assignment appends instead of overwriting.
    # Assumes the integer index produced by read_csv.
    next_label = 0 if df.empty else df.index.max() + 1
    df.loc[next_label] = row

In [None]:
# Manual compute estimates (FLOP) for models whose compute is missing in df.
# "Claude 2.1" reuses the "Claude 2" value as a scalar (NaN if that model is absent).
claude2_compute = df.loc[df["model"] == "Claude 2", "compute"]
to_add_compute = {
    "Claude 3 Opus": 2.5e25,
    "Claude 3 Sonnet": 1.1e25,
    "GPT-4o": 2.9e25,
    "Gemini 1.0 Pro": 2.8e24,
    "Gemini 1.5 Pro": 1.9e25,
    "Reka Core": 8.4e24,
    "GPT-4 Turbo": 2.1e25,  # rough guess
    "GPT-4V": 2.1e25,  # rough guess
    "Claude 2.1": claude2_compute.iloc[0] if len(claude2_compute) else np.nan,  # rough guess
}

for k, v in to_add_compute.items():
  is_model = df["model"] == k
  if not is_model.any():
    # Truth-testing an empty selection is ambiguous, so report a missing model explicitly.
    print(f"{k} not found in df; skipping")
    continue
  # Only fill rows that do not have a value yet; existing values are never overwritten.
  needs_fill = is_model & df["compute"].isna()
  if needs_fill.any():
    df.loc[needs_fill, "compute"] = v
  else:
    print(f"{k} already has a compute value")

In [None]:
# Clear the "possibly over threshold" flags wherever a compute value is known.
has_compute = df["compute"].notna()
df.loc[has_compute, ["poss1e23", "poss1e25"]] = np.nan

# Set some temporary placeholder values
# TODO: revisit
# df.loc[(df["poss1e25"] == "checked"), "compute"] = 1.01e25  # placeholder
# df.loc[((df["poss1e23"] == "checked") & (df["poss1e25"] != "checked")), "compute"] = 1.01e23  # placeholder

# We want to handle these leading models manually via the above compute estimates.
unhandled = df[(df["poss1e25"] == "checked") & (df["compute"].isna())]
assert unhandled.empty, "models flagged over 1e25 FLOP still lack a manual compute estimate"

# We sample 1e23-1e25 models with unknown compute from the existing empirical distribution.
# TODO: revisit
poss1e23 = ((df["poss1e23"] == "checked") & (df["poss1e25"] != "checked"))
pool = df.loc[(df["compute"] >= 1e23) & (df["compute"] < 1e25), "compute"]
n_needed = int(poss1e23.sum())
# Sampling without replacement raises when more rows need a value than the
# pool holds; fall back to sampling with replacement only in that case.
df.loc[poss1e23, "compute"] = pool.sample(
    n_needed, replace=n_needed > len(pool), random_state=0
).values

df["date"] = pd.to_datetime(df["date"])
df["log_compute"] = np.log10(df["compute"])

# Fractional year: January -> year + 0/12, December -> year + 11/12.
# (Using month/12 would push December onto the following year.)
df["date_float"] = df["date"].dt.year + (df["date"].dt.month - 1)/12

df['year'] = df['date'].dt.year

df = df.sort_values("date")
df.dropna(subset=["compute"], inplace=True)

In [None]:
# Scatter of compute against publication date for rows dated after
# 2010-01-01, drawn with a log-scaled y axis and a faint grid.
recent_rows = df[df['date'] > '2010-01-01']
fig = sns.scatterplot(x='date', y='compute', data=recent_rows)
fig.set(yscale='log')
plt.grid(alpha=0.5)
%matplotlib inline

## Analysis class

In [None]:
## utils


def exponential_fit(x,a,b):
    '''
    Exponential model a*exp(b*x); passed to curve_fit as the model function.
    '''
    growth = np.exp(b*x)
    return a*growth

def x_transform_for_exp_fit(ref_x,x,inverse=False):

    '''
    Rescale x for a numerically stable exponential fit, or undo that rescaling.

    Forward: subtract the minimum of ref_x and divide by 1e7.
    Inverse: multiply by 1e7 and add the minimum of ref_x back.
    '''

    offset = ref_x.min()

    if inverse:
        return 1e7*x + offset

    return (x-offset)/1e7

In [None]:
from sklearn.mixture import GaussianMixture; random_seed=42
from scipy.stats import norm
from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit
import pwlf


from dataclasses import dataclass

@dataclass
class AnalysisConfig:
    """Date-range settings, stored as date strings: a fit range and a prediction range."""

    # Range used for fitting.
    fit_start_date: str = "2017-01-01"
    fit_stop_date: str = "2024-01-01"
    # Range used for prediction.
    predict_start_date: str = "2024-01-01"
    predict_stop_date: str = "2030-01-01"

class DataAnalysis():

    def __init__(self,df,window_freq='quarter'):

        self.df = df
        self.df['date'] = pd.to_datetime(self.df['date'])
        self.working_df = None

        self.start_time = '2017-01-01'
        self.stop_time = '2024-01-01'
        self.predict_start_time = '2024-01-01'
        self.predict_stop_time = '2030-01-01'
        self.window_freq = window_freq
        self.window_size = 'year'

        if self.window_freq=='quarter':
            times = pd.date_range(self.start_time,self.stop_time,freq='QS')
            predict_times = pd.date_range(self.predict_start_time,self.predict_stop_time,freq='QS') 
        elif self.window_freq=='biannual':
            times = pd.date_range(start=self.start_time,end=self.stop_time,freq='6MS')[1:-1] #indexing filters out startyear-01-01, endyear-01-01
            predict_times = pd.date_range(self.predict_start_time,self.predict_stop_time,freq='6M')[1:-1] 
        elif self.window_freq=='year':
            times = pd.date_range(start=self.start_time,end=self.stop_time,freq='AS-JUL')
            predict_times = pd.date_range(self.predict_start_time,self.predict_stop_time,freq='AS-JUL')
        else:
            raise ValueError('')
        
        if self.window_size=='year':
            times_lb = times - pd.DateOffset(months=6)
            times_ub = times + pd.DateOffset(months=6)

        #can use these to quickly filter df and get 
        self.window_times = times
        self.window_times_lb = times_lb
        self.window_times_ub = times_ub

        self.predict_times = predict_times

    def time_truncate_df(self,start='2017-01-01',end='2024-01-01'):

        self.working_df = self.df[(self.df['date']>'2017-01-01') & (self.df['date']<'2024-01-01')]

    def fit_distributions(self,fit_type,plot=False):

        '''
        May want to look at doing a fit to the rolling windows
        
        '''

        FIT_TYPES = ['gaussian','gaussian mixture']
        if fit_type not in FIT_TYPES:
            raise ValueError(f'Invalid fit_type. Types: {FIT_TYPES}') 
        self.fit_type=fit_type

        params = {t:None for t in self.window_times}


        if fit_type=='gaussian':
            

            for t,t_lb,t_ub in list(zip(self.window_times,self.window_times_lb,self.window_times_ub)):
                date_filt_condition = (self.working_df['date']>=t_lb) & (self.working_df['date'] < t_ub)
                date_filt_df = self.working_df[date_filt_condition]
                log_compute_data = date_filt_df['log_compute']

                mean = log_compute_data.mean()
                std = 1 #simple for now
                params[t] = {'mean':mean,'std':std}

                if plot: 
                    fig,ax=plt.subplots()
                    plus_minus = "\u00B1"
                    sns.kdeplot(log_compute_data,label=f'timestamp: {t.date()} {plus_minus} 6mo ',linewidth=2,ax=ax)
  
                    mean = log_compute_data.mean()
                    std = np.sqrt(log_compute_data.var()) #simple for now
                    x=np.linspace(10,30,1000)
                    ax.plot(x,norm.pdf(x,loc=mean,scale=std))
                    ax.grid(); ax.legend(loc='upper left')



        
        if fit_type == 'gaussian mixture':
            params = {year:{} for year in years_to_fit}
            for year in years_to_fit:
                log_compute_data = self.working_df[self.working_df['year']==year]['log_compute']
                gmm = GaussianMixture(n_components=2,random_state=random_seed)
                gmm.fit(log_compute_data.to_numpy().reshape(-1,1))
                means,covariances = gmm.means_,gmm.covariances_
                params[year]['means']=means
                params[year]['covars']=covariances
        
        self.fitted_params = params

        return params
    
    def extrapolate_distributions(self):
        
        if self.fit_type=='gaussian':
            
            #linear extrap means
            fit_dates = [t for t in self.fitted_params.keys()]
            fit_dates_float = np.array([t.timestamp() for t in fit_dates])
            means = np.array([self.fitted_params[t]['mean'] for t in self.fitted_params.keys()])
            predicted_dates_float = np.array([t.timestamp() for t in self.predict_times])


            model=LinearRegression()
            model.fit(fit_dates_float.reshape(-1,1),means)
            predicted_means = model.predict(predicted_dates_float.reshape(-1,1))

            #sample std
            std_bounds = (1.1,1.6)
            predicted_stds = np.random.uniform(low=std_bounds[0],high=std_bounds[1],size=(predicted_means.shape))

            predicted_params = {t:(mu,std) for t,mu,std in list(zip(self.predict_times,predicted_means,predicted_stds))}

        elif self.fit_type=='gaussian mixture':
            pass
        else:
            pass

        self.predicted_params = predicted_params
        self.distribution_parameter_model = model

        return predicted_params
    
    
    def model_counts(self,counts_fit_type):

        COUNT_FIT_TYPES=['linear','exponential','kinked linear']

        if counts_fit_type not in COUNT_FIT_TYPES: raise ValueError(f'Expected fit in {COUNT_FIT_TYPES}')
        
        if 0:
            #get model counts
            start_time = '2017-01-01'
            stop_time = '2024-01-01'
            if window_freq=='quarter':
                times = pd.date_range(start_time,stop_time,freq='QE') + pd.Timedelta(days=1)
            elif window_freq=='biannual':
                times = pd.date_range(start_time,stop_time,freq='2QE') + pd.Timedelta(days=1)
            elif window_freq=='year':
                times = pd.date_range(start_time,stop_time,freq='4QE') + pd.Timedelta(days=1)
            else:
                raise ValueError('')
            
            if window_size=='year':
                times_lb = times - pd.DateOffset(months=6)
                times_ub = times + pd.DateOffset(months=6)

        
        #time_data is bad var name but leftover from old code
        time_data = {t:{'size':None,
                   } 
                   for t in self.window_times}
        
        for t,t_lb,t_ub in list(zip(self.window_times,self.window_times_lb,self.window_times_ub)):

            if t_lb < pd.Timestamp(self.start_time) or t_ub > pd.Timestamp(self.stop_time): 
                print(f'Skipping {t} - window not in range')
                time_data[t]['size']=None

                continue
            else:
                date_filt_condition = (self.working_df['date']>=t_lb) & (self.working_df['date'] < t_ub)
                date_tmp_df = self.working_df[date_filt_condition] #filtered df
                time_data[t]['size']=date_tmp_df.shape[0]

        #perform fitting
    
        fit_counts = np.array([t['size'] for t in time_data.values()])
        fit_times_float = np.array([t.timestamp() for t in self.window_times])
        predict_times_float = np.array([t.timestamp() for t in self.predict_times])
 
        if counts_fit_type=='linear':
            model = LinearRegression()
            model.fit(fit_times_float.reshape(-1,1),fit_counts)
            predicted_counts = model.predict(predict_times_float.reshape(-1,1))
            retr_counts = model.predict(fit_times_float.reshape(-1,1)).astype('int')

        elif counts_fit_type=='exponential':
            transformed_fit_x = x_transform_for_exp_fit(ref_x=fit_times_float,x=fit_times_float)
            popt,pcov = curve_fit(exponential_fit,transformed_fit_x,fit_counts)    
            a,b = popt
            transformed_pred_x = x_transform_for_exp_fit(ref_x=fit_times_float,x=predict_times_float)
            predict_counts = exponential_fit(transformed_pred_x,a=a,b=b)
            retr_counts = exponential_fit(transformed_fit_x,a=a,b=b)
        elif counts_fit_type=='kinked linear':
            model = pwlf.PiecewiseLinFit(fit_times_float,fit_counts)
            breakpoints = model.fit(2)
            predict_counts = model.predict(predict_times_float)
            retr_counts = model.predict(fit_times_float)

        else:
            pass

        
        #set state vars
        self.fit_counts = fit_counts
        self.predicted_counts = predicted_counts.astype('int')
        self.count_fit_type = counts_fit_type
        self.retr_counts = retr_counts
        self.counts_model = model

        return predicted_counts
    
    def count_threshold_models(self):

        thresholds = np.arange(23,30+1)
        future_years = list(self.predict_times.year)
        threshold_counts_df = pd.DataFrame(columns=thresholds,index=future_years)

        #not doing rollouts yet
        for pred_year,params,counts in list(zip(future_years,self.predicted_params.values(),self.predicted_counts)):
            mu,sigma = params
            if self.fit_type == 'gaussian':
                log_compute_samples = norm.rvs(loc=mu,scale=sigma,size=counts)
                for thr in thresholds:
                    n_exceed = log_compute_samples[log_compute_samples>=thr].size
                    threshold_counts_df.at[pred_year,thr] = n_exceed

        self.threshold_counts = threshold_counts_df
        return threshold_counts_df
    
    
    def verify_with_retrodiction(self,n_years_retr):

        '''
        Params:
            n_years_retr: Retrodict n years back

        Return:

        Notes:
            - Don't think this is adapted for rolling windows yet (?)
        
        '''

        past_years = self.window_times[-1*n_years_retr:].year
        thresholds = [23,24]
        predicted_past_counts = pd.DataFrame(index=past_years,columns=thresholds)
        observed_past_counts = pd.DataFrame(index=past_years,columns=thresholds)
        percent_error_df = pd.DataFrame(np.nan,index=past_years,columns=thresholds)


        retrodict_times = self.window_times[-1*n_years_retr:]
        retrodict_times_float = np.array([t.timestamp() for t in retrodict_times])
        retr_counts = self.retr_counts

        #pretty inefficient way to do it right now
        for idx,t in enumerate(retrodict_times):

            ##generate distributions and counts
            if self.fit_type=='gaussian':
                mean = self.distribution_parameter_model.predict(retrodict_times_float[idx].reshape(-1,1))
                std = self.working_df[self.working_df['year']==t.year]['log_compute'].std() #just get empirical std
            if self.count_fit_type=='linear':
                count = self.counts_model(retrodict_times_float[idx].reshape(-1,1))
                count = int(count)

            ##generate pred log compute data
            log_compute_data = norm.rvs(loc=mean,scale=std,size=count)

            ##get obs log compute data
            obs_log_compute_data = self.working_df[self.working_df['year']==t.year]['log_compute']

            ##do threshold counts
            for thr in thresholds:
                #pred
                thr_count_pr = log_compute_data[log_compute_data>=thr].size
                predicted_past_counts.at[t.year,thr] = thr_count_pr

                #obs
                thr_count_ob = obs_log_compute_data[obs_log_compute_data>=thr].size
                observed_past_counts.at[t.year,thr] = thr_count_ob

        abs_diff = np.abs(observed_past_counts-predicted_past_counts)
        obs_df_safe = observed_past_counts.replace(0,np.nan) #for safe division
        percent_error_df = (abs_diff/obs_df_safe)*100
             
             
        self.predicted_past_counts = predicted_past_counts
        self.observed_past_counts = observed_past_counts



        return predicted_past_counts,observed_past_counts,percent_error_df
    
    def verify_with_training_compute(self):
         
         return None
    

In [None]:
# Run the full pipeline on the 2017-2024 slice of df.
# .copy() gives an independent frame, so the date conversion below does not
# assign into a filtered view of df.
tmp_df = df[(df['date']>'2017-01-01') & (df['date']<'2024-01-01')].copy()
tmp_df['date'] = pd.to_datetime(tmp_df['date'])

window_freq = 'year'
n_year_retr=4

analysis = DataAnalysis(df=tmp_df,window_freq=window_freq)
analysis.time_truncate_df()
params = analysis.fit_distributions(fit_type='gaussian',plot=False)
predicted_params = analysis.extrapolate_distributions()
predicted_counts = analysis.model_counts(counts_fit_type='linear')
threshold_counts = analysis.count_threshold_models()
pred_past_counts,obs_past_counts,percent_error_df = analysis.verify_with_retrodiction(n_years_retr=n_year_retr)

In [None]:
# Stand-alone retrodiction: for each of the last `n_years_retr` window
# timestamps, sample log-compute values from the fitted models and count how
# many reach each threshold, alongside the observed counts for that year.
n_years_retr = 4

retrodict_times = analysis.window_times[-n_years_retr:]
retrodict_times_float = np.array([t.timestamp() for t in retrodict_times])
past_years = retrodict_times.year

thresholds = [23,24]
predicted_past_counts = pd.DataFrame(index=past_years,columns=thresholds)
observed_past_counts = pd.DataFrame(index=past_years,columns=thresholds)

retrodict_params = dict.fromkeys(retrodict_times)


for t,t_float in zip(retrodict_times,retrodict_times_float):

    model_input = t_float.reshape(-1,1)
    year_log_compute = tmp_df[tmp_df['year']==t.year]['log_compute']

    ##generate distributions and counts
    if analysis.fit_type=='gaussian':
        mean = analysis.distribution_parameter_model.predict(model_input)
        std = year_log_compute.std() #just get empirical std
    if analysis.count_fit_type=='linear':
        count = int(analysis.counts_model.predict(model_input).item())

    ##generate pred log compute data
    log_compute_data = norm.rvs(loc=mean,scale=std,size=count)

    ##get obs log compute data
    obs_log_compute_data = year_log_compute

    ##do threshold counts (predicted first, then observed)
    for thr in thresholds:
        predicted_past_counts.at[t.year,thr] = log_compute_data[log_compute_data>=thr].size
        observed_past_counts.at[t.year,thr] = obs_log_compute_data[obs_log_compute_data>=thr].size

             


