In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.optimize import minimize

# Input CSVs, resolved relative to the notebook's working directory so the notebook
# is not tied to one machine's home directory.
# NOTE(review): assumes all_systems.csv sits in the same data/ folder as the other two
# files (the previous absolute path ended in ".../Tracking models/data/all_systems.csv") - confirm.
DATA_DIR = "data"
all_systems_csv_path = f"{DATA_DIR}/all_systems.csv"
notable_models_csv_path = f"{DATA_DIR}/notable_ai_models.csv"
large_scale_models_csv_path = f"{DATA_DIR}/large_scale_ai_models.csv"

In [None]:
def half_year_bin(date):
    """Label a date with its half-year bin, e.g. '2023-H1' (Jan-Jun) or '2023-H2' (Jul-Dec).

    `date` can be any object exposing `.year` and `.month`.
    """
    half = 'H1' if date.month <= 6 else 'H2'
    return f'{date.year}-{half}'

def year_bin(date):
    """Return the calendar year of `date` (any object with a `.year` attribute) as the bin key."""
    year = date.year
    return year

## Exploring large-scale model distributions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# Input CSVs, resolved relative to the notebook's working directory so the notebook
# is not tied to one machine's home directory.
# NOTE(review): assumes all_systems.csv sits in the same data/ folder as the other two
# files (the previous absolute path ended in ".../Tracking models/data/all_systems.csv") - confirm.
DATA_DIR = "data"
all_systems_csv_path = f"{DATA_DIR}/all_systems.csv"
notable_models_csv_path = f"{DATA_DIR}/notable_ai_models.csv"
large_scale_models_csv_path = f"{DATA_DIR}/large_scale_ai_models.csv"


In [None]:
# Load the large-scale model table, keep entries published after the start of `start_year`,
# tag each row with its publication-year bin and derive log10 training compute.
start_year = 2017

LARGE_SCALE_DATA = pd.read_csv(large_scale_models_csv_path)
LARGE_SCALE_DATA['Publication date'] = pd.to_datetime(LARGE_SCALE_DATA['Publication date'])

# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignment below is unambiguous and does not trigger SettingWithCopyWarning.
# NOTE(review): the strict '>' drops anything dated exactly on 1 Jan of start_year - confirm intended.
published_after_start = LARGE_SCALE_DATA['Publication date'] > f'{start_year}-01-01'
LARGE_SCALE_DATA = LARGE_SCALE_DATA[published_after_start].copy()

#binning
LARGE_SCALE_DATA['Publication bin'] = LARGE_SCALE_DATA['Publication date'].apply(year_bin)

# Only rows with a known training compute can be placed on the log-compute axis.
COMPUTE_DATA = LARGE_SCALE_DATA.dropna(subset=['Training compute (FLOP)']).copy()
COMPUTE_DATA['log Training compute'] = np.log10(COMPUTE_DATA['Training compute (FLOP)'])

In [None]:
# Per-year histograms of log10 training compute, one panel per year, plus a table
# (HISTOGRAM_DATA_DF: rows = compute bins, columns = years) holding the bin counts.

# When True, every 2024 row is counted twice (see the concat inside the loop).
# NOTE(review): presumably compensates for 2024 being a partial year - confirm.
DOUBLE_2024 = True

# One panel per calendar year from the earliest to the latest year present in the data.
# range() keeps the list sorted, gap-free and duplicate-free regardless of the CSV's row
# order and of which years happen to be missing from it.
publication_years = COMPUTE_DATA['Publication bin']
years = list(range(int(publication_years.min()), int(publication_years.max()) + 1))

fig,axs = plt.subplots(nrows=len(years),ncols=1,figsize=(7,11),sharex=True)
axs = np.atleast_1d(axs)  # plt.subplots returns a bare Axes when there is a single row

bin_range = (22,27)
# NOTE(review): np.arange excludes its stop value, so the bin edges are 22..26 and values
# above 26 are not counted even though the x-axis runs to 27. Left unchanged because later
# cells append their own zero-filled '26-27' row to this table - confirm this is intended.
bins=np.arange(bin_range[0],bin_range[-1],1)
bin_labels = [f'{lower}-{lower+1}' for lower in bins[:-1]]

histogram_counts = {}  # year -> list of counts, one per bin
for panel_idx,year in enumerate(years):
    ax = axs[panel_idx]
    filtered_df = COMPUTE_DATA[COMPUTE_DATA['Publication bin']==year] #year df

    if year==2024 and DOUBLE_2024:
        filtered_df = pd.concat([filtered_df,filtered_df],ignore_index=True)

    #histogram data (explicit bin edges already fix the range, so no `range=` is needed)
    bin_count = np.histogram(filtered_df['log Training compute'],bins=bins)[0]
    histogram_counts[year] = [int(count) for count in bin_count]

    #plot
    filtered_df['log Training compute'].plot(kind='hist',bins=bins,range=bin_range,edgecolor='black',ax=ax)
    ax.set_xlabel('');ax.set_ylabel('')
    ax.set_xlim(bin_range)
    ax.tick_params(axis='y',labelsize=12)

    ax.set_title(f'Year {year}, n={len(filtered_df)}',fontsize=15)
    ax.grid(alpha=0.5)
    ax.set_ylim([0,40])

HISTOGRAM_DATA_DF = pd.DataFrame(histogram_counts,index=bin_labels)

# Shared axis labels, placed just outside the panel grid
fig.text(0.5, -0.04, 'Log Compute ($10^X$)', ha='center', fontsize=15)
fig.text(-0.04, 0.5, 'Frequency', va='center', rotation='vertical', fontsize=15)
# x ticks are shared across panels (sharex=True); only the bottom panel shows labels
axs[-1].set_xticks(bins)
axs[-1].tick_params(axis='x',labelsize=15)
fig.tight_layout(rect=[0.04, 0.04, 1, 1])



## Fitting gamma distributions

In [None]:
# Fit (or hand-specify) a gamma distribution for each year's compute histogram and plot
# the data against the resulting curve.

# --- configuration ---
FIT_NORMED = True   # fit the normalised data; this gives way better fits than the unnormalised data
PLOT_PROBS = False  # True: plot probabilities, False: rescale everything to frequencies
MANUAL_FIT = True   # True: use MANUAL_PARAMS below instead of the scipy fit results
# (shape, loc, scale) per year, passed positionally to stats.gamma.pdf.
# Shape k=1 yields an exponential distribution.
MANUAL_PARAMS = {
    'params_2022':(0.5,0,0.15),
    'params_2023':(0.35,0,0.20),
    'params_2024':(0.25,0,0.35),
}
stt=0.2  # first x position of the grid the bins are mapped onto

# The MANUAL_PARAMS above were noted as a good set of manual params
# (manual fitting for 2022: alpha = 0.5, theta = 0.15).
# Others that were tried:
#     'params_2022':(1,0,0.025), 'params_2023':(1,0,0.030), 'params_2024':(1,0,0.025)

years_to_fit = [2022,2023,2024] #just from eyeballing plots for now
NORMED_FIT_DATA = [] #to store normed fit data for later plotting
NORM_FACTORS = []
ALPHA = [] #to store shape parameter
THETA = [] #to store scale parameter
const = 1e-4

# Zero-count rows appended above the histogram's last bin
new_data = pd.Series([0,0,0],index=['26-27','27-28','28-29'])

for year in years_to_fit:
    # drop the lowest bin and pad the top with the empty bins
    fit_data = pd.concat([HISTOGRAM_DATA_DF[year].drop('22-23'),new_data])

    # counts -> array (offset by const), its total, and the normalised version
    fit_data_arr = np.array(fit_data) + const
    norm_factor = fit_data_arr.sum()
    normed_fit_data = (fit_data_arr/norm_factor)+const #const to prevent fit errors
    NORM_FACTORS.append(norm_factor)
    NORMED_FIT_DATA.append(normed_fit_data)

    # NOTE(review): gamma.fit receives per-bin values here, not individual observations -
    # confirm this is the intended use of the fitter.
    fit_input = normed_fit_data if FIT_NORMED else fit_data_arr
    shape,_,scale = stats.gamma.fit(fit_input,floc=0)

    ALPHA.append(shape)
    THETA.append(scale)

x_labels = fit_data.index

# Choose between the hand-tuned and the fitted (shape, loc, scale) triples
if MANUAL_FIT:
    PARAMS = [MANUAL_PARAMS[f'params_{fit_year}'] for fit_year in years_to_fit]
else:
    PARAMS = [[alpha,0,theta] for alpha,theta in zip(ALPHA,THETA)]
params_2022,params_2023,params_2024 = PARAMS

# x grid shared by all panels; don't start at 0 to avoid infs
x=np.linspace(stt,stt+1,len(fit_data))

fig,axs=plt.subplots(ncols=3,figsize=(12,6),sharey=True)
for idx,ax in enumerate(axs):
    ax.set_xticks(x)

    # Probabilities are plotted as-is; frequencies are rescaled by the year's total
    y_scale = 1 if PLOT_PROBS else NORM_FACTORS[idx]
    y_label = 'Probability' if PLOT_PROBS else 'Frequency'

    ax.bar(x,y_scale*NORMED_FIT_DATA[idx],width=0.1,alpha=0.5,color='g',label='Data') #plot data

    fitted_pdf = stats.gamma.pdf(x,*PARAMS[idx])
    ax.plot(x,y_scale*fitted_pdf,'r-',lw=2,label='Fitted gamma distribution')

    ax.set_xticklabels(x_labels,rotation=45)
    if idx==1:
        ax.legend()
    if idx==0:
        ax.set_ylabel(y_label)

    ax.set_title(f'Year: {years_to_fit[idx]}')

fig.suptitle('Gamma distribution frequency predictions')

# Shape / scale actually used for plotting, kept for the extrapolation step
ALPHA_FIT = [params[0] for params in PARAMS]
THETA_FIT = [params[-1] for params in PARAMS]


In [None]:
from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression

# Extrapolate yearly model counts and gamma parameters to future years, then turn them
# into predicted per-bin frequencies (PREDICTED_HISTOGRAM_DATA_DF).
MODEL_COUNTS_FIT = 'linear'  # 'linear' or 'exponential'

# Regression inputs: the fitted years are mapped to 0, 1, 2, ... (first fitted year = 0)
x_ = (np.arange(0,len(years_to_fit))).reshape(-1,1)
extrap_start_year = 2025
extrap_end_year = 2029
future_years = np.arange(extrap_start_year,extrap_end_year+1)
# Same mapping for the future years; derived from years_to_fit so the two stay in sync
MAPPED_future_years = (future_years - years_to_fit[0]).reshape(-1,1)


#model counts
#COULD fit data from 2020 - 2024 for model counts
model_counts = [int(elem) for elem in NORM_FACTORS]
if MODEL_COUNTS_FIT=='exponential':
    def exponential_model(x,a,b,c):
        """a*exp(b*x) + c"""
        return a*np.exp(b*x) + c
    p0 = (1,0,1)  # initial guess, now actually handed to curve_fit
    popt,pcov = curve_fit(exponential_model,x_.flatten(),np.array(model_counts),p0=p0)
    # 1-D integer counts, matching the shape and dtype produced by the linear branch
    predicted_counts = exponential_model(MAPPED_future_years.flatten(),*popt).astype('int')
elif MODEL_COUNTS_FIT=='linear':
    lin_counts_model = LinearRegression()
    lin_counts_model.fit(x_,model_counts)
    predicted_counts = (lin_counts_model.predict(MAPPED_future_years)).astype('int')
else:
    raise ValueError(f'Unknown fit entered: {MODEL_COUNTS_FIT}')

##Parameter extrapolation
#The predictions below hold alpha (shape) fixed at its mean over the fitted years and
#extrapolate theta (scale) with a linear regression.

def exp_func(x,a,b):
    """a*exp(-b*x)"""
    return a*np.exp(-b*x)

# Exponential-decay extrapolation of alpha; kept for inspection, the prediction loop
# below does not use PRED_ALPHA.
alpha_popt,_ = curve_fit(exp_func,x_.flatten(),ALPHA_FIT)
PRED_ALPHA = exp_func(MAPPED_future_years.flatten(),*alpha_popt)


theta_linear_model = LinearRegression()
theta_linear_model.fit(x_,THETA_FIT)
PRED_THETA = theta_linear_model.predict(MAPPED_future_years)


#getting predictions for future years
PREDICTED_HISTOGRAM_DATA_DF = pd.DataFrame(index=x_labels,columns=future_years)

for idx,year in enumerate(future_years):
    # `x` is the bin grid defined in the gamma-fitting cell (one position per label in x_labels)
    pdf_predictions = stats.gamma.pdf(x,np.mean(ALPHA_FIT),loc=0,scale=PRED_THETA[idx])
    frequency_predictions = predicted_counts[idx]*pdf_predictions
    PREDICTED_HISTOGRAM_DATA_DF[year] = frequency_predictions.astype('int')


## Fitting truncated exponentials

In [None]:
def truncated_exponential_log_likelihood(data,lambda_,t):
    """Negative log-likelihood of an exponential distribution truncated to [0, t].

    The truncated density is lambda * exp(-lambda * x) / (1 - exp(-lambda * t)).
    Despite the function's name, the value returned is the *negative* log-likelihood,
    so it can be handed straight to a minimiser.

    Parameters
    ----------
    data : array-like
        Observations; converted with np.asarray, so lists work as well as arrays.
    lambda_ : float
        Rate parameter. Must be positive for the result to be finite.
    t : float
        Truncation point (upper end of the support).

    Returns
    -------
    float
        Negative log-likelihood of `data` under the truncated exponential.
    """
    data = np.asarray(data,dtype=float)
    log_pdf = np.log(lambda_) - lambda_*data
    # 1 - exp(-lambda*t), written with expm1 so it stays accurate when lambda*t is tiny
    cdf_value = -np.expm1(-lambda_*t)
    log_likelihood = np.sum(log_pdf) - data.size*np.log(cdf_value)

    return -log_likelihood #we seek to minimise negative LL

def fit_truncated_exponential(data,threshold,lambda_0=1):
    """Maximum-likelihood estimate of the rate of an exponential truncated at `threshold`.

    Minimises truncated_exponential_log_likelihood over lambda > 0 with L-BFGS-B.

    Parameters
    ----------
    data : array-like
        Observations, expected to lie in [0, threshold].
    threshold : float
        Truncation point passed to the likelihood as `t`.
    lambda_0 : float, optional
        Initial guess for the rate.

    Returns
    -------
    float
        Fitted rate. When the data carry no evidence of decay (e.g. their mean is at or
        above threshold/2) the estimate sits at the small positive lower bound.

    Raises
    ------
    ValueError
        If `data` is empty.
    RuntimeError
        If the optimiser reports failure.
    """
    data = np.asarray(data,dtype=float)
    if data.size == 0:
        raise ValueError('cannot fit a truncated exponential to empty data')

    min_rate = 1e-8  # keeps the search away from lambda <= 0, where log(lambda) is undefined
    result = minimize(
        lambda params: truncated_exponential_log_likelihood(data,params[0],threshold),
        x0=[max(float(lambda_0),min_rate)],
        method='L-BFGS-B',
        bounds=[(min_rate,None)],
    )
    if not result.success:
        raise RuntimeError(f'truncated exponential fit did not converge: {result.message}')
    return float(result.x[0])
    

def test_fit():
    """Placeholder for a self-check of the truncated-exponential fit.

    TODO: generate some sample data and check if the optimisation finds the right parameters.
    """
    return None

In [None]:
# Fit a truncated exponential to each year's (padded, normalised) histogram.
years_to_fit = [2022,2023,2024]
frontier_bins = [24,25,25] #number is bin lower bound. So no probability mass assigned to bin > (item+1)
assert len(years_to_fit)==len(frontier_bins)

# NOTE: `start_year` is also set (to 2017) by the data-loading cell; this overwrites it.
start_year = 2025
end_year = 2029
future_years = np.arange(start_year,end_year+1,1)
future_frontier_bins = [25,26,26,27,27] #just fixing for now
assert len(future_years)==len(future_frontier_bins)

NORMED_FIT_DATA = []
NORM_FACTORS = []
LAMBDA = [] #exponential distribution rate parameter
const = 1e-4

for year,frontier_bin in zip(years_to_fit,frontier_bins):
    #adding values
    fit_data = HISTOGRAM_DATA_DF[year]
    fit_data = fit_data.drop('22-23')
    new_data = pd.Series([0,0,0],index=['26-27','27-28','28-29'])
    fit_data = pd.concat([fit_data,new_data])

    #data structure processing
    fit_data_arr = np.array(fit_data) + const
    norm_factor = fit_data_arr.sum(); NORM_FACTORS.append(norm_factor)
    normed_fit_data = (fit_data_arr/norm_factor)+const #const to prevent fit errors
    NORMED_FIT_DATA.append(normed_fit_data)

    #fit truncated exponential
    # fit_truncated_exponential requires a truncation threshold; use the upper edge of this
    # year's frontier bin, following the comment on frontier_bins above.
    # NOTE(review): the data passed in are normalised bin values while the threshold is in
    # log10-compute units - confirm that both are meant to live on the same axis.
    lambda_ =  fit_truncated_exponential(normed_fit_data,threshold=frontier_bin+1)
    LAMBDA.append(lambda_)

#PLOTTING
x_labels = fit_data.index