In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.linear_model import LinearRegression #for linear regression
from scipy.optimize import curve_fit #for exponential fit

# Location of the systems CSV that the cells below load with pd.read_csv.
# NOTE(review): absolute path into one user's home directory; it has to be edited
# before the notebook can run anywhere else.
csv_path = '/Users/iyngkarrankumar/Documents/Misc/Tracking models/data/all_systems.csv'

In [None]:
def half_year_bin(date):
    """Label a date with the half of the calendar year it falls in.

    Parameters
    ----------
    date : object with ``month`` and ``year`` attributes
        For example a ``datetime.date`` or a ``pandas.Timestamp``.

    Returns
    -------
    str
        ``'<year>-H1'`` for months 1 through 6, ``'<year>-H2'`` otherwise.
    """
    half = 'H1' if date.month <= 6 else 'H2'
    return f'{date.year}-{half}'

def year_bin(date):
    """Return the calendar year of ``date``, used as the label of its yearly bin."""
    bin_label = date.year
    return bin_label


def exponential_model(x,a,b):
    """Exponential curve anchored at 2017: ``a * exp(b * (x - 2017))``.

    Parameters
    ----------
    x : scalar or array
        Year(s) at which to evaluate the curve.
    a : float
        Value of the curve at ``x == 2017``.
    b : float
        Growth rate per unit of ``x``.
    """
    years_since_anchor = x - 2017
    return a * np.exp(b * years_since_anchor)

def geometric_model(x,a,r):
    """Geometric series anchored at 2017: ``a * r ** (x - 2017)``.

    Parameters
    ----------
    x : scalar or array
        Year(s) at which to evaluate the series.
    a : float
        Value of the series at ``x == 2017``.
    r : float
        Multiplicative growth factor per unit of ``x``.
    """
    steps_from_anchor = x - 2017
    growth = r ** steps_from_anchor
    return a * growth

## Compute distributions

In [None]:
# Load the systems table and parse the publication dates so that rows can be
# filtered and binned by date.
DATA = pd.read_csv(csv_path)
DATA['Publication date'] = pd.to_datetime(DATA['Publication date'])

# Keep only the systems that report a training compute.
DATA_ = DATA.dropna(subset=['Training compute (FLOP)'])


# Keep systems published after the first day of `start_year` (the strict '>'
# leaves out rows dated exactly <start_year>-01-01).
# `.copy()` makes DATA_f1 an independent frame: the column assignments below
# then write to DATA_f1 itself instead of to a slice of DATA_, which is what
# triggers pandas' SettingWithCopyWarning.
start_year = 2017
DATA_f1 = DATA_[DATA_['Publication date'] > f'{start_year}-01-01'].copy()

# New column holding the publication bin. The bins are computed from the
# filtered rows themselves rather than from the unfiltered frame.
bin_type = 'year'
if bin_type=='year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(year_bin)
elif bin_type=='half year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(half_year_bin)
else:
    # Fail here rather than with a KeyError on 'Publication_Bin' further down.
    raise ValueError(f"bin_type must be 'year' or 'half year', got {bin_type!r}")

# New column holding the training compute in orders of magnitude.
DATA_f1['log10 Training compute (FLOP)'] = np.log10(DATA_f1['Training compute (FLOP)'])

# Mean log10 compute per bin; kept as the last expression so the notebook displays it.
DATA_f1.groupby('Publication_Bin')['log10 Training compute (FLOP)'].mean()

In [None]:
# One histogram of log10 training compute per publication bin, stacked vertically.
years = list(reversed(DATA_f1['Publication_Bin'].unique()))
# squeeze=False always yields a 2-D array of Axes, so the indexing below also
# works when there is only a single bin.
fig,axs = plt.subplots(nrows=len(years),ncols=1,figsize=(8,12),sharex=True,squeeze=False)
axs = axs[:,0]
bin_range = (15,27)
# Unit-wide bin edges. The "+1" keeps the upper edge of bin_range, so the top
# interval (26 to 27) is drawn; without it, values of 26 or more were missing
# from the bars while still being counted in the n= of the title.
bins=np.arange(bin_range[0],bin_range[-1]+1,1)

for idx,year in enumerate(years):
    ax = axs[idx]
    filtered_df = DATA_f1[DATA_f1['Publication_Bin']==year] #rows of this bin only
    filtered_df['log10 Training compute (FLOP)'].plot(kind='hist',bins=bins,range=bin_range,edgecolor='black',ax=ax)
    ax.set_xlabel('');ax.set_ylabel('') #shared axis labels are added once with fig.text below
    ax.set_xlim(bin_range)
    ax.tick_params(axis='y',labelsize=12)
    # Only the first panel is forced to integer y ticks, as in the original layout.
    if idx==0:     ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

    ax.set_title(f'Year {year}, n={len(filtered_df)}',fontsize=15)
    ax.grid(alpha=0.5)



# Figure-level axis labels shared by all panels.
fig.text(0.5, -0.04, 'Log Compute ($10^X$)', ha='center', fontsize=15)
fig.text(-0.04, 0.5, 'Frequency', va='center', rotation='vertical', fontsize=15)
plt.xticks(bins,fontsize=15)
# tight_layout computes the spacing between panels itself, so the former
# subplots_adjust(hspace=10) call had no effect and is dropped.
plt.tight_layout(rect=[0.04, 0.04, 1, 1])



## Model number extrapolation

In [None]:
# When True, the last (partial) year is scaled up instead of dropped.
# NOTE(review): the original author marked that branch as not working.
DOUBLE_2024 = False

DATA = pd.read_csv(csv_path)
DATA['Publication date'] = pd.to_datetime(DATA['Publication date'])

# Rows without a training compute are kept on purpose: only the number of
# models per year matters in this section.

# Keep systems published after the first day of `start_year`. `.copy()` gives an
# independent frame so the column assignments below do not write into a slice
# of DATA (avoids pandas' SettingWithCopyWarning).
start_year = 2017
DATA_f1 = DATA[DATA['Publication date'] > f'{start_year}-01-01'].copy()

# Publication bin, computed from the filtered rows themselves.
bin_type = 'year'
if bin_type=='year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(year_bin)
elif bin_type=='half year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(half_year_bin)
else:
    raise ValueError(f"bin_type must be 'year' or 'half year', got {bin_type!r}")

DATA_f1['log10 Training compute (FLOP)'] = np.log10(DATA_f1['Training compute (FLOP)'])

# Number of models per bin in ascending bin order. Years and counts come from
# the same series, so they cannot get out of step.
counts_by_bin = DATA_f1['Publication_Bin'].value_counts().sort_index()
years = list(counts_by_bin.index)
model_counts = list(counts_by_bin)

if DOUBLE_2024: #doesn't work
    print(years,model_counts)
    # NOTE(review): doubles the last count and then adds its difference to the
    # previous year; the intent of the second step is unclear - confirm.
    model_counts[-1] = 2*model_counts[-1]
    model_counts[-1] = model_counts[-1] + (model_counts[-1]-model_counts[-2])
    future_years = np.arange(2025,2030)
else:
    # Drop the last bin (the 2024 count) and extrapolate from 2024 onwards.
    years.pop()
    model_counts.pop()
    future_years = np.arange(2024,2030)

years = np.array(years)
model_counts = np.array(model_counts)


#Linear extrapolation
linear_model = LinearRegression()
linear_model.fit(years.reshape(-1,1),model_counts)
linear_pred = linear_model.predict(future_years.reshape(-1,1))

#Polynomial extrapolation
degree = 2
coefficients = np.polyfit(years, model_counts,degree)
polynomial = np.poly1d(coefficients)
poly_pred = polynomial(future_years)


#geometric series
popt_geometric, _ = curve_fit(geometric_model,years,model_counts)
geometric_pred = geometric_model(future_years,*popt_geometric)


In [None]:
# Observed model counts (blue) next to the geometric extrapolation (red).
fig,ax=plt.subplots()

# The labels give ax.legend() something to show; without any labelled artist
# the legend is empty and matplotlib emits a warning.
ax.bar(years,model_counts,color='tab:blue',label='Observed')
ax.bar(future_years,geometric_pred,color='tab:red',alpha=0.8,label='Geometric extrapolation')
ax.legend(fontsize=12)
ax.grid(alpha=0.5)
ax.tick_params(axis='x',labelsize=12)
ax.tick_params(axis='y',labelsize=12)
ax.set_ylabel('Models released',fontsize=15)
ax.set_xlabel('Years',fontsize=15)

## Task 1 - Extrapolating models that exceed thresholds

In [None]:
## SETUP AND REGRESSION

DATA = pd.read_csv(csv_path)
DATA['Publication date'] = pd.to_datetime(DATA['Publication date'])

# Keep only the systems that report a training compute.
DATA_ = DATA.dropna(subset=['Training compute (FLOP)'])


# Keep systems published after the first day of `start_year`.
start_year = 2017
DATA_f1 = DATA_[DATA_['Publication date'] > f'{start_year}-01-01']

# Exclude AlphaGo Zero and AlphaGo Master. `.copy()` makes the result an
# independent frame so the column assignments below do not write into a slice
# (avoids pandas' SettingWithCopyWarning).
SYSTEMS_TO_REMOVE = ['AlphaGo Zero','AlphaGo Master']
DATA_f1 = DATA_f1[~DATA_f1['System'].isin(SYSTEMS_TO_REMOVE)].copy()

# Publication bin, computed from the filtered rows themselves.
bin_type = 'year'
if bin_type=='year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(year_bin)
elif bin_type=='half year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(half_year_bin)
else:
    raise ValueError(f"bin_type must be 'year' or 'half year', got {bin_type!r}")

# Training compute in orders of magnitude.
DATA_f1['log10 Training compute (FLOP)'] = np.log10(DATA_f1['Training compute (FLOP)'])

# Bins in ascending order, without the last one (2024 is ignored: n=4).
years = (sorted(DATA_f1['Publication_Bin'].unique())); years.pop()
years=np.array(years)

# Per-bin mean and variance of log10 compute. groupby returns the bins in
# ascending order, matching `years`; the last bin is dropped here as well.
log_compute_by_bin = DATA_f1.groupby('Publication_Bin')['log10 Training compute (FLOP)']
historic_means = list(log_compute_by_bin.mean())
historic_means.pop()

historic_var = list(log_compute_by_bin.var())
historic_var.pop()

# Growth rate in orders of magnitude per year: linear fit of the yearly means.
historic_means = np.array(historic_means)
mean_differences = np.diff(historic_means)
log_training_compute_linear_model = LinearRegression()
log_training_compute_linear_model.fit(years.reshape(-1,1),historic_means)



# Average variance across years (deliberately not a regression). nanmean skips
# bins with a single system, whose sample variance is NaN and would otherwise
# turn the average into NaN.
var = np.nanmean(np.array(historic_var))


In [None]:
## GENERATING SIMULATED DATA

model_counts_fit = "poly" #exp, lin ,poly
SIMULATION_SEED = 0 #fixed seed so the bootstrap below gives the same draw on every run

start_year = 2024
end_year = 2029
future_years = np.arange(start_year,end_year+1,dtype=int)

# Predicted number of models per future year, truncated to whole models.
if model_counts_fit=="exp":
    predicted_model_counts = (geometric_model(future_years,*popt_geometric)).astype('int')
elif model_counts_fit=="lin":
    predicted_model_counts = (linear_model.predict(future_years.reshape(-1,1))).astype('int')
elif model_counts_fit=='poly':
    predicted_model_counts = (polynomial(future_years)).astype('int')
else:
    # Fail here rather than with a NameError on predicted_model_counts below.
    raise ValueError(f"model_counts_fit must be 'exp', 'lin' or 'poly', got {model_counts_fit!r}")

# Predicted mean log10 compute per future year from the linear trend of the yearly means.
predicted_mean_training_compute = log_training_compute_linear_model.predict(future_years.reshape(-1,1))

assert len(future_years) == len(predicted_mean_training_compute) == len(predicted_model_counts)

historical_data = DATA_f1['log10 Training compute (FLOP)'].values
historical_data_years = DATA_f1['Publication_Bin'].values
# NOTE(review): DATA_f1 still holds the rows of its last bin (only the fitted
# means and variances dropped it), and future_years starts at 2024, so 2024 can
# appear in combined_df both as observed and as simulated rows - confirm that
# this is intended.

rng = np.random.default_rng(SIMULATION_SEED)
target_std = np.sqrt(var)
simulated_data = {}

for idx,year in enumerate(future_years):
    n_models = max(int(predicted_model_counts[idx]),0) #a fit can predict zero or fewer models
    if n_models == 0:
        simulated_data[year] = np.array([],dtype=float)
        continue

    #bootstrapping: resample the historical log10 compute values with replacement
    samples = rng.choice(historical_data,size=n_models,replace=True)
    sample_mean = np.mean(samples)
    sample_std = np.std(samples)

    #standardise the sample; a sample without spread (e.g. one model) maps to all zeros
    if sample_std > 0:
        adjusted_sample_Z = (samples-sample_mean)/sample_std
    else:
        adjusted_sample_Z = np.zeros_like(samples,dtype=float)

    #rescale to the historical average variance and shift to the predicted mean of the year
    adjusted_sample = adjusted_sample_Z*target_std + predicted_mean_training_compute[idx]

    simulated_data[year]=adjusted_sample



#combine historical and simulated data
# The frames are collected in a list and concatenated once. Concatenating onto
# an empty frame inside the loop copied the data on every pass and could leave
# the 'year' column with object dtype.
frames = [pd.DataFrame({
    'year': historical_data_years,
    'log10 Training Compute':historical_data
})]
for year in future_years:
    simulated_data_year = simulated_data[year]
    if len(simulated_data_year) == 0:
        continue
    frames.append(pd.DataFrame({
        'year': np.full(len(simulated_data_year),int(year)),
        'log10 Training Compute':simulated_data_year
    }))
combined_df = pd.concat(frames,ignore_index=True)


#### Compute distributions (for simulated data)

In [None]:
# Rows of combined_df from 2024 onwards, one histogram per year.
simulated_df = combined_df[combined_df['year']>=2024]

years = sorted(list(simulated_df['year'].unique()))
# squeeze=False always yields a 2-D array of Axes, so the indexing below also
# works when there is only a single year.
fig,axs = plt.subplots(nrows=len(years),figsize=(8,12),sharex=True,squeeze=False)
axs = axs[:,0]
bin_range = (15,31)
# Unit-wide bin edges. The "+1" keeps the upper edge of bin_range, so the top
# interval (30 to 31) is drawn; without it, values of 30 or more were missing
# from the bars while still being counted in the n= of the title.
bins=np.arange(bin_range[0],bin_range[-1]+1,1)

for idx,year in enumerate(years):
    ax = axs[idx]
    filtered_df = simulated_df[simulated_df['year']==year] #rows of this year only
    filtered_df['log10 Training Compute'].plot(kind='hist',bins=bins,range=bin_range,edgecolor='black',ax=ax)
    ax.set_xlabel('');ax.set_ylabel('') #shared axis labels are added once with fig.text below
    ax.set_xlim(bin_range)
    ax.tick_params(axis='y',labelsize=12)

    ax.set_title(f'Year {year}, n={len(filtered_df)}',fontsize=15)
    ax.grid(alpha=0.5)



# Figure-level axis labels shared by all panels.
fig.text(0.5, -0.04, 'Log Compute ($10^X$)', ha='center', fontsize=15)
fig.text(-0.04, 0.5, 'Frequency', va='center', rotation='vertical', fontsize=15)
plt.xticks(bins,fontsize=15)
# tight_layout computes the spacing between panels itself, so the former
# subplots_adjust(hspace=10) call had no effect and is dropped.
plt.tight_layout(rect=[0.04, 0.04, 1, 1])


#### Plots

In [None]:
# Switches selecting which overview figure this cell draws.
SCATTER=True
MODEL_COUNT=False

if SCATTER:
    # One marker per system: year against log10 training compute.
    plt.scatter(
        combined_df['year'],
        combined_df['log10 Training Compute'],
        marker='x',
        alpha=0.5,
    )
    plt.xticks(np.arange(2017,2030),rotation=45)
    plt.ylabel('log10 Compute')
    plt.grid(alpha=0.4)

if MODEL_COUNT:
    # Fitted-on counts followed by the predicted counts, one bar per year.
    years = sorted(combined_df.year.unique())
    total_model_counts=np.concatenate((model_counts,predicted_model_counts))
    plt.bar(years,total_model_counts)
    plt.xticks(np.arange(2017,2030),rotation=45)
    plt.title("Number of models released each year")
    plt.grid(alpha=0.4)


#### Counting models exceeding threshold

In [None]:
#For every year X: how many models released up to and including X are at or above threshold Y
threshold = 25 #lowest threshold in log10 FLOP; the next order of magnitude is counted as well
thresholds_log_flop = (threshold,threshold+1)
years = np.array(sorted(combined_df['year'].unique()))

# cumulative_counts[t][i] is the number of rows with year <= years[i] and
# log10 compute >= t. One loop serves both thresholds instead of two
# hand-copied passes with hard-coded 25 and 26.
cumulative_counts = {t: [] for t in thresholds_log_flop}
for year in years:
    date_condition = combined_df['year'] <= year
    for t in thresholds_log_flop:
        compute_condition = combined_df['log10 Training Compute'] >= t
        cumulative_counts[t].append(int((date_condition & compute_condition).sum()))

# Names kept from the original cell (10^25 and 10^26 with the default threshold).
plot_data_25 = cumulative_counts[thresholds_log_flop[0]]
plot_data_26 = cumulative_counts[thresholds_log_flop[1]]


# The two series are shifted by -0.1 / +0.1 so both bars of a year stay visible.
plt.bar(years-0.1,plot_data_25,width=0.5,label=f'10^{thresholds_log_flop[0]} FLOP threshold',alpha=0.8)
plt.bar(years+0.1,plot_data_26,width=0.5,label=f'10^{thresholds_log_flop[1]} FLOP threshold',alpha=0.8)
plt.legend()
plt.xticks(np.arange(2017,2030),rotation=45)
plt.yticks(range(0,3001,250))
plt.title('Number of models exceeding threshold')
plt.grid(alpha=0.4)

## Task 3 - Frontier models

In [None]:
# Load the systems table and parse the publication dates.
DATA = pd.read_csv(csv_path)
DATA['Publication date'] = pd.to_datetime(DATA['Publication date'])

# Keep only the systems that report a training compute.
DATA_ = DATA.dropna(subset=['Training compute (FLOP)'])


# Keep systems published after the first day of `start_year`. `.copy()` makes
# DATA_f1 an independent frame so the column assignments below do not write
# into a slice of DATA_ (avoids pandas' SettingWithCopyWarning).
start_year = 2017
DATA_f1 = DATA_[DATA_['Publication date'] > f'{start_year}-01-01'].copy()

# Publication bin, computed from the filtered rows themselves.
bin_type = 'year'
if bin_type=='year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(year_bin)
elif bin_type=='half year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(half_year_bin)
else:
    raise ValueError(f"bin_type must be 'year' or 'half year', got {bin_type!r}")

# Training compute in orders of magnitude.
DATA_f1['log10 Training compute (FLOP)'] = np.log10(DATA_f1['Training compute (FLOP)'])

In [None]:
# Width of the "frontier" below the largest run to date, in orders of magnitude (OOMs).
frontier_depth_1 = 1
frontier_depth_05 = 0.5

# Systems to leave out; set to ['AlphaGo Zero','AlphaGo Master'] to exclude those two.
REMOVE_SYSTEMS = []
DATA_f1 = DATA_f1[~DATA_f1['System'].isin(REMOVE_SYSTEMS)]

years = list(reversed(DATA_f1['Publication_Bin'].unique()))
LARGEST_RUNS = []           # (system name, log10 compute rounded to 2 decimals) per year
N_FRONTIER_SYSTEMS_1 = []   # systems within 1 OOM of the largest run to date
N_FRONTIER_SYSTEMS_05 = []  # systems within 0.5 OOM of the largest run to date
N_SYSTEMS = []              # all systems released up to and including the year

for year in years:
    filtered_df = DATA_f1[DATA_f1['Publication_Bin']<=year] #training runs that have taken place up to $year ($year included)
    log_compute_to_date = filtered_df['log10 Training compute (FLOP)']

    largest_run_exact = log_compute_to_date.max()
    largest_run_to_date = round(largest_run_exact,2) #rounded for reporting only
    largest_run_index = log_compute_to_date.idxmax()
    largest_run_name = filtered_df['System'].loc[largest_run_index]
    LARGEST_RUNS.append((largest_run_name,largest_run_to_date))
    N_SYSTEMS.append(len(filtered_df))

    # Distance of every system to the largest run. The unrounded maximum is used:
    # comparing against the value rounded to 2 decimals could move a system that
    # sits close to the 0.5 / 1 OOM boundary to the wrong side of it.
    gap_to_largest_run = largest_run_exact - log_compute_to_date

    frontier_filtering_condition_1 = gap_to_largest_run < frontier_depth_1
    N_FRONTIER_SYSTEMS_1.append(int(frontier_filtering_condition_1.sum()))

    frontier_filtering_condition_05 = gap_to_largest_run < frontier_depth_05
    N_FRONTIER_SYSTEMS_05.append(int(frontier_filtering_condition_05.sum()))


# The largest run is always within the frontier of itself; do not count it.
N_FRONTIER_SYSTEMS_1=np.array(N_FRONTIER_SYSTEMS_1)-1
N_FRONTIER_SYSTEMS_05 = np.array(N_FRONTIER_SYSTEMS_05)-1

# Summary table, one row per year, built in a single step.
DF = pd.DataFrame({
    'Year': years,
    'Name': LARGEST_RUNS,
    '0.5 OOM': N_FRONTIER_SYSTEMS_05,
    '1 OOM': N_FRONTIER_SYSTEMS_1,
})


## Task 0 - Cumulative model counts for various thresholds

In [None]:
#prep

DATA = pd.read_csv(csv_path)
DATA['Publication date'] = pd.to_datetime(DATA['Publication date'])

# Keep only the systems that report a training compute.
DATA_ = DATA.dropna(subset=['Training compute (FLOP)'])


# Keep systems published after the first day of `start_year`.
start_year = 2017
DATA_f1 = DATA_[DATA_['Publication date'] > f'{start_year}-01-01']

# Remove the listed systems. `.copy()` makes the result an independent frame so
# the column assignments below do not write into a slice (avoids pandas'
# SettingWithCopyWarning).
SYSTEMS_TO_REMOVE = ['AlphaGo Zero','AlphaGo Master']
DATA_f1 = DATA_f1[~DATA_f1['System'].isin(SYSTEMS_TO_REMOVE)].copy()

# Publication bin, computed from the filtered rows themselves.
bin_type = 'year'
if bin_type=='year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(year_bin)
elif bin_type=='half year':
    DATA_f1['Publication_Bin'] = DATA_f1['Publication date'].apply(half_year_bin)
else:
    raise ValueError(f"bin_type must be 'year' or 'half year', got {bin_type!r}")

# Training compute in orders of magnitude.
DATA_f1['log10 Training compute (FLOP)'] = np.log10(DATA_f1['Training compute (FLOP)'])


In [None]:
# When True, the 2024 increment is scaled up to stand in for a full year.
# The code adds twice the increment on top of the observed one, i.e. it triples
# it - presumably the data covers about a third of 2024; confirm.
DOUBLE_2024 = True

# Bins in ascending order. Sorting (instead of reversing the order in which the
# bins appear in the CSV) guarantees that the entry before 2024 in plot_data is
# the previous year, whatever the row order of the file.
years = np.array(sorted(DATA_f1['Publication_Bin'].unique()))
years_str = [str(year) for year in years]
thresholds = [23,24]

fig,ax=plt.subplots(figsize=(8,6))
LARGEST_RUNS=[]   # largest log10 compute up to each year, rounded to 1 decimal

# threshold -> array with the cumulative number of systems at or above it, per year
PLOT_DATA = {key: None for key in thresholds}

for idx,threshold in enumerate(thresholds):
    plot_data = []
    for year in years:
        date_condition = DATA_f1['Publication_Bin'] <= year
        date_filtered_df = DATA_f1[date_condition]
        largest_run = date_filtered_df['log10 Training compute (FLOP)'].max()
        if idx==0:LARGEST_RUNS.append(round(largest_run,1)) #only once, not once per threshold

        threshold_condition = DATA_f1['log10 Training compute (FLOP)'] >= threshold
        filtered_df = DATA_f1[date_condition & threshold_condition]

        data_point = len(filtered_df)
        if year==2024 and DOUBLE_2024:
            # Increment contributed by 2024 alone. If 2024 is the first bin there is
            # no earlier total to subtract (this used to raise an IndexError).
            previous_total = plot_data[-1] if plot_data else 0
            exceeding_threshold_2024 = data_point-previous_total
            data_point = data_point + 2*exceeding_threshold_2024 #to get a full year of 2024 data out

        plot_data.append(data_point)

    PLOT_DATA[threshold] = np.array(plot_data)


# Grouped bars: the two thresholds are shifted by -0.1 / +0.1 around each year.
ax.grid(alpha=0.4)
ax.bar(years-0.1,PLOT_DATA[23], width=0.5, label='Threshold: 10^23 FLOPs',color='tab:blue',alpha=0.8)
ax.bar(years+0.1,PLOT_DATA[24], width=0.5, label='Threshold: 10^24 FLOPS',color='tab:orange',alpha=0.8)
ax.set_yticks(np.arange(0,130,10))
ax.legend()
fig.suptitle('Number of models exceeding thresholds')
#custom_xticks = [f'{year} \n {run}' for year,run in zip(years,LARGEST_RUNS)]
#ax.set_xticklabels(custom_xticks,fontsize=10)


# tight_layout sets the spacing itself; the former subplots_adjust(wspace=1)
# call had no effect on this single-axes figure and is dropped.
plt.tight_layout()

    


In [None]:
# DOUBLE_2024: scale the increment of the last (partial) year up to a full year.
# REMOVE_2024: drop the last year from the fit instead. The two are mutually exclusive.
DOUBLE_2024 = True
REMOVE_2024 = False

##threshold
threshold=23

## extrapolating
# Ascending order, so that "the last entry" below is the most recent year.
years_arr = np.sort(np.array(years))

start_year=2025
end_year=2029
future_years = np.arange(start_year,end_year+1)

# Cumulative number of systems at or above the threshold, recomputed from
# DATA_f1. PLOT_DATA is not reused here: the plotting cell above may already
# have scaled its 2024 entry, and scaling it again below would apply the
# adjustment twice.
exceeds_threshold = DATA_f1['log10 Training compute (FLOP)'] >= threshold
raw_counts = np.array([
    int(((DATA_f1['Publication_Bin'] <= year) & exceeds_threshold).sum())
    for year in years_arr
])

##we go from the first year in which number of systems that exceeds is greater than 0
nonzero_idxs = raw_counts != 0
counts_filtered = raw_counts[nonzero_idxs]
years_filtered = years_arr[nonzero_idxs]
if len(counts_filtered) == 0:
    raise ValueError(f'no system reaches the 10^{threshold} FLOP threshold')


if DOUBLE_2024:
    assert not REMOVE_2024
    # Models of the last year alone; the years dropped above all had a count of 0.
    previous_total = counts_filtered[-2] if len(counts_filtered) > 1 else 0
    exceeding_threshold_24 = counts_filtered[-1] - previous_total #how many models released so far in 2024 that exceed threshold
    # Adds twice the increment on top of the observed one, i.e. triples it -
    # presumably the data covers about a third of the year; confirm.
    counts_filtered[-1] = counts_filtered[-1] + 2*exceeding_threshold_24
    print(counts_filtered)

if REMOVE_2024:
    counts_filtered = counts_filtered[:-1]
    years_filtered = years_filtered[:-1]

# Geometric series anchored at the first year with a non-zero count.
# NOTE(review): this replaces the notebook-level geometric_model (anchored at
# 2017) and overwrites popt_geometric / geometric_pred from the model-count fit.
X_0 = years_filtered[0]
def geometric_model(x,a,r):
    """Evaluate ``a * r ** (x - X_0)``, with ``X_0`` read from the enclosing cell."""
    return a*r**(x-X_0)


popt_geometric,_ = curve_fit(geometric_model,years_filtered,counts_filtered)
geometric_pred = (geometric_model(future_years,*popt_geometric)).astype('int') #truncated to whole models


# Observed cumulative counts (blue) next to the extrapolation (red).
fig,ax=plt.subplots()
ax.bar(years_filtered,counts_filtered,color='tab:blue')
ax.bar(future_years,geometric_pred,color='tab:red',alpha=0.8)
ax.grid(alpha=0.5)
ax.tick_params(axis='x',labelsize=12)
ax.tick_params(axis='y',labelsize=12)
ax.set_title(f'Models exceeding 10^{threshold} threshold',fontsize=15)
ax.set_xlabel('Years',fontsize=15)
ax.set_yscale('linear')

## ARIMA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.linear_model import LinearRegression #for linear regression
from scipy.optimize import curve_fit #for exponential fit
import statsmodels

# Location of the systems CSV loaded by the next cell; same value as assigned at
# the top of the notebook.
# NOTE(review): absolute path into one user's home directory; it has to be edited
# before the notebook can run anywhere else.
csv_path = '/Users/iyngkarrankumar/Documents/Misc/Tracking models/data/all_systems.csv'

In [None]:
# Load the systems table and parse the publication dates.
DATA = pd.read_csv(csv_path)
DATA['Publication date'] = pd.to_datetime(DATA['Publication date'])

# Keep only the systems that report a training compute.
DATA = DATA.dropna(subset=['Training compute (FLOP)'])

# Keep systems published after the first day of `start_year`. `.copy()` makes
# the result an independent frame so the column assignments below do not write
# into a slice (avoids pandas' SettingWithCopyWarning).
start_year = 2017
DATA = DATA[DATA['Publication date'] > f'{start_year}-01-01'].copy()

# Publication bin.
bin_type = 'year'
if bin_type=='year':
    DATA['Publication_Bin'] = DATA['Publication date'].apply(year_bin)
elif bin_type=='half year':
    DATA['Publication_Bin'] = DATA['Publication date'].apply(half_year_bin)
else:
    raise ValueError(f"bin_type must be 'year' or 'half year', got {bin_type!r}")

# Training compute in orders of magnitude.
DATA['log10 Training compute (FLOP)'] = np.log10(DATA['Training compute (FLOP)'])


## Latent variables

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Location of the systems CSV loaded by the next cell; same value as assigned at
# the top of the notebook.
# NOTE(review): absolute path into one user's home directory; it has to be edited
# before the notebook can run anywhere else.
csv_path = '/Users/iyngkarrankumar/Documents/Misc/Tracking models/data/all_systems.csv'

In [None]:
# Load the systems table and parse the publication dates. Rows without a
# training compute are kept in this section.
DATA = pd.read_csv(csv_path)
DATA['Publication date'] = pd.to_datetime(DATA['Publication date'])
# Keep systems published after the first day of `start_year`. `.copy()` makes
# the result an independent frame so the column assignment below does not write
# into a slice (avoids pandas' SettingWithCopyWarning).
start_year = 2017
DATA = DATA[DATA['Publication date'] > f'{start_year}-01-01'].copy()

# Publication bin.
bin_type = 'year'
if bin_type=='year':
    DATA['Publication_Bin'] = DATA['Publication date'].apply(year_bin)
elif bin_type=='half year':
    DATA['Publication_Bin'] = DATA['Publication date'].apply(half_year_bin)
else:
    raise ValueError(f"bin_type must be 'year' or 'half year', got {bin_type!r}")

In [1]:

# When True, the last bin (the partial year 2024) is left out of the statistics.
REMOVE_2024 = True

years = sorted(DATA['Publication_Bin'].unique())
if REMOVE_2024:
    years = years[:-1]

STORE = {year:None for year in years}   # year -> {organisation: number of releases}

MODEL_COUNTS = []             # systems released per year
N_DEVELOPERS = []             # distinct organisations per year
TOP_N = 10
TOP_N_MED_RELEASE_FREQ = []   # median release count among the TOP_N most active organisations
MEAN_RELEASE_FREQ = []        # mean release count over all organisations

for year in years:
    DATA_year_filtered = DATA[DATA['Publication_Bin']==year]
    MODEL_COUNTS.append(len(DATA_year_filtered))

    # 'Organization' may list several comma-separated names. Rows without an
    # organisation are skipped (a missing value has no .split), and the names are
    # stripped so that 'A, B' and 'B' count as the same organisation 'B'.
    ORGANISATION_year = list(DATA_year_filtered['Organization'].dropna())
    split_list = [
        name.strip()
        for entry in ORGANISATION_year
        for name in str(entry).split(',')
        if name.strip()
    ]
    organisation_frequency = dict(Counter(split_list))
    STORE[year] = organisation_frequency
    N_DEVELOPERS.append(len(organisation_frequency.keys()))

    release_counts_sorted = np.sort(list(organisation_frequency.values()))
    if release_counts_sorted.size:
        TOP_N_MED_RELEASE_FREQ.append(np.int32(np.median(release_counts_sorted[-TOP_N:])))
        MEAN_RELEASE_FREQ.append(np.mean(release_counts_sorted))
    else:
        # No organisation recorded for this year: median and mean are undefined.
        TOP_N_MED_RELEASE_FREQ.append(np.int32(0))
        MEAN_RELEASE_FREQ.append(np.nan)


# Three stacked panels sharing the year axis.
fig,axs = plt.subplots(nrows=3,sharex=True)
axs[0].scatter(years,MODEL_COUNTS)
axs[0].set_ylabel('Models')
axs[1].scatter(years,N_DEVELOPERS)
axs[1].set_ylabel('Developers')
axs[2].scatter(years,TOP_N_MED_RELEASE_FREQ)
axs[2].set_ylabel(f'Median releases\n(top {TOP_N})')
axs[2].set_xlabel('Year')