# Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression #for linear regression
from scipy.optimize import curve_fit #for exponential fit

# Seed NumPy's global RNG once so every np.random call in this notebook is reproducible.
random_seed = 42
np.random.seed(seed=random_seed)

In [None]:
def half_year_bin(date):
    """Return a half-year label for *date*, e.g. '2023-H1' or '2023-H2'.

    Months 1-6 map to 'H1' and months 7-12 map to 'H2'. *date* is any object
    exposing ``year`` and ``month`` attributes.
    """
    #CHATGPT generated
    half = 'H1' if date.month <= 6 else 'H2'
    return f'{date.year}-{half}'

def year_bin(date):
    """Return the calendar year of *date*, used as the yearly bin label."""
    return date.year

In [None]:
# Load the systems table, keep recent rows, and label each row with a publication bin.
# NOTE(review): absolute, machine-specific path -- adjust to wherever all_systems.csv lives.
csv_path = '/Users/iyngkarrankumar/Documents/Misc/Tracking models/data/all_systems.csv'
DATA = pd.read_csv(csv_path)
start_year = 2017
DATA['Publication date'] = pd.to_datetime(DATA['Publication date'])
# Keep rows published strictly after 1 January of start_year.
# .copy() makes DATA an independent frame, so adding the bin column below is not a
# chained assignment on a slice (which triggers SettingWithCopyWarning).
# NOTE(review): the strict '>' drops rows dated exactly on 1 January -- confirm this is intended.
DATA = DATA[DATA['Publication date'] > f'{start_year}-01-01'].copy() #data filtering

# Granularity of the 'Publication bin' column: 'year' or 'half year'.
bin_type = 'year'
if bin_type=='year':
    DATA['Publication bin'] = DATA['Publication date'].apply(year_bin)
elif bin_type=='half year':
    DATA['Publication bin'] = DATA['Publication date'].apply(half_year_bin)
else:
    # Fail here instead of with a KeyError on 'Publication bin' further down.
    raise ValueError(f"Unknown bin_type {bin_type!r}; expected 'year' or 'half year'")

# Extrapolating the number of models

In [None]:
def geometric_model(x,a,r):
    """Geometric growth curve: ``a`` at x == 2017, multiplied by ``r`` per unit of x.

    The signature is (x, a, r) because curve_fit treats every argument after
    the first as a parameter to fit.
    """
    steps_since_base = x - 2017
    return a * r**steps_since_base

In [None]:
# True: keep the last bin and double its count; False: drop the last bin before fitting.
DOUBLE_2024 = False

# Bin labels and the number of models per bin, both in ascending bin order.
years = sorted(DATA['Publication bin'].unique())
model_counts = list(DATA['Publication bin'].value_counts().sort_index())

if DOUBLE_2024:
    print(years,model_counts)
    model_counts[-1] *= 2
else:
    years.pop()
    model_counts.pop() #remove 2024 count

# Forecast starts at 2024 when the last bin was dropped, otherwise at 2025.
future_years = np.arange(2025 if DOUBLE_2024 else 2024, 2030)

years = np.array(years)
model_counts = np.array(model_counts)

#Linear extrapolation
linear_model = LinearRegression().fit(years.reshape(-1,1), model_counts)
linear_pred = linear_model.predict(future_years.reshape(-1,1))

#Polynomial extrapolation
degree = 2
coefficients = np.polyfit(years, model_counts, deg=degree)
polynomial = np.poly1d(coefficients)
poly_pred = polynomial(future_years)


#geometric series
popt_geometric, _ = curve_fit(geometric_model, xdata=years, ydata=model_counts)
geometric_pred = geometric_model(future_years, *popt_geometric)


# Compute distributions with missing training-compute data

In [None]:
# Split DATA by whether training compute is recorded.
# .copy() gives compute_DATA its own storage, so adding the log10 column below is not a
# chained assignment on a view of DATA (which triggers SettingWithCopyWarning).
compute_DATA = DATA.dropna(subset=['Training compute (FLOP)']).copy() #knocks out ~400 models
nan_compute_DATA = DATA[DATA['Training compute (FLOP)'].isna()]

# Work in log10(FLOP) so compute can be binned by order of magnitude.
compute_DATA['log10 Training compute'] = np.log10(compute_DATA['Training compute (FLOP)'])

In [None]:
# For every publication bin, count the known-compute models per log10-compute bin and
# count the models whose compute is missing. The result is stored in YEAR_DATA_STORE as
#   {year: {'Bin frequency': [count per bin], 'nan number': missing-compute count}}.

#find counts
years = list(reversed(DATA['Publication bin'].unique()))
bin_range = (15,27)
bins=np.arange(bin_range[0],bin_range[-1],1)
bins_nudged = bins+1
bin_ranges = list(zip(bins,bins_nudged))  # (lower, upper) edge pair of each unit-wide bin
bin_edges = np.arange(bin_range[0],bin_range[-1]+1,1)  # len(bin_ranges)+1 consecutive edges

YEAR_DATA_STORE = {}

for year in years:
    compute_year_filtered = compute_DATA[compute_DATA['Publication bin']==year]
    log_compute = compute_year_filtered['log10 Training compute'].to_numpy()

    # np.histogram bins are half-open [lower, upper), with the last bin closed, so a value
    # sitting exactly on an integer edge (e.g. 1e21 FLOP -> 21.0) is counted once.
    # Strict '>' and '<' comparisons on both edges would drop such values from every bin.
    bin_frequency = np.histogram(log_compute, bins=bin_edges)[0]

    year_nan_number = (nan_compute_DATA['Publication bin']==year).sum()

    YEAR_DATA_STORE[year] = {
        'Bin frequency': bin_frequency.tolist(),
        'nan number': year_nan_number,
    }

In [None]:
# Assign each year's missing-compute models to compute bins by sampling from that year's
# empirical distribution of known-compute bin counts; store the result as 'New counts'.
for year in years:
    DATA_STORE = YEAR_DATA_STORE[year]
    compute_bin_counts = np.array(DATA_STORE['Bin frequency'])
    nan_number = DATA_STORE['nan number']

    total_known = compute_bin_counts.sum()
    if total_known > 0:
        #defining probability distribution
        norm_const = 1/total_known
        prob_dist = norm_const * compute_bin_counts

        samples = np.random.choice(a=len(bin_ranges),size=nan_number,p=prob_dist)

        #new counts: original counts plus how many samples landed in each bin
        new_compute_bin_counts = compute_bin_counts + np.bincount(samples,minlength=compute_bin_counts.size)
    else:
        # No known-compute model in range for this year: there is no distribution to
        # sample from (1/0 would give NaN probabilities), so leave the counts unchanged.
        new_compute_bin_counts = compute_bin_counts.copy()

    # DATA_STORE is the same dict object held in YEAR_DATA_STORE, so this updates it in place.
    DATA_STORE['New counts'] = new_compute_bin_counts

In [None]:
##### PLOT
# One histogram of log10 training compute per publication bin, stacked vertically
# with a shared x-axis. Only rows of compute_DATA (known compute) are plotted.

years = list(reversed(DATA['Publication bin'].unique()))
fig,axs = plt.subplots(nrows=len(years), ncols=1,figsize=(8,12),sharex=True)
bin_range = (15,28)
bins=np.arange(bin_range[0],bin_range[-1],1)  # bin edges 15, 16, ..., 27

for idx,year in enumerate(years):
    ax = axs[idx]
    filtered_df = compute_DATA[compute_DATA['Publication bin']==year] #year df
    # NOTE(review): bins is an explicit edge sequence, so `range` presumably has no effect here -- confirm.
    filtered_df['log10 Training compute'].plot(kind='hist',bins=bins,range=bin_range,edgecolor='black',ax=ax)
    # Clear per-axes labels; shared figure-level labels are added after the loop.
    ax.set_xlabel('');ax.set_ylabel('')
    ax.set_xlim(bin_range)
    ax.tick_params(axis='y',labelsize=12)
    # Integer-only y ticks are requested for the first subplot only.
    if idx==0:     ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

    # n is the number of rows with known compute in this bin.
    ax.set_title(f'Year {year}, n={len(filtered_df)}',fontsize=15)
    ax.grid(alpha=0.5)



# Figure-level axis labels, placed slightly outside the [0, 1] figure coordinates.
fig.text(0.5, -0.04, 'Log Compute ($10^X$)', ha='center', fontsize=15)
fig.text(-0.04, 0.5, 'Frequency', va='center', rotation='vertical', fontsize=15)
plt.xticks(bins,fontsize=15)
# NOTE(review): tight_layout runs after subplots_adjust and may override hspace -- confirm the hspace call is needed.
plt.subplots_adjust(hspace=10)
plt.tight_layout(rect=[0.04, 0.04, 1, 1])


# Combining for prediction