In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression


# Seed NumPy's legacy global random state once, up front, from a single named constant.
random_seed = 42
np.random.seed(seed=random_seed)

In [None]:
def half_year_bin(date):
    """Label a date with the calendar half-year it falls in.

    Parameters
    ----------
    date : object with integer ``month`` and ``year`` attributes
        For example a ``datetime.date`` or a ``pandas.Timestamp``.

    Returns
    -------
    str
        ``'<year>-H1'`` when the month is 6 or lower, ``'<year>-H2'`` otherwise.
    """
    # Provenance note kept from the original: this helper was ChatGPT generated.
    half_label = 'H1' if date.month <= 6 else 'H2'
    return f'{date.year}-{half_label}'

def year_bin(date):
    """Return the ``year`` attribute of ``date``, used as a yearly bin label."""
    publication_year = date.year
    return publication_year

In [None]:
# --- Load the systems table and tag every row with a publication bin ---
# NOTE(review): absolute, machine-specific path; consider a configurable data
# directory so the notebook runs on other machines.
csv_path = '/Users/iyngkarrankumar/Documents/Misc/Tracking models/data/all_systems.csv'
start_year = 2017   # earliest publication year kept in the analysis
bin_type = 'year'   # 'year' or 'half year'

DATA = pd.read_csv(csv_path)
DATA['Publication date'] = pd.to_datetime(DATA['Publication date'])

# Keep systems published from 1 January of start_year onwards. The comparison is
# inclusive so that rows dated exactly on 1 January are not dropped.
# .copy() makes the filtered frame independent, so adding the bin column below
# does not write into a slice of the unfiltered table (SettingWithCopyWarning).
DATA = DATA[DATA['Publication date'] >= f'{start_year}-01-01'].copy()

if bin_type == 'year':
    DATA['Publication bin'] = DATA['Publication date'].apply(year_bin)
elif bin_type == 'half year':
    DATA['Publication bin'] = DATA['Publication date'].apply(half_year_bin)
else:
    # Fail here rather than later with a KeyError on the missing 'Publication bin' column.
    raise ValueError(f"Unknown bin_type {bin_type!r}; expected 'year' or 'half year'")

In [None]:
# Keep only systems with a reported training-compute value and attach its
# base-10 logarithm as a new column.
# .assign returns a new frame, so no column is written into the result of dropna,
# which pandas can flag with SettingWithCopyWarning.
compute_DATA = (
    DATA
    .dropna(subset=['Training compute (FLOP)'])
    .assign(**{'log10 Training compute': lambda df: np.log10(df['Training compute (FLOP)'])})
)

### FLOP breakdown

- Takeaway from the `BIN_fractions` dataframe below: the frontier (highest-compute) bin and the bin immediately below it (together the top four OOMs) account for essentially all of the aggregate compute

In [None]:
# Aggregate training compute per publication bin, broken down by log10(FLOP) bin.
#
# Produces:
#   AGGREGATE_FLOP       - list, total FLOP of all binned training runs per publication bin
#   log10_AGGREGATE_FLOP - base-10 log of AGGREGATE_FLOP
#   BIN_fractions        - DataFrame (rows: publication bins, columns: FLOP bins) holding
#                          the percentage of that bin's aggregate FLOP in each FLOP bin

#booleans
REMOVE_2024 = False  # if True, drop the last publication bin (assumed to be 2024)
DOUBLE_2024 = True #just assume second half of 2024 is a copy of the first half of 2024

years = sorted(DATA['Publication bin'].unique())

if REMOVE_2024:
    years.pop()

# log10(FLOP) bins, each FLOP_step orders of magnitude wide: [15, 17), [17, 19), ...
min_FLOP = 15
max_FLOP = 26
FLOP_step = 2
FLOP_bin_low = np.arange(min_FLOP, max_FLOP, FLOP_step)
FLOP_bin_high = FLOP_bin_low + FLOP_step  # upper edge follows FLOP_step, not a hard-coded width
FLOP_bins = list(zip(FLOP_bin_low, FLOP_bin_high))

#initialising data structs
AGGREGATE_FLOP = []
BIN_fractions = pd.DataFrame(columns=[str(f_bin) for f_bin in FLOP_bins])


for year in years:
    year_filtered_df = compute_DATA[compute_DATA['Publication bin'] == year]
    year_log_compute = year_filtered_df['log10 Training compute']

    YEAR_AGGREGATE_FLOP = []
    for bin_low, bin_high in FLOP_bins:
        # Half-open interval [low, high): a run whose log10 compute sits exactly on
        # an edge is counted once, in the bin that starts at that edge.
        in_bin = (year_log_compute >= bin_low) & (year_log_compute < bin_high)

        # total FLOP of all training runs in year Y and bin B
        year_bin_aggregate_FLOP = year_filtered_df.loc[in_bin, 'Training compute (FLOP)'].sum()
        YEAR_AGGREGATE_FLOP.append(year_bin_aggregate_FLOP)

    # Double the 2024 entries exactly once, after every bin has been filled.
    # The comparison against the integer 2024 only matches yearly bins.
    if year == 2024 and DOUBLE_2024:
        YEAR_AGGREGATE_FLOP = [2 * elem for elem in YEAR_AGGREGATE_FLOP]

    year_aggregate_FLOP = sum(YEAR_AGGREGATE_FLOP) #total FLOP of all runs in year Y

    # Percentage share of each FLOP bin in the year's aggregate, to 2 decimals.
    BIN_fractions.loc[year] = np.round((np.asarray(YEAR_AGGREGATE_FLOP, dtype=float) / year_aggregate_FLOP) * 100, 2)
    AGGREGATE_FLOP.append(year_aggregate_FLOP)

log10_AGGREGATE_FLOP = np.log10(AGGREGATE_FLOP)

# The plotted values are log10 of the aggregate, so the label states the exponent form.
plt.plot(years, log10_AGGREGATE_FLOP)
plt.xlabel('Publication bin')
plt.ylabel('Aggregate training FLOP ($10^X$)')

In [None]:
# Extrapolate log10 aggregate FLOP with a straight-line fit over the observed years.
end_year = 2028

# Predictions start at 2024 when 2024 was removed from the observations, else at 2025.
first_future_year = 2024 if REMOVE_2024 else 2025
future_years = np.arange(first_future_year, end_year + 1)

# One aggregate value per observed year is required for the fit.
assert len(years) == len(log10_AGGREGATE_FLOP)

# scikit-learn expects 2-D feature arrays: one column holding the year.
years_arr = np.asarray(years).reshape(-1, 1)
future_years_arr = future_years[:, np.newaxis]

model = LinearRegression().fit(years_arr, log10_AGGREGATE_FLOP)
aggregate_FLOP_predictions = model.predict(future_years_arr)

In [None]:
# Scatter the observed log10 aggregates and the regression predictions on one axis.
fig, ax = plt.subplots()

for xs, ys, series_label in (
    (years, log10_AGGREGATE_FLOP, 'Observed'),
    (future_years, aggregate_FLOP_predictions, 'Predictions'),
):
    ax.scatter(xs, ys, label=series_label)

ax.set(ylabel='Aggregate training FLOP ($10^X$)')
ax.legend()