In [None]:
!pip install pwlf #for colab

In [None]:
import importlib
import time

# (module path, alias) pairs for every dependency whose import time is measured.
modules = [
    ('numpy', 'np'),
    ('scipy.stats', 'stats'),
    ('scipy.optimize', 'optimize'),
    ('matplotlib.pyplot', 'plt'),
    ('pandas', 'pd'),
    ('seaborn', 'sns'),
    ('itertools', 'itertools'),
    ('copy', 'copy'),
    ('re', 're'),
    ('pdb', 'pdb'),
    ('logging', 'logging')
]

# Diagnostic: import each module, bind it under its alias, and report how long
# the import took.
for module, alias in modules:
    # perf_counter is monotonic, so the difference is a valid elapsed time even
    # if the wall clock is adjusted mid-import.
    start = time.perf_counter()
    # import_module avoids exec() on a constructed string; for dotted paths it
    # returns the submodule itself, matching `import a.b as alias`.
    globals()[alias] = importlib.import_module(module)
    end = time.perf_counter()
    print(f"{module}: {end - start:.4f} seconds")

In [8]:
import numpy as np
from scipy import stats, optimize
import matplotlib.pyplot as plt
import pandas as pd #taking long to load here
import seaborn as sns
import itertools
import copy,re, pdb, logging

# Root logging configuration: INFO and above, with a second-resolution
# timestamp and the level name in front of every message.
_log_settings = {
    "level": logging.INFO,
    "format": '%(asctime)s - %(levelname)s - %(message)s',
    "datefmt": '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**_log_settings)

# Logger named after the current module.
logger = logging.getLogger(__name__)


In [9]:
# Alternative source, currently disabled:
# df = pd.read_csv("https://epochai.org/data/epochdb/notable_systems.csv")

# Turn the Google Drive "share" link into a direct-download link: the file id
# is the second-to-last path component of the share URL.
share_url = 'https://drive.google.com/file/d/1RLLKPU3bEYK65wlQlU0p20u9M8cHkLMl/view?usp=sharing'
url = 'https://drive.google.com/uc?id=' + share_url.split('/')[-2]

raw_df = pd.read_csv(url)

# Keep only rows that have a value in "Notability criteria".
notable_df = raw_df[~raw_df["Notability criteria"].isna()]

# Short column name -> source column. The dict order is the final column order.
column_map = {
    "model": "System",
    "compute": "Training compute (FLOP)",
    "date": "Publication date",
    "cost": "Training compute cost (2023 USD)",
    "poss1e23": "Possibly over 1e23 FLOP",
    "poss1e25": "Estimated over 1e25 FLOP",
}

# Build a fresh frame instead of adding columns to a filtered slice; this keeps
# the original row index and avoids SettingWithCopy issues.
df = pd.DataFrame({short: notable_df[source] for short, source in column_map.items()})

# Costs arrive as text such as "$1,234"; strip the separators before converting.
# regex=False makes the match literal: "$" is a regex anchor, and pandas'
# default for `regex` has changed between versions.
df["cost"] = (
    df["cost"]
    .str.replace(",", "", regex=False)
    .str.replace("$", "", regex=False)
    .astype(float)
)

In [10]:
# Models excluded from the analysis as outliers.
to_remove = ['AlphaGo Zero', 'AlphaZero']
is_outlier = df["model"].isin(to_remove)
df = df.loc[~is_outlier]

In [11]:
# Extra models to add when the dataset does not already list them. Values
# follow df's column order: model, compute, date, cost, poss1e23, poss1e25.
to_append = [
  ["Claude 3.5 Sonnet", 4.3e25, "2024-06-21", np.nan, np.nan, np.nan],
  ["GPT-4o Mini", 1.2e25, "2024-07-18", np.nan, np.nan, np.nan],
]

# Earlier row filtering leaves gaps in the index, so the label len(df) may
# already belong to an existing row; `df.loc[len(df)] = row` would then
# overwrite that row instead of appending. Renumbering to 0..n-1 guarantees
# that len(df) is always an unused label.
df = df.reset_index(drop=True)

for row in to_append:
  if row[0] not in df["model"].values:
    df.loc[len(df)] = row

In [None]:
# "Claude 2.1" reuses Claude 2's compute as a rough guess. Take a scalar (NaN
# when Claude 2 is absent) rather than an array, so the assignment below is
# well-defined for any number of matching rows.
claude_2_compute = df.loc[df["model"] == "Claude 2", "compute"]

# Manual compute estimates for models whose compute is missing in the dataset.
to_add_compute = {
    "Claude 3 Opus": 2.5e25,
    "Claude 3 Sonnet": 1.1e25,
    "GPT-4o": 2.9e25,
    "Gemini 1.0 Pro": 2.8e24,
    "Gemini 1.5 Pro": 1.9e25,
    "Reka Core": 8.4e24,
    "GPT-4 Turbo": 2.1e25,  # rough guess
    "GPT-4V": 2.1e25,  # rough guess
    "Claude 2.1": claude_2_compute.iloc[0] if not claude_2_compute.empty else np.nan,  # rough guess
}

logger.info('Can add more recent models here')


for k, v in to_add_compute.items():
  is_model = df["model"] == k
  if not is_model.any():
    # Truth-testing an empty selection used to fall through to the
    # "already has a compute value" branch, which was misleading.
    logger.warning(f"{k} not found in dataset; skipping")
    continue
  # Only fill rows that are actually missing a value; explicit masks avoid
  # truth-testing a NumPy array, which is ambiguous for 0 or >1 matches.
  needs_compute = is_model & df["compute"].isna()
  if needs_compute.any():
    df.loc[needs_compute, "compute"] = v
  else:
    print(f"{k} already has a compute value")

In [13]:
# Reset the ones we've set: rows that now have a compute value get their
# "possibly over 1e23 / 1e25" flags cleared.
has_compute = df["compute"].notna()
df.loc[has_compute, ["poss1e23", "poss1e25"]] = np.nan

# Set some temporary placeholder values
# TODO: revisit
# df.loc[(df["poss1e25"] == "checked"), "compute"] = 1.01e25  # placeholder
# df.loc[((df["poss1e23"] =="checked") & (df["poss1e25"] != "checked")), "compute"] = 1.01e23  # placeholder

# We want to handle these leading models manually via the above compute estimates.
unhandled_1e25 = df[(df["poss1e25"] == "checked") & (df["compute"].isna())]
assert unhandled_1e25.empty, "models flagged over 1e25 FLOP still lack a manual compute estimate"

# We sample 1e23-1e25 models with unknown compute from the existing empirical distribution.
# TODO: revisit
poss1e23 = ((df["poss1e23"] == "checked") & (df["poss1e25"] != "checked"))
known_mid_range = df.loc[(df["compute"] >= 1e23) & (df["compute"] < 1e25), "compute"]
n_unknown = int(poss1e23.sum())
if n_unknown:
    # Sample without replacement whenever the pool is large enough (same draws
    # as before); fall back to replacement only when unknowns outnumber the
    # known values, which would otherwise raise.
    df.loc[poss1e23, "compute"] = known_mid_range.sample(
        n_unknown, replace=n_unknown > len(known_mid_range), random_state=0
    ).values

df["date"] = pd.to_datetime(df["date"])
df["log_compute"] = np.log10(df["compute"])

# Fractional year. Months are 1-based, so subtract 1: January maps to the year
# itself and December to year + 11/12 (previously December mapped to year + 1).
df["date_float"] = df["date"].dt.year + (df["date"].dt.month - 1) / 12

df['year'] = df['date'].dt.year

# Chronological order, keeping only rows that ended up with a compute value.
df = df.sort_values("date").dropna(subset=["compute"])

In [None]:
# Scatter of training compute over time (log y-axis) with a log-linear trend
# line and two illustrative noisy projections for 2025-2029.
historical_data = df[df['date'] > '2010-01-01']

fig, ax = plt.subplots()
sns.scatterplot(data=historical_data, x='date', y='compute', ax=ax)
ax.set(yscale='log')
ax.grid(alpha=0.5)

# Add line of best fit for historical data.
# Seconds since the Unix epoch, computed by timedelta division so the result
# does not depend on the datetime64 resolution of the column.
x = (historical_data['date'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
y = historical_data['compute']
z = np.polyfit(x, np.log(y), 1)  # straight line in log-space == exponential trend
p = np.poly1d(z)
ax.plot(historical_data['date'], np.exp(p(x)), 'b--', alpha=0.8)

future_dates = pd.date_range(start='2025-01-01', end='2029-12-31', periods=200)
base = 1e25  # Starting point based on 2024 level
# Seeded generator so the figure is identical on every run.
rng = np.random.default_rng(0)
# NOTE(review): with a standard deviation of 10, `1 + noise` is negative for
# roughly half of the draws; those points cannot be shown on a log axis.
noise = rng.normal(0, 10, len(future_dates))
years_from_2025 = (future_dates.year - 2025)

# (exponential growth rate per year, colour, legend label) for each scenario;
# both scenarios share the same noise draws.
scenarios = [
    (3.0, 'red', 'Projected - business as usual'),
    (0.4, 'green', 'Projected - inference scaling'),
]
for growth_rate, color, label in scenarios:
    future_compute = base * np.exp(growth_rate * years_from_2025) * (1 + noise)
    ax.scatter(future_dates, future_compute, alpha=0.3, color=color, label=label)

ax.legend()
ax.set_xlim([pd.Timestamp('2020-01-01'), pd.Timestamp('2030-01-01')])
ax.set_xlabel('Publication date')
ax.set_ylabel('Training compute (FLOP)')

# Reference lines at 1e25 ... 1e30.
for exponent in range(25, 31):
    ax.axhline(y=10**exponent, color='gray', linestyle='--', alpha=0.6)



In [None]:
FLOP_dollar=2e25/100e6 #FLOP per dollar conversion ~2023 (GPT-4 was ~2e25 FLOP for estimated $1e8)

# Estimated dollar cost per model, computed on the same filtered rows that are
# plotted (instead of passing a Series from the unfiltered frame and relying
# on index alignment inside seaborn).
cost_data = df.loc[df['date'] > '2010-01-01', ['date', 'compute']].copy()
cost_data['est_cost'] = (1 / FLOP_dollar) * cost_data['compute']

fig, ax = plt.subplots()
sns.scatterplot(data=cost_data, x='date', y='est_cost', ax=ax)
ax.set(yscale='log')
ax.grid(alpha=0.5)

# Dollar reference levels for scale: (value, legend label, colour).
reference_levels = [
    (1e14, 'World GDP', 'red'),
    (30e12, 'US GDP', 'orange'),
    (40e9, 'Meta R&D budget 2023', 'green'),
    (100e6, 'GPT-4 training cost (est)', 'purple'),
]
for level, label, color in reference_levels:
    ax.axhline(y=level, label=label, color=color, linestyle='--', alpha=0.8)

# Add future projections
future_dates = pd.date_range(start='2024-01-01', end='2029-12-31', periods=500)
base = (1/FLOP_dollar) * 2e25  # Starting point based on 2024 level
# Seeded generator so the figure is identical on every run.
rng = np.random.default_rng(0)
# NOTE(review): with a standard deviation of 10, `1 + noise` is negative for
# roughly half of the draws; those points cannot be shown on a log axis.
noise = rng.normal(0, 10, len(future_dates))
years_from_2024 = (future_dates.year - 2024)

growth_rate = 3.0  # Exponential growth rate
future_costs = base * np.exp(growth_rate * years_from_2024) * (1 + noise)
ax.scatter(future_dates, future_costs, alpha=0.3, color='red', label='Projected - business as usual')

# Disabled alternative scenario:
#growth_rate = 0.4
#future_costs = base * np.exp(growth_rate * years_from_2024) * (1 + noise)
#plt.scatter(future_dates, future_costs, alpha=0.3, color='green', label='Projected - inference scaling')

ax.legend()
ax.set_xlim([pd.Timestamp('2020-01-01'), pd.Timestamp('2030-01-01')])
ax.set_ylabel("Training compute cost ($)")

In [None]:


# Share of each year's total compute held by each compute quantile group.
# Group 0 is the lowest-compute group, Group (group_param - 1) the highest.
year_filter=[2020,2021,2022,2023]
group_param=5
group_labels = [f'Group {i}' for i in range(group_param)]
# Float dtype so the table can be handed straight to plotting functions.
table = pd.DataFrame(index=group_labels, columns=year_filter, dtype=float)

for year in year_filter:
    year_compute = df.loc[df['date'].dt.year == year, 'compute']
    if year_compute.empty:
        # No models for this year: leave its column as NaN.
        continue
    print(f"\nYear: {year}")
    # Bin on ranks rather than raw values: tied compute values would otherwise
    # produce duplicate bin edges and make qcut raise. For distinct values the
    # group membership is the same as binning the values directly.
    grouped_data = pd.qcut(year_compute.rank(method='first'), q=group_param, labels=False)
    year_total = year_compute.sum()
    for group in range(group_param):
        group_share = year_compute[grouped_data == group].sum() / year_total * 100
        table.loc[f'Group {group}', year] = group_share
        print(f"Group {group}: {group_share:.1f}% of total compute")

# Plot pie chart of latest year's data
latest_year = max(year_filter)
latest_data = table[latest_year]
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(latest_data, labels=group_labels, autopct='%1.1f%%')
ax.set_title(f'Share of Total Compute by Group ({latest_year})')



        
    



    


In [None]:
from sklearn.linear_model import LinearRegression

# FLOP bought per dollar (2e25 FLOP for $100M) and its inverse, dollars per FLOP.
FLOP_dollar_2024 = 2e25/100e6
dollar_FLOP_2024 = 1/FLOP_dollar_2024

# Total compute per publication year, for models published after 2010-01-01.
recent_df = df[df['date'] > '2010-01-01']
year_grouped_df = recent_df.groupby(recent_df['date'].dt.year)
aggregate_compute = year_grouped_df['compute'].sum()
aggregate_compute_cost = aggregate_compute * dollar_FLOP_2024
log_aggregate_compute = np.log10(aggregate_compute)
log_aggregate_compute_cost = np.log10(aggregate_compute_cost)

# Take the years from the aggregated index so x and y are guaranteed to be in
# the same order.
years = aggregate_compute.index.to_numpy()

# Plot historical data
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(years, log_aggregate_compute_cost, label='Historical data')

# Fit exponential for extrapolation: linear regression of log10(cost) on year.
x = years.reshape(-1, 1)
y = log_aggregate_compute_cost.values
reg = LinearRegression().fit(x, y)

# Generate future years for extrapolation: last observed year through 2029.
# x.max() yields a scalar; the builtin max() on a 2-D array returns a 1-element
# array, which np.arange only accepts via a deprecated implicit conversion.
future_years = np.arange(x.max(), 2030).reshape(-1, 1)

# Get predictions
future_predictions = reg.predict(future_years)

# Plot extrapolation
ax.plot(future_years, future_predictions, '--', label='Extrapolation')
ax.set_xlabel('Year')
# The plotted quantity is log10 of the dollar cost, not of the raw compute.
ax.set_ylabel('Log10(Total Compute Cost, USD)')
ax.legend()
ax.grid(True)

In [None]:
# Price a total of 1e30 FLOP at the dollars-per-FLOP rate defined earlier.
total_compute_2028 = 1e30
cost_2024 = dollar_FLOP_2024 * total_compute_2028
print(f"With 2024 FLOP/dollar costs, the cost of {total_compute_2028} FLOP is approx {cost_2024/1e12:,.2f} trillion USD")

# Candidate ways the total could be split across models:
#   case 1 - ~9 models with 1e29, 100 models with 1e27
#   case 2 - ~10000 models with 1e26, 0 models above that
#   case 3 - 1 model 1e29, 10 models 1e28, 100 models 1e27, 1000 models 1e26 etc.

# One panel per year, in two separate 2x2 figures; the flattened axes arrays
# are indexed by position in years_to_iter.
years_to_iter = [2020, 2021, 2022, 2023]

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
axs_ravel = axs.ravel()

kde_fig, kde_axs = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
kde_axs_ravel = kde_axs.ravel()

def percentage_formatter(x, pos):
    """Format a fractional tick value as a percentage label.

    The axis this formatter is attached to shows cumulative compute divided
    by the yearly total, i.e. a fraction. The value is therefore scaled by
    100 before the percent sign is appended (previously 1.0 was rendered as
    "1.000000%").

    Args:
        x: Tick value, as a fraction of the total.
        pos: Tick position supplied by matplotlib; unused.

    Returns:
        The percentage as a compact string, e.g. ``0.5`` -> ``"50%"``.
    """
    return f'{100 * x:g}%'



# For each year draw two panels:
#   * cumulative compute vs. individual model compute, both normalised by the
#     year's total (log-log);
#   * a Gaussian KDE of log10(normalised model compute).
for idx, year in enumerate(years_to_iter):
    ax, kde_ax = axs_ravel[idx], kde_axs_ravel[idx]
    ax.set_title(f'Year: {year}')
    kde_ax.set_title(f'Year: {year}')

    total_compute = aggregate_compute[aggregate_compute.index == year].values
    if total_compute.size == 0:
        # No aggregate for this year: leave both panels empty instead of
        # failing on total_compute[0].
        continue
    datapoints_year = df[df['date'].dt.year == year]['compute']

    # prep data: ascending model computes and their running sum, both
    # normalised by the year's total
    sorted_computes = np.sort(datapoints_year)
    norm_factor = total_compute[0]
    norm_sorted_computes = sorted_computes / norm_factor
    norm_cumsum = np.cumsum(sorted_computes) / norm_factor

    # T-m plot
    ax.plot(norm_sorted_computes, norm_cumsum)
    ax.scatter(norm_sorted_computes, norm_cumsum, alpha=0.5, color='blue', s=30, marker='x')

    ax.grid(True, alpha=0.3)
    ax.set_xscale('log'); ax.set_yscale('log')
    #ax.set_xlim([1e18,1e27])
    ax.set_xlabel('individual model size'); ax.set_ylabel('Total training compute')
    ax.text(0.05, 0.95, f'Total compute: {total_compute[0]:.2e} FLOP',
            transform=ax.transAxes, verticalalignment='top')
    ax.axhline(y=norm_cumsum[-1], color='r', linestyle='--')
    # x == 1 marks a single model as large as the whole year's total.
    ax.axvline(x=1, color='g', linestyle='--', alpha=0.5)
    ax.text(1, ax.get_ylim()[0], f'{norm_factor:.2e}',
            rotation=90, fontsize=8, verticalalignment='top')
    ax.yaxis.set_major_formatter(percentage_formatter)

    # KDE plot
    kde_ax.set_xscale('log')
    kde_ax.grid(alpha=0.5)
    log_norm_computes = np.log10(norm_sorted_computes)
    # gaussian_kde needs at least two points that are not all identical.
    if log_norm_computes.size >= 2 and np.ptp(log_norm_computes) > 0:
        kde = stats.gaussian_kde(log_norm_computes)
        x_range = np.logspace(log_norm_computes.min(), np.log10(1))
        kde_ax.plot(x_range, kde(np.log10(x_range)))

    kde_ax.axvline(x=1, color='g', linestyle='--', alpha=0.5)
    # Anchor the label to the KDE panel's own y-limits (the T-m panel's limits
    # were used here before, which misplaced the text).
    kde_ax.text(1, kde_ax.get_ylim()[0], f'{norm_factor:.2e}',
                rotation=90, fontsize=8, verticalalignment='top')
    if idx >= 2: kde_ax.set_xlabel('Model compute (normalised by total)')

fig.tight_layout(pad=2.0)
kde_fig.tight_layout(pad=2.0)

In [None]:
# Enumerate every multiset of N values drawn from the integers a..b whose
# elements sum to T.
# NOTE(review): a, b and T look like log10 compute exponents and their target
# total — confirm the intended meaning.
T=245
N=10
a,b=23,26

# Generate all possible integer combinations between log_a and log_b
possible_values = np.arange(a, b+1).astype(float)
all_combinations = list(itertools.combinations_with_replacement(possible_values, N))

# Filter combinations that sum to target (values are integer-valued floats, so
# the equality test is exact).
valid_combinations = [combo for combo in all_combinations if np.sum(combo) == T]

# One row per valid combination. The reshape keeps the (n_valid, N) layout even
# when nothing matches; a bare np.array([]) would collapse to shape (0,).
valid_distributions = np.array(valid_combinations, dtype=float).reshape(-1, N)

print(valid_distributions)