In [38]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
import pandas as pd 
import os 
import numpy as np
import torch 

from law import ScalingLaw, MultiObjScalingLaw

# Functions

In [40]:
def load_break_losses(dirs, slice_list, task_name):
    """Load the per-skill losses of Skill-It "break" runs into one DataFrame.

    Every entry of each directory in ``dirs`` is a candidate run directory.
    A candidate is kept only when its full path contains ``task_name``, every
    name in ``slice_list``, ``"break"`` and ``"skillit"``, does not contain
    ``".log"``, and is a directory. Inside a kept run directory every file is
    unpickled unless its name contains ``"test_"`` or one of the markers in
    ``files_to_exclude``.

    Args:
        dirs: Iterable of directory paths to scan.
        slice_list: Names that must all appear in a run directory's path.
        task_name: Substring that must appear in a run directory's path.

    Returns:
        A DataFrame indexed by ``checkpoint`` and sorted by
        ``(checkpoint, seed)``. The pickled columns ``task_idx``/``task_loss``
        are renamed to ``skill``/``loss`` and the columns ``method`` (run
        directory name), ``seed`` and ``break_steps`` are added. An empty
        DataFrame is returned when no file matches.

    Raises:
        ValueError: If a kept name does not encode an integer after
            ``"break_"``, ``"seed_"`` or the last ``"-"``.
    """
    files_to_exclude = ['trace', 'val_inputs', 'labels', 'proportions', 'emb', 'rouge', 'generations', 'gradient', 'acc', 'matrices']
    frames = []

    for directory in dirs:
        candidates = [os.path.join(directory, entry) for entry in os.listdir(directory)]
        # Newest first; the final sort is stable, so this only decides the
        # order of rows that tie on (checkpoint, seed).
        candidates.sort(key=os.path.getmtime, reverse=True)

        for run_dir in candidates:
            # All name filters run before any parsing so that unrelated
            # entries are skipped instead of crashing the int() parse below.
            if ".log" in run_dir or task_name not in run_dir:
                continue
            if any(skill not in run_dir for skill in slice_list):
                continue
            if "break" not in run_dir or "skillit" not in run_dir:
                continue
            if not os.path.isdir(run_dir):
                continue

            break_steps = int(run_dir.split("break_")[-1].split("_")[0])
            method = os.path.basename(run_dir)

            for run in os.listdir(run_dir):
                if "test_" in run:
                    continue
                if any(marker in run for marker in files_to_exclude):
                    continue

                seed = int(run.split("seed_")[-1].split("_")[0])
                checkpoint = int(run.split("-")[-1].split(".")[0])

                # NOTE: read_pickle executes pickle code; only point this at
                # run directories you trust.
                df = pd.read_pickle(os.path.join(run_dir, run))
                df = df.rename(columns={"task_idx": "skill", "task_loss": "loss"})
                df = df.assign(
                    method=method,
                    seed=seed,
                    checkpoint=checkpoint,
                    break_steps=break_steps,
                )
                frames.append(df.set_index("checkpoint"))

    if not frames:
        # Nothing matched: sorting an empty frame by "checkpoint" would raise.
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames).sort_values(by=["checkpoint", "seed"])


In [41]:
def load_resume_losses(dirs, slice_list, task_name):
    """Load the per-skill losses of Skill-It "resume" runs into one DataFrame.

    Every entry of each directory in ``dirs`` is a candidate run directory.
    A candidate is kept only when its full path contains ``task_name``, every
    name in ``slice_list``, ``"resume"`` and ``"skillit"``, does not contain
    ``".log"``, and is a directory. Inside a kept run directory every file is
    unpickled unless its name contains ``"test_"`` or one of the markers in
    ``files_to_exclude``.

    The run directory name must encode the break step right after
    ``"resume_skillit_"`` and exactly three mixture weights back-to-back after
    ``"weights_"`` (e.g. ``weights_0.30.30.4``). The weights are recovered by
    splitting on ``"0."``, so each one has to be written as ``0.<digits>``.

    Args:
        dirs: Iterable of directory paths to scan.
        slice_list: Names that must all appear in a run directory's path.
        task_name: Substring that must appear in a run directory's path.

    Returns:
        A DataFrame indexed by ``checkpoint`` and sorted by
        ``(checkpoint, new_p1, seed)``. The pickled columns
        ``task_idx``/``task_loss`` are renamed to ``skill``/``loss`` and the
        columns ``method`` (run directory name), ``seed``, ``break_steps`` and
        ``new_p1``/``new_p2``/``new_p3`` are added. An empty DataFrame is
        returned when no file matches.

    Raises:
        ValueError: If a kept name does not encode the expected integers or
            exactly three weights.
    """
    files_to_exclude = ['trace', 'val_inputs', 'labels', 'proportions', 'emb', 'rouge', 'generations', 'gradient', 'acc', 'matrices']
    frames = []

    for directory in dirs:
        candidates = [os.path.join(directory, entry) for entry in os.listdir(directory)]
        # Newest first; the final sort is stable, so this only decides the
        # order of rows that tie on every sort key.
        candidates.sort(key=os.path.getmtime, reverse=True)

        for run_dir in candidates:
            if ".log" in run_dir or task_name not in run_dir:
                continue
            if any(skill not in run_dir for skill in slice_list):
                continue
            if "resume" not in run_dir or "skillit" not in run_dir:
                continue
            if not os.path.isdir(run_dir):
                continue

            method = os.path.basename(run_dir)

            # "weights_0.30.30.4_..." -> "0.30.30.4" -> ["3", "3", "4"] -> 0.3, 0.3, 0.4
            new_weight_str = method.split("weights_")[-1].split("_")[0]
            new_a, new_b, new_c = [float(f"0.{weight}") for weight in new_weight_str.split("0.")[1:]]

            break_steps = int(run_dir.split("resume_skillit_")[-1].split("_")[0])

            for run in os.listdir(run_dir):
                if "test_" in run:
                    continue
                if any(marker in run for marker in files_to_exclude):
                    continue

                seed = int(run.split("seed_")[-1].split("_")[0])
                checkpoint = int(run.split("-")[-1].split(".")[0])

                # NOTE: read_pickle executes pickle code; only point this at
                # run directories you trust.
                df = pd.read_pickle(os.path.join(run_dir, run))
                df = df.rename(columns={"task_idx": "skill", "task_loss": "loss"})
                df = df.assign(
                    method=method,
                    seed=seed,
                    checkpoint=checkpoint,
                    break_steps=break_steps,
                    new_p1=new_a,
                    new_p2=new_b,
                    new_p3=new_c,
                )
                frames.append(df.set_index("checkpoint"))

    if not frames:
        # Nothing matched: sorting an empty frame by "checkpoint" would raise.
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames).sort_values(by=["checkpoint", "new_p1", "seed"])


In [42]:
def load_skillit_matrices(slice_list, seed, task_name):
    """Load the normalized Skill-It matrix saved for one seed.

    The file is read from ``../skillit_graphs/`` (relative to the current
    working directory); its name is built from ``task_name``, the
    underscore-joined ``slice_list`` and ``seed``.

    Returns:
        Whatever ``np.load`` returns for that ``.npy`` file.
    """
    slices_tag = '_'.join(slice_list)
    matrix_path = f"../skillit_graphs/{task_name}_{slices_tag}_normalized_seed_{seed}.npy"
    return np.load(matrix_path)

In [43]:
def calculate_r_squared(actuals, predictions):
    """Return the coefficient of determination of ``predictions`` vs ``actuals``.

    Both arguments must expose ``.numpy()`` (e.g. CPU torch tensors that do
    not require grad). The result is ``1 - SS_res / SS_tot``, where ``SS_tot``
    is measured around the mean of ``actuals``.
    """
    observed = actuals.numpy()
    fitted = predictions.numpy()

    # Spread of the observations around their own mean.
    ss_total = ((observed - observed.mean()) ** 2).sum()
    # Spread that the predictions fail to explain.
    ss_residual = ((observed - fitted) ** 2).sum()

    return 1 - (ss_residual / ss_total)


In [45]:
def mixing_law_full(x, param):
    """Linear mixing law: return ``torch.matmul(x, param)``.

    One parameter vector is fitted per skill by the caller.
    """
    return torch.matmul(x, param)

def init_params_law_full(idx, num_domains=3, num_inits=30):
    """Yield random initial parameter vectors for fitting ``mixing_law_full``.

    Each vector has ``num_domains`` entries: position ``idx`` is drawn as
    ``-np.random.rand()`` (in ``(-1, 0]``) and every other position as
    ``np.random.rand() * 0.1`` (in ``[0, 0.1)``).

    Args:
        idx: Position that receives the negative initial value.
        num_domains: Length of each parameter vector.
        num_inits: Number of vectors to yield (30 by default, matching the
            previously hard-coded count).

    Yields:
        A list of ``num_domains`` floats. Draws come from NumPy's global
        random state, so seed it beforehand for reproducible fits.
    """
    for _ in range(num_inits):
        yield [
            -np.random.rand() if i == idx else np.random.rand() * 0.1
            for i in range(num_domains)
        ]

In [47]:
def make_individual_xy_full(df_break, df_resume, skill, break_steps, seed):
    """Build the regression inputs/targets for one skill, break step and seed.

    Both frames are filtered on ``break_steps``, ``seed`` and ``skill`` and
    then reduced to the rows at their largest index value.

    Returns:
        ``(x, y)`` where ``x`` holds the ``new_p1``/``new_p2``/``new_p3``
        values of the selected resume rows and ``y`` is their ``loss`` minus
        the ``loss`` of the selected break row(s).
    """

    def _rows_at_last_index(frame):
        # Keep the matching rows, then only those at the largest index label.
        keep = (
            (frame["break_steps"] == break_steps)
            & (frame["seed"] == seed)
            & (frame["skill"] == skill)
        )
        matching = frame[keep]
        return matching.loc[matching.index.max()]

    break_final = _rows_at_last_index(df_break)
    resume_final = _rows_at_last_index(df_resume)

    baseline_loss = break_final.loss
    features = resume_final[['new_p1', 'new_p2', 'new_p3']].values
    targets = resume_final['loss'].values - baseline_loss
    return features, targets

In [59]:
def make_xy_joint(df_break, df_resume, A, break_steps, seed):
    """Build joint regression inputs/targets for one break step and seed.

    Both frames are filtered on ``break_steps`` and ``seed`` and reduced to
    the rows at their largest index value.

    Returns:
        ``(x, y)``. ``x`` is ``A.T`` applied to the distinct normalized
        mixtures of the resume rows, scaled row-wise by the break losses
        (one column per distinct mixture). ``y`` is the resume losses
        reshaped to three per row, minus the break losses.
    """
    break_rows = df_break[(df_break["break_steps"] == break_steps) & (df_break["seed"] == seed)]
    break_final = break_rows.loc[break_rows.index.max()]

    resume_rows = df_resume[(df_resume["break_steps"] == break_steps) & (df_resume["seed"] == seed)]
    resume_final = resume_rows.loc[resume_rows.index.max()]

    mixture_cols = ['new_p1_normalized', 'new_p2_normalized', 'new_p3_normalized']
    mixtures = resume_final[mixture_cols].drop_duplicates(keep='first').values
    # A is transposed because all of the skill-it runs have transposed the A
    # matrix (the original skill-it code was kept as is).
    transferred = A.T.dot(mixtures.T)

    # Repeat the break losses once per mixture column so they scale each row.
    break_losses = break_final['loss'].values
    baseline_grid = np.tile(break_losses, reps=(transferred.shape[1], 1)).T
    features = np.multiply(baseline_grid, transferred)

    # Three losses per resumed run; the break losses broadcast across rows.
    targets = resume_final['loss'].values.reshape(-1, 3) - break_final.loss.values
    return features, targets

In [50]:
def law_1(x, param):
    """Scale the first entry of ``x`` by the single coefficient ``param[0]``."""
    return param[0] * x[0]

def law_2(x, param):
    """Scale the second entry of ``x`` by the single coefficient ``param[0]``."""
    return param[0] * x[1]

def law_3(x, param):
    """Scale the third entry of ``x`` by the single coefficient ``param[0]``."""
    return param[0] * x[2]


def param_generator_joint():
    """Yield one-element initial guesses ``[b]`` for the shared coefficient.

    The five guesses are evenly spaced from -10 to 0 (inclusive).
    """
    yield from ([b] for b in np.linspace(-10, 0, 5))

# Arxiv Book Stackexchange

In [51]:
# Run-output directories to scan. REPLACE WITH YOUR RUN OUTPUT DIRECTORIES
dirs = [
    "../output/09272024/",
    "../output/09262024/",
]
task_name = "slimpj"
slice_list = ["arxiv", "book", "stackexchange"]

# Losses recorded by the "break" runs, indexed by checkpoint.
df_break = load_break_losses(dirs, slice_list, task_name)

In [52]:
# Distinct skill names found in the break runs, in sorted order.
skills = sorted(df_break["skill"].unique())

In [53]:
df_resume = load_resume_losses(dirs, skills, task_name)

# Normalise the three resumed-mixture weights so that they sum to one per row.
# Plain column arithmetic performs the same per-element float operations as
# the former row-wise DataFrame.apply(axis=1) calls, without the Python-level
# loop over every row.
weight_total = df_resume['new_p1'] + df_resume['new_p2'] + df_resume['new_p3']
df_resume['new_p1_normalized'] = df_resume['new_p1'] / weight_total
df_resume['new_p2_normalized'] = df_resume['new_p2'] / weight_total
df_resume['new_p3_normalized'] = df_resume['new_p3'] / weight_total

In [54]:
seeds = sorted(df_break.seed.unique())

# Only the break point at step 1000 is analysed in this notebook; the list of
# break steps found in the resume runs is kept solely to validate that choice
# (previously it was assigned to break_steps and immediately overwritten).
available_break_steps = sorted(df_resume.break_steps.unique())
break_steps = [1000]
assert all(step in available_break_steps for step in break_steps), (
    f"break_steps {break_steps} not all present in resume runs: {available_break_steps}"
)

## Get $A^{t \star}$ 

In [21]:
# Nested result containers keyed as [skill][break_step][seed].
params = {skill : {bs: {seed: {} for seed in seeds} for bs in break_steps} for skill in skills }
x_per_skill = {skill : {bs: {seed: {} for seed in seeds} for bs in break_steps} for skill in skills }
y_per_skill = {skill : {bs: {seed: {} for seed in seeds} for bs in break_steps} for skill in skills }

mses = []
r2s = []
for i, skill in enumerate(skills):
    for bs in break_steps:
        for seed in seeds:
            print(skill, bs, seed)
            x, y = make_individual_xy_full(df_break, df_resume, skill, bs, seed)

            x_per_skill[skill][bs][seed] = x
            y_per_skill[skill][bs][seed] = y

            # Fit one linear law per (skill, break step, seed); the initial
            # guesses put the negative entry at this skill's own position.
            law = ScalingLaw(mixing_law_full)
            p = law.fit(x, y, init_params_law_full(i, num_domains=len(skills)), max_step=100, delta=0.02)
            params[skill][bs][seed] = p # param

            # Training-set fit quality.
            prediction_train = mixing_law_full(torch.tensor(x, dtype=torch.float), torch.tensor(p, dtype=torch.float))
            mse_train = torch.nn.functional.mse_loss(prediction_train, torch.tensor(y, dtype=torch.float)).item()
            # prediction_train is already a tensor: wrapping it in
            # torch.tensor() again raises a UserWarning on every iteration.
            # detach() hands the same values to calculate_r_squared.
            r2_train = calculate_r_squared(torch.tensor(y), prediction_train.detach())

            mses.append(mse_train)
            r2s.append(r2_train)

            print(f"MSE: {mse_train}, R2: {r2_train}")


mses = np.array(mses)
r2s = np.array(r2s)

# Mean and standard deviation over every (skill, break step, seed) fit.
print(mses.mean(), mses.std())
print(r2s.mean(), r2s.std())


arxiv 1000 0


100%|██████████| 30/30 [00:02<00:00, 11.47it/s]


min loss: 0.0001222293358296156
optimal_param: tensor([-0.2545,  0.0382, -0.0666])
MSE: 0.0002700971672311425, R2: 0.9429931755700922
arxiv 1000 1


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 18.01it/s]


min loss: 5.9037811297457665e-05
optimal_param: tensor([-0.3140,  0.0789, -0.0502])
MSE: 0.00011809913848992437, R2: 0.9874185074514449
arxiv 1000 2


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 18.69it/s]


min loss: 9.362303535453975e-05
optimal_param: tensor([-0.2829,  0.0288, -0.0522])
MSE: 0.0002106520114466548, R2: 0.9658413612803263
arxiv 1000 3


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 18.25it/s]


min loss: 0.00011416756024118513
optimal_param: tensor([-0.4235, -0.0128, -0.1170])
MSE: 0.0002447843726258725, R2: 0.9738225407526638
arxiv 1000 4


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 18.44it/s]


min loss: 0.00019273950601927936
optimal_param: tensor([-0.3924,  0.0383, -0.1098])
MSE: 0.0005644520279020071, R2: 0.9448499495047843
book 1000 0


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 17.67it/s]


min loss: 0.0002081194834318012
optimal_param: tensor([ 0.0353, -0.2970, -0.0181])
MSE: 0.000556315528228879, R2: 0.9129804942733457
book 1000 1


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 18.72it/s]


min loss: 7.42196207283996e-05
optimal_param: tensor([ 0.0223, -0.2738,  0.0433])
MSE: 0.0001484392414567992, R2: 0.9801734216132441
book 1000 2


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 17.28it/s]


min loss: 9.882343874778599e-05
optimal_param: tensor([ 0.0544, -0.2720,  0.0536])
MSE: 0.00019994202011730522, R2: 0.9740782138848112
book 1000 3


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 18.70it/s]


min loss: 2.3274658815353177e-05
optimal_param: tensor([ 0.0292, -0.2874,  0.0097])
MSE: 4.6549317630706355e-05, R2: 0.9921175420864888
book 1000 4


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 16.40it/s]
  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))


min loss: 0.0004254251252859831
optimal_param: tensor([ 0.0597, -0.2962,  0.0106])
MSE: 0.001375290798023343, R2: 0.8199159035123161
stackexchange 1000 0


100%|██████████| 30/30 [00:01<00:00, 17.06it/s]


min loss: 0.00018710180302150548
optimal_param: tensor([ 0.0516,  0.0782, -0.2213])
MSE: 0.0005905625293962657, R2: 0.8868305452480219
stackexchange 1000 1


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:02<00:00, 14.01it/s]


min loss: 0.0001999284722842276
optimal_param: tensor([-0.0014,  0.0873, -0.2832])
MSE: 0.0004767045029439032, R2: 0.9401516801345832
stackexchange 1000 2


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 18.30it/s]


min loss: 0.00015331119357142597
optimal_param: tensor([ 0.0062,  0.0779, -0.2767])
MSE: 0.00035897427005693316, R2: 0.948159928237459
stackexchange 1000 3


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:02<00:00, 11.78it/s]


min loss: 0.0001651076163398102
optimal_param: tensor([-0.0355,  0.0522, -0.2519])
MSE: 0.000381678604753688, R2: 0.9345704259691378
stackexchange 1000 4


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))
100%|██████████| 30/30 [00:01<00:00, 17.25it/s]


min loss: 7.297917181858793e-05
optimal_param: tensor([-0.0190,  0.0638, -0.3009])
MSE: 0.0001464509405195713, R2: 0.9771432812766424
0.00037926616472153303 0.0003150922756394066
0.9454031313863575 0.043664815197527615


  r2_train = calculate_r_squared(torch.tensor(y), torch.tensor(prediction_train))


In [22]:
import pickle

# Persist the fitted per-skill parameters. The results directory is created
# first because open(..., "wb") fails when a parent directory is missing.
results_path = "./law_results/arxiv_books_stackexchange/params_skillit_trajectory_opt_1000.pkl"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "wb") as f:
    pickle.dump(params, f)

## Get $\tilde{A}^t = b^t A^t$

In [60]:
# One Skill-It matrix per seed, keyed by seed.
skillit_matrices = {
    seed: load_skillit_matrices(slice_list, seed, task_name)
    for seed in seeds
}

In [61]:
# Fitted parameters of the joint law, keyed as [break_step][seed].
params_skillit = {bs: {seed: {} for seed in seeds} for bs in break_steps}


# Regression inputs/targets kept for later inspection, keyed the same way.
x_per_skill_skillit = {bs: {seed: {} for seed in seeds} for bs in break_steps}
y_per_skill_skillit = {bs: {seed: {} for seed in seeds} for bs in break_steps}

for bs in break_steps:
    for seed in seeds:
        print(bs, seed)
        # x is built from this seed's Skill-It matrix; ys holds three loss
        # differences per resumed run (see make_xy_joint).
        x, ys = make_xy_joint(df_break, df_resume, skillit_matrices[seed], bs, seed)

        x_per_skill_skillit[bs][seed] = x
        y_per_skill_skillit[bs][seed] = ys

        # law_1/law_2/law_3 read x[0]/x[1]/x[2] and share the single coefficient
        # yielded by param_generator_joint. ys is transposed before fitting —
        # presumably so each row lines up with one law; confirm against
        # MultiObjScalingLaw.fit.
        law = MultiObjScalingLaw([law_1, law_2, law_3])
        p = law.fit(x, ys.T, param_generator_joint(), max_step=100, delta=0.02)
        params_skillit[bs][seed] = p # param

1000 0
workers: 96


100%|██████████| 5/5 [00:01<00:00,  3.01it/s]


min loss: 0.02710052113980055
optimal_param: tensor([-0.0275])
1000 1
workers: 96


100%|██████████| 5/5 [00:01<00:00,  4.15it/s]


min loss: 0.03443594090640545
optimal_param: tensor([-0.0276])
1000 2
workers: 96


100%|██████████| 5/5 [00:01<00:00,  4.04it/s]


min loss: 0.03130367677658796
optimal_param: tensor([-0.0283])
1000 3
workers: 96


100%|██████████| 5/5 [00:01<00:00,  4.16it/s]


min loss: 0.03555733431130648
optimal_param: tensor([-0.0414])
1000 4
workers: 96


100%|██████████| 5/5 [00:01<00:00,  4.06it/s]


min loss: 0.03412032313644886
optimal_param: tensor([-0.0439])


In [62]:
import pickle

# Persist the fitted joint-law parameters. The results directory is created
# first because open(..., "wb") fails when a parent directory is missing.
results_path = "./law_results/arxiv_books_stackexchange/params_skillit_trajectory_skillit_matrix_1000.pkl"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "wb") as f:
    pickle.dump(params_skillit, f)