In [None]:
!pip install transformers datasets accelerate more-itertools unidiff optuna

In [None]:
# Mount Google Drive at /content/drive. The project sources, tokenizer, models
# and datasets used by the cells below are all read from paths under
# /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch.cuda

# Report every CUDA device PyTorch can see. When there is none, say so
# explicitly instead of printing nothing: the model-loading cell silently
# falls back to the CPU in that case.
device_count = torch.cuda.device_count()
if device_count == 0:
  print("no CUDA devices available")

for i in range(device_count):
  print("device {0}: {1}".format(i, torch.cuda.get_device_name(i)))
  print(" capability: {0}".format(torch.cuda.get_device_capability(i)))
  print(" properties: {0}".format(torch.cuda.get_device_properties(i)))

In [None]:
import sys
import pandas as pd
import time
# Make the project's modules on the mounted Drive importable. These appends
# must run before `import test_runner` below.
# NOTE(review): presumably test_runner lives in one of these two directories
# -- confirm against the Drive layout.
sys.path.append('/content/drive/MyDrive/laredo/llm-research/src')
sys.path.append('/content/drive/MyDrive/laredo/llm-research/src/eval')
# test_runner provides eval_dataset(), which eval_datasets() calls.
import test_runner

def summarize(df):
  """Display score statistics for an evaluation frame and plot the mean scores.

  Output is produced with IPython's ``display`` and a matplotlib bar chart;
  nothing is returned.

  Shown, in order:
    1. ``describe()`` of ``score`` grouped by ``name``.
    2. ``describe()`` of ``score`` grouped by ``name`` and ``location``.
    3. A table of mean scores: one ``global`` row (mean per ``name``) followed
       by one row per ``location``, with one column per ``name``.
    4. A bar chart of that table.

  Args:
    df: DataFrame containing the columns ``name``, ``location``, ``score``
      and ``eval_idx`` (the columns read here). The caller's frame is not
      modified.
  """
  import matplotlib.pyplot as plt
  import seaborn as sns

  sns.set_style('dark')

  percentiles = [0.01, 0.05, 0.1, .25, .75, .9, .95, .99]

  display("Overall eval results")
  df = df.sort_values(by=['eval_idx'])

  display(df.groupby(['name'])['score'].describe(percentiles=percentiles))

  # Lift pandas' row/column truncation so the per-location table is shown in full.
  with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 400):
    display("Eval results grouped by test case location.")
    display(df.groupby(['name', 'location'])['score'].describe(percentiles=percentiles))

  mean_global = df.groupby(['name'])['score'].mean().to_frame('global').transpose()
  mean_by_loc = pd.pivot_table(df, values='score', index='location', columns=['name'])

  table = pd.concat([mean_global, mean_by_loc])

  display(table)

  # The size is passed to the plot call itself: changing rcParams after the
  # figure exists does not resize it, so the chart used to be drawn at the
  # default size on the first run.
  ax = table.plot(kind='bar', legend=True, figsize=(15, 15))
  ax.set_title("Mean by evalset and test location")
  ax.set_xlabel("location")
  ax.set_ylabel("mean tokens")
  plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='center')


def diff_frames(a, b, df, rows=10):
  """Display the largest and smallest per-test-case score differences.

  Rows of ``df`` whose ``name`` starts with ``a`` or with ``b`` are kept and
  ordered by ``eval_idx`` first, then by test-case key. Within each
  (``commit_hash``, ``location``, ``instance``) group the difference between
  consecutive scores is computed, so it reads "later eval minus earlier eval".
  Rows without a predecessor in their group have no difference and are dropped.

  Args:
    a: Prefix of the first evaluation name.
    b: Prefix of the second evaluation name.
    df: DataFrame with the columns ``name``, ``eval_idx``, ``commit_hash``,
      ``location``, ``instance`` and ``score``.
    rows: Number of rows to show from each end of the ranking.

  Returns nothing; the head and tail of the ranking are shown with ``display``.
  """
  key_cols = ['commit_hash', 'location', 'instance']

  starts_with_a = df.name.str.startswith(a)
  starts_with_b = df.name.str.startswith(b)
  selected = df[starts_with_a | starts_with_b]

  ordered = selected.sort_values(by=['eval_idx'] + key_cols).reset_index(drop=True)
  ordered['diff'] = ordered.groupby(key_cols)['score'].diff()

  ranking = ordered[key_cols + ['score', 'diff']].dropna()
  ranking = ranking.sort_values(by='diff', ascending=False)

  with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 100):
    for part in (ranking.head(rows), ranking.tail(rows)):
      display(part)


def eval_datasets(tuples, split="validation", data_root="/content/drive/MyDrive/laredo/data"):
  """Evaluate several (model, dataset) pairs and stack the results in one frame.

  Each entry is evaluated with ``test_runner.eval_dataset`` using the
  module-level ``tokenizer``, which therefore has to be defined before this
  function is called.

  Args:
    tuples: Iterable of ``(model, label, dataset_name)`` entries. Only the
      first three items of each entry are read.
    split: Dataset split handed to ``test_runner.eval_dataset``.
    data_root: Directory that contains the datasets; ``dataset_name`` is
      appended to it to form the dataset path.

  Returns:
    The concatenation of the per-entry results (row indices are kept as
    returned by the runner), with two columns added: ``name``
    (``"<label> - <dataset_name>"``) and ``eval_idx`` (position of the entry
    in ``tuples``).

  Raises:
    ValueError: If ``tuples`` is empty.
  """
  frames = []
  for idx, t in enumerate(tuples):
    model, label, dataset_name = t[0], t[1], t[2]
    dataset_path = f"{data_root}/{dataset_name}"

    print(f"Starting evaluation of {label} with {dataset_path}[{split}]")

    # NOTE(review): the runner's result is assumed to be a DataFrame, since
    # columns are assigned on it and it is passed to pd.concat -- confirm.
    df = test_runner.eval_dataset(model, tokenizer, dataset_path, split)
    df['name'] = f"{label} - {dataset_name}"
    df['eval_idx'] = idx
    frames.append(df)

  if not frames:
    # pd.concat would raise a ValueError as well, but with a message that does
    # not point at the real problem.
    raise ValueError("eval_datasets needs at least one (model, label, dataset_name) entry")

  return pd.concat(frames)

In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch

# Location of the fine-tuned PySnooper checkpoint on the mounted Drive.
model_path = "/content/drive/MyDrive/laredo/models/PySnooper"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = GPT2TokenizerFast.from_pretrained("/content/drive/MyDrive/laredo/tokenizer")

# The attention flags are handed to from_pretrained() so they are part of the
# configuration the model is built from.
# NOTE(review): GPT-2 attention modules appear to read these flags when they
# are constructed, so setting them with config.update() after loading would
# not reach the layers -- confirm against the installed transformers version.
codeparrot_model = GPT2LMHeadModel.from_pretrained(
    "lvwerra/codeparrot-small",
    reorder_and_upcast_attn=True,
    scale_attn_by_inverse_layer_idx=True,
)
codeparrot_model = codeparrot_model.to(device)
# Match the embedding matrix to the size of the project tokenizer loaded above.
codeparrot_model.resize_token_embeddings(len(tokenizer))

pysnooper_model = GPT2LMHeadModel.from_pretrained(model_path)
pysnooper_model = pysnooper_model.to(device)

background_model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/laredo/models/background-20220330")
background_model = background_model.to(device)

# Evaluate the baseline and the fine-tuned model on the same PySnooper dataset
# so that their scores can be compared test case by test case.
df = eval_datasets(
    [(codeparrot_model, 'CodeParrot Model', 'PySnooper'),
     (pysnooper_model, 'PySnooper Model', 'PySnooper')]
)




In [None]:
# Show score statistics and the mean-score bar chart for the evaluation frame.
summarize(df)

In [None]:
# Show the test cases with the largest and smallest score differences between
# the two evaluations whose names start with the given prefixes.
diff_frames("CodeParrot Model", "PySnooper Model", df)  