In [1]:
#from huggingface_hub import login
#login()

In [2]:
from transformers import set_seed

# Seed value handed to transformers.set_seed so repeated runs start from the
# same RNG state. Note that load_dataset() further down has its own `seed`
# parameter, which is only used to pick a chunk file and is unrelated to this
# global.
seed = 42
set_seed(seed)

2024-09-25 12:39:38.166951: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-25 12:39:38.282559: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


2024-09-25 12:39:38.750452: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-09-25 12:39:38.750509: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [3]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig

# Quantization settings passed as `quantization_config=` when the model is loaded.
# NOTE(review): 8-bit loading is requested here while every other option belongs
# to the bnb_4bit_* family, which configures 4-bit loading. Confirm which mode is
# actually intended (the logged "all params" count looks like a non-4-bit load).
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapter hyper-parameters used by get_peft_model() in the training cell.
# NOTE(review): PEFT documents init_lora_weights=False as a random
# initialisation intended for debugging -- confirm it is wanted for real
# training runs.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    init_lora_weights=False,
)

In [None]:
from transformers import AutoTokenizer
from peft import prepare_model_for_kbit_training, get_peft_model
from transformers import DataCollatorForLanguageModeling

# Hub id of the base checkpoint. The training cell further down reassigns
# model_name to "TinyLlama/TinyLlama-1.1B-Chat-v1.0" before any visible call
# actually uses it.
model_name = "TinyLlama/TinyLlama_v1.1"

def init_tokenizer(model_name):
    """Load the tokenizer for ``model_name`` and set the function delimiters.

    The bos/eos token strings are replaced by ``<func>`` / ``</func>`` and the
    pad token is set to ``</s>``. The model name and the tokenizer (after the
    bos/eos change, before the pad token is assigned) are printed for
    inspection.

    NOTE(review): the tokenizer printed in the notebook output lists only
    ``<unk>``, ``<s>`` and ``</s>`` in ``added_tokens_decoder``, so the
    ``<func>`` markers appear to be encoded as ordinary text pieces rather than
    single special tokens -- confirm that this is intended.

    Args:
        model_name: Identifier or path accepted by
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The configured tokenizer.
    """
    print(model_name)
    tok = AutoTokenizer.from_pretrained(model_name)
    # Same assignment order as before: bos first, then eos.
    for attribute, marker in (("bos_token", "<func>"), ("eos_token", "</func>")):
        setattr(tok, attribute, marker)
    print(tok)
    tok.pad_token = "</s>"
    return tok

In [5]:
def init_model(model_name, tokenizer, bnb_config, lora_config):
    """Load a quantized causal LM and prepare it for k-bit training.

    Args:
        model_name: Identifier or path accepted by
            ``AutoModelForCausalLM.from_pretrained``.
        tokenizer: Tokenizer whose ``pad_token_id`` is copied into the model
            config.
        bnb_config: Quantization config forwarded as ``quantization_config``.
        lora_config: Accepted but not used in this function.
            NOTE(review): the training cell applies ``get_peft_model``
            separately -- confirm whether this helper was meant to do so too.

    Returns:
        The model returned by ``prepare_model_for_kbit_training`` (no LoRA
        adapter is attached here).
    """
    load_options = dict(
        device_map="auto",
        revision="main",
        quantization_config=bnb_config,
    )
    quantized = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    # The KV cache is switched off before the model is prepared for training.
    quantized.config.use_cache = False
    prepared = prepare_model_for_kbit_training(quantized)
    prepared.config.pad_token_id = tokenizer.pad_token_id
    return prepared


#model = init_model(model_name, tokenizer, bnb_config, lora_config)

In [6]:
def init_collator(tokenizer):
    """Build the batch collator for causal language modelling.

    Args:
        tokenizer: Tokenizer handed to the collator.

    Returns:
        A ``DataCollatorForLanguageModeling`` with masked-LM mode disabled
        (``mlm=False``).
    """
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return collator


In [7]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Element counts come from ``numel()`` of every entry yielded by
    ``model.named_parameters()``; an entry counts as trainable when its
    ``requires_grad`` flag is truthy. Nothing is returned.

    Raises:
        ZeroDivisionError: If the model exposes no parameter elements at all.
    """
    # One pass over the parameters, keeping (size, trainable-flag) pairs.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [8]:
def init_args():
    """Create the ``TrainingArguments`` used for every chunk.

    Returns:
        A ``TrainingArguments`` instance: one epoch, per-device batch size 1
        with 8 gradient-accumulation steps, fp16, gradient checkpointing,
        evaluation every 3200 steps, logging every 250 steps and no
        checkpoint saving by the Trainer itself.
    """
    options = {
        # Output locations. NOTE(review): the training cell saves models under
        # "./results/chunked/v.1.1/", which is spelled differently from this
        # directory -- confirm that is intended.
        "output_dir": "./results/chunked/v1.1",
        "logging_dir": "./logs",
        # Optimisation.
        "num_train_epochs": 1,
        "learning_rate": 1e-5,
        "weight_decay": 0.01,
        "optim": "paged_adamw_8bit",
        "per_device_train_batch_size": 1,
        "per_device_eval_batch_size": 1,
        "gradient_accumulation_steps": 8,
        # Memory / precision. (tf32=True was tried and is left disabled.)
        "gradient_checkpointing": True,
        "gradient_checkpointing_kwargs": {'use_reentrant': False},
        "fp16": True,
        "no_cuda": False,
        # Data loading.
        "dataloader_num_workers": 4,
        "label_names": ['input_ids'],
        # Logging, evaluation and saving. (save_steps=800 is left disabled
        # because save_strategy is "no".)
        "logging_strategy": "steps",
        "logging_steps": 250,
        "eval_strategy": "steps",
        "eval_steps": 3200,
        "save_strategy": "no",
    }
    return TrainingArguments(**options)

# TODO custom data loader?

In [9]:
from transformers import Trainer

def init_trainer(model, args, train, valid, tokenizer, data_collator):
    """Assemble a ``Trainer`` from the already-built components.

    Args:
        model: Model to train.
        args: ``TrainingArguments`` for the run.
        train: Dataset passed as ``train_dataset``.
        valid: Dataset passed as ``eval_dataset``.
        tokenizer: Tokenizer passed through to the Trainer.
        data_collator: Collator used to build batches.

    Returns:
        The constructed ``Trainer``.
    """
    components = dict(
        model=model,
        args=args,
        train_dataset=train,
        eval_dataset=valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    return Trainer(**components)


In [None]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc

def add_special_tokens(example):
    """Wrap ``example['body']`` in the tokenizer's bos/eos token strings.

    Reads the notebook-level ``tokenizer`` global, so it must be defined before
    this function is mapped over a dataset. The example is modified in place
    and also returned.
    """
    opening = tokenizer.bos_token
    body = example['body']
    closing = tokenizer.eos_token
    example['body'] = f"{opening} {body} {closing}"
    return example


def tokenize_function(examples):
    """Tokenize a batch of examples' ``body`` texts to fixed-length NumPy arrays.

    Reads the notebook-level ``tokenizer`` global. Every sequence is padded to
    the tokenizer's maximum length, and sequences that exceed it are truncated
    to that length. Without truncation, an over-long example would be left
    longer than the padded rows and the batch could not be converted to a
    rectangular NumPy array.

    Args:
        examples: Batch mapping with a ``'body'`` entry holding the texts.

    Returns:
        The tokenizer output for the batch (NumPy tensors).
    """
    return tokenizer(
        examples['body'],
        return_tensors="np",
        padding="max_length",
        truncation=True,
    )


def sample_fixed_per_group(df, n_samples, random_state=None):
    """Draw exactly ``n_samples`` rows from every ``language`` group of ``df``.

    Args:
        df: DataFrame with a ``language`` column.
        n_samples: Number of rows to draw from each group.
        random_state: Passed unchanged to ``DataFrame.sample`` for every group.

    Returns:
        A new DataFrame with the same columns as ``df`` (``language``
        included), groups in sorted key order, and a fresh 0..n-1 index.
        An empty frame with the same columns when ``df`` has no groups.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when a group holds
            fewer than ``n_samples`` rows.
    """
    # Iterate over the groups instead of using GroupBy.apply: newer pandas
    # releases no longer pass the grouping column to the applied function, and
    # the reset_index(drop=True) below would then discard `language` entirely.
    picked = [
        group.sample(n=n_samples, random_state=random_state)
        for _, group in df.groupby("language")
    ]
    if not picked:
        # pd.concat rejects an empty list; keep the column layout instead.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(picked).reset_index(drop=True)


def stratified_sample(df, frac, random_state=None):
    """Draw the same fraction of rows from every ``language`` group of ``df``.

    Args:
        df: DataFrame with a ``language`` column.
        frac: Fraction of each group to keep, as understood by
            ``DataFrame.sample``.
        random_state: Passed unchanged to ``DataFrame.sample`` for every group.

    Returns:
        A new DataFrame with the same columns as ``df`` (``language``
        included), groups in sorted key order, and a fresh 0..n-1 index.
        An empty frame with the same columns when ``df`` has no groups.
    """
    # Iterate over the groups instead of using GroupBy.apply: newer pandas
    # releases no longer pass the grouping column to the applied function, and
    # the reset_index(drop=True) below would then discard `language` entirely.
    sampled_parts = [
        group.sample(frac=frac, random_state=random_state)
        for _, group in df.groupby('language')
    ]
    if not sampled_parts:
        # pd.concat rejects an empty list; keep the column layout instead.
        return df.iloc[0:0].reset_index(drop=True)
    sampled_df = pd.concat(sampled_parts).reset_index(drop=True)
    return sampled_df


def load_dataset(seed, data_split_type):
    """Load one pre-built chunk, split it 80/20 and tokenize both parts.

    Args:
        seed: Chunk selector; the file read is
            ``data/chunks/chunk_{seed+1}.parquet``. It does not seed any RNG.
        data_split_type: Not used by the current implementation.

    Returns:
        A ``DatasetDict`` with ``'train'`` and ``'valid'`` splits after
        ``add_special_tokens`` and ``tokenize_function`` have been mapped over
        it. The result is also printed.
    """
    # An earlier variant sampled from ./data/filtered_funcs.parquet instead,
    # using sample_fixed_per_group() for data_split_type == "fixed" and
    # stratified_sample(frac=0.01) for "stratified"; it is disabled.
    chunk_df = pd.read_parquet(f"data/chunks/chunk_{seed+1}.parquet")

    # The split itself always uses random_state=42, independent of `seed`.
    train_df, valid_df = train_test_split(
        chunk_df, train_size=0.8, test_size=0.2, random_state=42
    )

    raw_splits = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'valid': Dataset.from_pandas(valid_df),
    })

    tokenized_ds = raw_splits.map(add_special_tokens).map(tokenize_function, batched=True)
    print(tokenized_ds)

    # Drop the intermediate copies before returning to release memory early.
    del chunk_df, train_df, valid_df, raw_splits
    gc.collect()

    return tokenized_ds


In [11]:
import os

# Explicitly set TOKENIZERS_PARALLELISM for this process before any tokenizer
# is created in the training cell.
# NOTE(review): confirm "true" is the wanted value together with
# dataloader_num_workers=4 in init_args().
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [12]:
import numpy as np

# Training driver: resume from a previously saved checkpoint and continue
# training chunk by chunk, saving the model after every chunk.

# Overrides the model_name assigned earlier in the notebook; only the tokenizer
# is loaded from this name, the weights come from the local checkpoint below.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# NOTE(review): this directory is spelled "v.1.1" (matching data_split_type),
# whereas init_args() uses output_dir "./results/chunked/v1.1".
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="results/chunked/v.1.1/checkpoint-18",
    device_map="auto",
    quantization_config=bnb_config
)

tokenizer = init_tokenizer(model_name)
data_collator = init_collator(tokenizer)

# Same preparation steps as init_model(), followed by attaching the LoRA adapter.
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model.config.pad_token_id = tokenizer.pad_token_id
model = get_peft_model(model, peft_config=lora_config)
print_trainable_parameters(model)

# Used as the sub-directory name for saved checkpoints; load_dataset() ignores it.
data_split_type = "v.1.1"
training_args = init_args()
 
# Chunk index 0 (file chunk_1.parquet) is loaded only to construct the Trainer;
# the loop below swaps in the real chunk before every train() call.
tokenized_ds = load_dataset(0, data_split_type)
trainer = init_trainer(
    model,
    training_args,
    tokenized_ds["train"],
    tokenized_ds["valid"],
    tokenizer, 
    data_collator
)

# skipped = [3,4,9,11,13,19,20,22,23,25,27,28]
# 22 failed at [5907/6401 6:59:08 < 35:03, 0.23 it/s, Epoch 0.92/1]
# 23 failed at [6095/6401 7:10:49 < 21:38, 0.24 it/s, Epoch 0.95/1] (RuntimeError: NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_() INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":813, please report a bug to PyTorch.)
# NOTE(review): num_epochs is not referenced anywhere in the visible notebook;
# the run length comes from num_train_epochs=1 in init_args().
num_epochs = 100
# Each i selects data/chunks/chunk_{i+1}.parquet and produces checkpoint-{i}.
for i in [24,26,29,30,31,32,33,34,35,37]:
    print(i)
    # Only index 0 would skip the reload (it is already loaded above); every
    # value in the current list is non-zero, so this branch always runs.
    if i:
        tokenized_ds = load_dataset(i, data_split_type)
        
        trainer.train_dataset = tokenized_ds["train"]
        trainer.eval_dataset = tokenized_ds["valid"]
        
    # Prints the first training example in full, including its padded input_ids.
    print(trainer.train_dataset[0])
        
    # (disabled) trainer.args.num_train_epochs = i+1

    # One train() call per chunk on the same Trainer instance. save_strategy is
    # "no", so the only persisted state is the save_model() call after the
    # chunk finishes.
    trainer.train()
    trainer.save_model(f"./results/chunked/{data_split_type}/checkpoint-{i}")



TinyLlama/TinyLlama-1.1B-Chat-v1.0
LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-Chat-v1.0', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<func>', 'eos_token': '</func>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


trainable params: 9011200 || all params: 1109059584 || trainable%: 0.8125081943298008


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12804
    })
})
24


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12804
    })
})
{'language': 'Python', 'body': "<func> # Python\n# Apply 2D convolution to compute the transverse wake Wx on a grid \n# Also returns the zvec and xvec which define the grid\n# \n# Still needs to improve the convolution step\nWxOld(gamma, rho, sigmaz, sigmax, dz, dx):\nbeta = (1 - 1 / gamma ** 2) ** (1 / 2)\nzvec = np.arange(-5 * sigmaz, 5 * sigmaz, dz)\nxvec = np.arange(-5 * sigmax, 5 * sigmax, dx)\nlambdap_list = [[lambda_p_Gauss(i, j) for j in xvec] for i in zvec]\nlambdap_grid = np.array(lambdap_list, dtype=float)\nzvec2 = np.arange(-10 * sigmaz, 10 * sigmaz, dz)\nxvec2 = np.arange(-10 * sigmax, 10 * sigmax, dx)\npsi_x_list = [[psi_x(i / 2 / rho, j, beta) for j in x

[2024-09-25 12:40:31,083] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)




/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):




  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
3200,1.4266,1.435516
6400,1.4159,1.42414


26


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12804
    })
})
{'language': 'Python', 'body': '<func> # Python\n# recc: tamper ecc file and repair by hamming distance \ntest_repair_by_hamming():\nfiledb, filedb_bak = get_db()\nfileout = path_sample_files(\'output\', \'recc_file_repaired.db\')\nmarker1 = get_marker(type=1)\nmarker2 = get_marker(type=2)\nrestore_files(\'hecc\')\nstartpos1 = find_next_entry(filedb, marker1).next()\nstartpos2 = find_next_entry(filedb, marker1, startpos1 + len(marker1)).next()\nstartpos3 = find_next_entry(filedb, marker2, startpos2 + len(marker1)).next()\ntamper_file(filedb, startpos1, \'a\' * int(len(marker1) * 0.3))\ntamper_file(filedb, startpos2, \'a\' * int(len(marker1) * 0.3))\ntamper_file(filedb,

Step,Training Loss,Validation Loss
3200,1.4008,1.402376
6400,1.3822,1.397744


29


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12804
    })
})
{'language': 'Python', 'body': "<func> # Python\n# Test a front-end user getting an invite to admin site\ntest_staff_social_adaptor(self):\nrequest = self.factory.get('/')\nrequest._wagtail_site = self.main.get_site()\nadaptor = StaffUserSocialAdapter(request=request)\nuser = get_user_model().objects.create_user(username='testuser', email='testuser@email.com', password='pass')\nsociallogin = SocialLogin(user=user)\ngroup = Group.objects.filter().first()\nperm = Permission.objects.filter().first()\nself.assertFalse(adaptor.is_open_for_signup(request, sociallogin))\ninvite = Invite.objects.create(email=user.email, user=self.user, site=self.site)\ninvite.groups.add(group)

Step,Training Loss,Validation Loss
3200,1.3848,1.390017
6400,1.3821,1.387251


30


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12804
    })
})
{'language': 'Python', 'body': '<func> # Python\n# dummy function for use with check_grad()\nget_grad(_):\nreturn dL_dr_vec </func>', 'split': 30, '__index_level_0__': 441239, 'input_ids': [1, 529, 9891, 29958, 396, 5132, 13, 29937, 20254, 740, 363, 671, 411, 1423, 29918, 5105, 580, 13, 657, 29918, 5105, 7373, 1125, 13, 2457, 270, 29931, 29918, 7707, 29918, 2003, 1533, 9891, 29958, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

Step,Training Loss,Validation Loss
3200,1.3833,1.376973
6400,1.366,1.375233


31


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12804
    })
})
{'language': 'Python', 'body': '<func> # Python\n# Gives the value of `self.rep_hash[brn]["libraries"]` if brn exists.\n# \n# Returns:\n#     `dict`: Empty if brn doesn\'t exist, otherwise the value of self.rep_hash[brn]["libraries"].\nget_tech_rep_hash(self, brn):\nfor bio_acc in self.rep_hash:\n    if brn == self.rep_hash[bio_acc][\'brn\']:\n        return self.rep_hash[bio_acc][\'libraries\']\nreturn {} </func>', 'split': 31, '__index_level_0__': 437609, 'input_ids': [1, 529, 9891, 29958, 396, 5132, 13, 29937, 402, 3145, 278, 995, 310, 421, 1311, 29889, 3445, 29918, 8568, 29961, 1182, 29876, 29962, 3366, 492, 8464, 3108, 29952, 565, 1506, 29876, 4864, 29889, 13, 299

Step,Training Loss,Validation Loss
3200,1.3855,1.380555
6400,1.3714,1.379118


32


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12804
    })
})
{'language': 'Python', 'body': "<func> # Python\n# References\n# ----------\n# [1] Heymans, M., Singh, A. Deriving phylogenetic trees from the similarity\n#     analysis of metabolic pathways. Bioinformatics, 2003\nnsim_hs03(G1, G2, max_iter, eps):\nN = len(G1.nodes())\nM = len(G2.nodes())\nA = nx.adjacency_matrix(G1).todense()\nB = nx.adjacency_matrix(G2).todense()\nIa = np.ones((N, N))\nIb = np.ones((M, M))\nnsim_prev = np.zeros((M, N))\nnsim = np.ones((M, N))\nfor i in range(max_iter):\n    if np.allclose(nsim, nsim_prev, atol=eps):\n        break\n    nsim_prev = np.copy(nsim)\n    nsim = np.dot(np.dot(B, nsim_prev), A.T) + np.dot(np.dot(B.T, nsim_prev), A) + np.do

Step,Training Loss,Validation Loss
3200,1.3659,1.378473
6400,1.3585,1.377577


33


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12804
    })
})
{'language': 'Python', 'body': "<func> # Python\n# Get the names of the groups a given device should be part of.\n# \n# Args:\n#     device (dcim.models.Device): Device obj\n# \n# Returns:\n#     list: List of group names the device should be part of\nget_host_groups(device):\ngroups = ['global', f'site__{device.site.slug}', f'role__{device.device_role.slug}', f'type__{device.device_type.slug}', f'manufacturer__{device.device_type.manufacturer.slug}']\nif device.platform:\n    groups.append(f'platform__{device.platform.slug}')\nif device.tenant:\n    groups.append(f'tenant__{device.tenant.slug}')\nreturn groups </func>", 'split': 33, '__index_level_0__': 431043, 'input

Step,Training Loss,Validation Loss
3200,1.3773,1.363141
6400,1.3591,1.362092


34


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})
{'language': 'Python', 'body': "<func> # Python\n# Wait until the regex encountered\n#         \nuntil(self, regex):\nlogger.debug('waiting for %s', regex)\nr = re.compile(regex, re.M)\nself.tn.expect([r]) </func>", 'split': 34, '__index_level_0__': 3060123, 'input_ids': [1, 529, 9891, 29958, 396, 5132, 13, 29937, 20340, 2745, 278, 6528, 18169, 13, 29937, 3986, 13, 29305, 29898, 1311, 29892, 6528, 1125, 13, 21707, 29889, 8382, 877, 10685, 292, 363, 1273, 29879, 742, 6528, 29897, 13, 29878, 353, 337, 29889, 12198, 29898, 13087, 29892, 337, 29889, 29924, 29897, 13, 1311, 29889, 6277, 29889, 17854, 4197, 29878, 2314, 1533, 9891, 29958, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

Step,Training Loss,Validation Loss
3200,1.3661,1.360479
6400,1.3496,1.359551


35


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})
{'language': 'Python', 'body': "<func> # Python\n# Verify case comparison operations.\ntestCaseCompareExamples1(self):\ntry:\n    examples = ['Test1', 'test1', 'TEST1', 'tesT1', 'tEst1']\n    cnd = CaseNormalizedDict({k: k for k in examples})\n    for ex in examples:\n        self.assertTrue(ex in cnd)\n    logger.debug('String representation %s', cnd)\n    logger.debug('String representation (items) %s', cnd.items())\n    logger.debug('Raw representation %r', cnd)\n    logger.debug('normalized values %s', cnd['test1'])\n    self.assertTrue(cnd['test1'], examples[-1])\nexcept Exception as e:\n    logger.exception('Failing with %s', str(e))\n    self.fail() </func>", 's

Step,Training Loss,Validation Loss
3200,1.3512,1.360506
6400,1.355,1.35973


37


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', 'split', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})
{'language': 'Python', 'body': "<func> # Python\n# Download and save Wikipedia views dump file\n#         \ndownload_views_file(self, day, hour):\nhours = hour + day * 24\nnow = datetime.now(timezone.utc)\nhours_before_present = now - timedelta(hours=2)\nviews_datetime = hours_before_present + timedelta(hours=-hours)\nfriendly_time = views_datetime.strftime('%m/%d/%Y, %H:00')\nprint('Processing the views file from', friendly_time)\nurl = self.get_pageviews_download_url(views_datetime)\nprint(f'\\tDownloading Wikipedia views hourly data from {url}')\nwith requests.Session() as s:\n    response = s.get(url)\nwith open(self.pageviews_path.format(count=hours), 'wb') as f:\

Step,Training Loss,Validation Loss
3200,1.3606,1.363418
6400,1.3612,1.362562
