In [1]:
!pip install datasets ipywidgets
!pip install git+https://github.com/huggingface/transformers.git
!pip install accelerate

import ipywidgets as widgets
import pandas as pd
import json
import torch

from datasets import load_dataset

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-n3ztcvv_
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-n3ztcvv_
  Resolved https://github.com/huggingface/transformers.git to commit 8c12690cecbb97e187861e386f7a0ac790e4236c
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.0.dev0)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m2.2 MB/s[0m eta 

In [None]:
# Upload the mergedJson.jsonl zip file, to preserve contents
!unzip /content/mergedJson.jsonl.zip
!sha1sum /content/mergedJson.jsonl
# Should be 7b0fa963ec592bd5e8939ede1a6ab478a6104eae

In [None]:
# Parse mergedJson.jsonl (one JSON document per line) into a DataFrame.
data_list = []
# Explicit encoding so the result does not depend on the platform default.
with open("mergedJson.jsonl", "r", encoding="utf-8") as jsonl_file:
    # Stream the file line by line instead of materialising it with readlines().
    for line in jsonl_file:
        # Blank lines (e.g. a trailing newline at end of file) are not valid JSON
        # and would make json.loads raise, so skip them.
        if not line.strip():
            continue
        data_list.append(json.loads(line))

# One row per JSON record; keys become columns.
dataframe = pd.DataFrame(data_list)

print(dataframe)

In [None]:
# Show the loaded frame as the cell's output for a quick visual check.
dataframe

# Author's note on the column layout (not verified anywhere in this notebook — TODO confirm):
# dataframe[0] = Description
# dataframe[1] = Data

In [None]:
# @title Starcoder
# Fine-tune StarCoder2 as a causal language model on the rows of `dataframe`.
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# checkpoint = "bigcode/starcoder2-7b" # Took an awfully long time to download (15mb/s)
checkpoint = "bigcode/starcoder2-3b" # Crashing colab, OOM

MAX_SEQ_LENGTH = 512   # token cap per example; lower it to reduce GPU memory use
EVAL_FRACTION = 0.1    # share of rows held out for the periodic evaluation
SPLIT_SEED = 42        # fixed seed so the train/eval split is reproducible

device = "cuda"  # not used in this cell; placement is handled by device_map below
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
if tokenizer.pad_token is None:
    # The collator pads each batch, which requires a pad token; fall back to EOS.
    tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" already places the weights on the available device(s).
# The former trailing .to("cuda") was redundant, and accelerate warns against
# moving a model it has dispatched.
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)

# Trainer indexes its dataset by row and expects tokenized features, so a raw
# pandas DataFrame cannot be passed directly. Each row is flattened into one
# training string by joining its columns with newlines.
# NOTE(review): assumes every column is meaningful as text — confirm against the data.
texts = dataframe.astype(str).agg("\n".join, axis=1).tolist()
tokenized_dataset = Dataset.from_dict({"text": texts}).map(
    lambda batch: tokenizer(batch["text"], truncation=True, max_length=MAX_SEQ_LENGTH),
    batched=True,
    remove_columns=["text"],
)
# evaluation_strategy="steps" needs an eval_dataset, so hold out a slice for it.
splits = tokenized_dataset.train_test_split(test_size=EVAL_FRACTION, seed=SPLIT_SEED)

# mlm=False -> causal LM objective: labels are the input ids, padding is ignored in the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=400,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# @title Phi-2/stability-code
# Fine-tune stabilityai/stable-code-3b as a causal language model on the rows of `dataframe`.
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

MAX_SEQ_LENGTH = 512   # token cap per example; lower it to reduce GPU memory use
EVAL_FRACTION = 0.1    # share of rows held out for the periodic evaluation
SPLIT_SEED = 42        # fixed seed so the train/eval split is reproducible

# torch.set_default_device("cuda") was removed: it changes where *every* later
# tensor in the kernel is created (including in other cells), while the model
# is already placed explicitly with .to("cuda") below.

# NOTE(review): trust_remote_code=True executes Python code shipped with the
# checkpoint repository; only keep it for repositories you trust.
model = AutoModelForCausalLM.from_pretrained("stabilityai/stable-code-3b", torch_dtype="auto", trust_remote_code=True).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("stabilityai/stable-code-3b", trust_remote_code=True)
if tokenizer.pad_token is None:
    # The collator pads each batch, which requires a pad token; fall back to EOS.
    tokenizer.pad_token = tokenizer.eos_token

# Trainer indexes its dataset by row and expects tokenized features, so a raw
# pandas DataFrame cannot be passed directly. Each row is flattened into one
# training string by joining its columns with newlines.
# NOTE(review): assumes every column is meaningful as text — confirm against the data.
texts = dataframe.astype(str).agg("\n".join, axis=1).tolist()
tokenized_dataset = Dataset.from_dict({"text": texts}).map(
    lambda batch: tokenizer(batch["text"], truncation=True, max_length=MAX_SEQ_LENGTH),
    batched=True,
    remove_columns=["text"],
)
# evaluation_strategy="steps" needs an eval_dataset, so hold out a slice for it.
splits = tokenized_dataset.train_test_split(test_size=EVAL_FRACTION, seed=SPLIT_SEED)

# mlm=False -> causal LM objective: labels are the input ids, padding is ignored in the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./_results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=400,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# Release the previous model's GPU memory before loading the next checkpoint.
import gc

# The Trainer keeps its own references to the model (and optimizer state), so
# clearing `model` alone frees nothing; drop both notebook-level names.
# Assigning None (instead of `del`) also works when the name was never defined.
model = None
trainer = None

# Collect the now-unreferenced objects first, then hand PyTorch's cached blocks
# back to the driver. empty_cache() does not need a no_grad() context.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Fine-tune Salesforce/codet5-small as a sequence-to-sequence model on `dataframe`.
from datasets import Dataset
from transformers import (
    DataCollatorForSeq2Seq,
    RobertaTokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
)

MAX_SEQ_LENGTH = 512   # token cap for both inputs and targets
EVAL_FRACTION = 0.1    # share of rows held out for the periodic evaluation
SPLIT_SEED = 42        # fixed seed so the train/eval split is reproducible

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')

# Trainer indexes its dataset by row and expects tokenized features, so a raw
# pandas DataFrame cannot be passed directly. A seq2seq model additionally
# needs (input, target) pairs.
# NOTE(review): assumes the first column is the description (model input) and
# the second column is the data (generation target) — TODO confirm.
sources = dataframe.iloc[:, 0].astype(str).tolist()
targets = dataframe.iloc[:, 1].astype(str).tolist()
tokenized_dataset = Dataset.from_dict({"source": sources, "target": targets}).map(
    # text_target makes the tokenizer emit the encoded targets under "labels".
    lambda batch: tokenizer(
        batch["source"],
        text_target=batch["target"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    ),
    batched=True,
    remove_columns=["source", "target"],
)
# evaluation_strategy="steps" needs an eval_dataset, so hold out a slice for it.
splits = tokenized_dataset.train_test_split(test_size=EVAL_FRACTION, seed=SPLIT_SEED)

# Pads inputs and labels per batch; padded label positions are excluded from the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()