# HuggingFace link to LeonLLM
- All models and datasets: https://huggingface.co/collections/Leon-LLM/leon-llm-chess-6584387dbef870ffa4a7605f

## Log in to use private repos

In [None]:
"""
Log in with your Hugging Face credentials
Requires an access token generated on Hugging Face (https://huggingface.co/settings/tokens)
Uses ipywidgets to display the login widget (!pip install ipywidgets)
After installing ipywidgets, restart the kernel/program
"""

from huggingface_hub import notebook_login


# Show the login widget so the Hugging Face token can be entered for this session
notebook_login()

# Download Model

In [None]:
"""
Use a model from huggingface, saved in your local machine 
Default cache directory is ~/.cache/huggingface/hub
You can change the cache directory by setting default_cache=False and inputting a cache_dir
"""

# Imported in this cell so it can be run on its own, before the upload cells below
from transformers import AutoModelForCausalLM

huggingface_repo_name = "Leon-LLM/Leon-Chess-Mamba-350k-Plus-Right-Padding"  # Input name of huggingface repo

default_cache = True  # Set to False if you want to use a different cache directory
cache_dir = "./cache/huggingface/hub"  # Input path to cache directory, will be ignored if default_cache=True

# Passing cache_dir=None makes from_pretrained use the default cache directory,
# so a single call covers both the default and the custom cache case
model = AutoModelForCausalLM.from_pretrained(
    huggingface_repo_name,
    cache_dir=None if default_cache else cache_dir,
)

# Download Dataset

In [None]:
"""
Use a dataset from huggingface, saved in your local machine 
"""

from datasets import load_dataset, Dataset

# Repo id of the dataset on huggingface
dataset_name = "Leon-LLM/Leon-Chess-Dataset-19k"  # Input name of dataset from huggingface

# Fetch the dataset and keep it in `dataset` for the following cells
dataset = load_dataset(dataset_name)

### Save to textfile

In [None]:
text_file = "./data/test.tok"  # Input path of the text file the dataset is written to


def dataset_to_text_file(dataset, file_path):
    """Write the "text" field of every example in dataset to file_path, one per line.

    Args:
        dataset: Iterable of examples, each supporting example["text"] and
            yielding a string.
        file_path: Path of the output file. An existing file is overwritten and
            missing parent directories are created.
    """
    import os

    # Create the target directory up front so a fresh checkout without it does not fail
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(example["text"] + "\n" for example in dataset)


# Select the "train" split of the loaded dataset and write it to text_file
data = dataset["train"]
dataset_to_text_file(data, text_file)

# Upload

## Upload Model


In [None]:
"""
Upload a model from your local machine (checkpoint_path) to huggingface
Set private=True if you want to upload to a new private repo
"""

from transformers import AutoModelForCausalLM

# checkpoint_path = "/home/ubuntu/LeonLLM/leon-llm/Leon-LLM-Models/xLANplus/Leon-Chess_19k_0001_4E_PLUS/Leon-Chess_19k_0001_4E_PLUS"  # Input name of local checkpoint
# checkpoint_path = "/home/ubuntu/LeonLLMV2/leon-llm/Leon-LLM-Models/V63_GPT2_350k_4E_xLANplus_RIGHT_PAD/checkpoint-70000"  # Input name of local checkpoint
checkpoint_path = "./Leon-LLM-Models/R6_Mamba_71k_4E_xLANplus/R6_Mamba_71k_4E_xLANplus"
huggingface_repo_name = "R6_Mamba_71k_4E_xLANplus"  # Input name of huggingface repo

# Use the Auto class imported above: it selects the architecture from the
# checkpoint's config, so the same cell works for any of the checkpoints listed
model = AutoModelForCausalLM.from_pretrained(checkpoint_path)

# The organization is part of the repo id ("<organization>/<name>"); the separate
# `organization` keyword of push_to_hub is deprecated
model.push_to_hub(f"Leon-LLM/{huggingface_repo_name}", private=True)

## Upload Fine-Tuned Model

In [None]:
from peft import PeftModel, get_peft_model
from transformers import AutoModelForCausalLM

model_dir = (
    "Leon-LLM/V63_GPT2_350k_4E_xLANplus_RIGHT_PAD"  # Hugging Face repo_id of base model
)
model = AutoModelForCausalLM.from_pretrained(model_dir)

peft_model_dir = "./Leon-LLM-Models/V66_LoRA_V63_GPT2-350k-Plus_98k_low_elo_4E_r64/V66_LoRA_V63_GPT2-350k-Plus_98k_low_elo_4E_r64"  # Local directory of the saved adapter

# Load the saved adapter config and weights onto the base model in one step.
# get_peft_model is meant for creating a new adapter to train; calling it on a
# model that already has the adapter loaded applies the config a second time
# and risks pushing re-initialised adapter weights instead of the trained ones.
peft_model = PeftModel.from_pretrained(model, peft_model_dir)

In [None]:
huggingface_repo_name = (
    "V66_LoRA_V63_GPT2-350k-Plus_98k_low_elo_4E_r64"  # Input name of huggingface repo
)

# The organization is part of the repo id ("<organization>/<name>"); the separate
# `organization` keyword of push_to_hub is deprecated
peft_model.push_to_hub(f"Leon-LLM/{huggingface_repo_name}", private=True)

## Upload Dataset

In [22]:
"""
Upload a dataset from a text file on your local machine (datset_path) to huggingface
Set private=True if you want to upload to a new private repo
"""

from datasets import load_dataset, Dataset

# Local text file to upload, and the repo id it is published under
datset_path = "C:/Users/Jerome/Coding/leon-llm/data/training/all_moves/270k_all_moves_noLong_BOS.tok"  # Input path of dataset to upload
dataset_name = "Leon-LLM/270k_all_moves_xlan"  # Input name of dataset for huggingface

# Build the dataset from the text file, then push it to the hub
my_dataset = Dataset.from_text(datset_path)
my_dataset.push_to_hub(
    dataset_name,
    private=False,
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/271 [00:00<?, ?ba/s]