In [1]:
pip install optuna datasets transformers torch accelerate sentencepiece nltk absl-py rouge_score evaluate huggingface_hub onnx

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting accelerate
  Downloading accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting absl-py
  Downloading absl_py-2.2.1-py3-none-any.whl.metadata (2.4 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading d

In [1]:
from huggingface_hub import login

login()  # This will prompt you to enter your Hugging Face token


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import ipywidgets as widgets
from IPython.display import display

# Define available models
model_options = ["Llama 3", "Mistral", "SmolLM"]

# Create a dropdown widget
model_dropdown = widgets.Dropdown(
    options=model_options,
    description="Model:",
    style={'description_width': 'initial'}
)

# Create a confirmation button
confirm_button = widgets.Button(
    description="Confirm Selection",
    button_style="success"
)

# Output widget to display confirmation
output = widgets.Output()

def on_confirm_clicked(b):
    with output:
        output.clear_output()
        selected_model = model_dropdown.value
        print(f"Selected Model: {selected_model}")

# Attach event handler
confirm_button.on_click(on_confirm_clicked)

# Display widgets
display(model_dropdown, confirm_button, output)


Dropdown(description='Model:', options=('Llama 3', 'Mistral', 'SmolLM'), style=DescriptionStyle(description_wi…

Button(button_style='success', description='Confirm Selection', style=ButtonStyle())

Output()

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Function to load the selected model
def load_model(model_name):
    model_map = {
        "Llama 3": "meta-llama/Llama-3.1-8B",
        "Mistral": "mistralai/Mistral-7B-v0.1",
        "SmolLM": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    }

    if model_name in model_map:
        model_id = model_map[model_name]
        print(f"Loading {model_name} ({model_id})...")

        tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", use_auth_token=True)

        print(f"{model_name} loaded successfully!")
        return model, tokenizer
    else:
        print("Invalid model selection.")
        return None, None

# Load the model based on user selection
selected_model = model_dropdown.value
model, tokenizer = load_model(selected_model)


Loading SmolLM (TinyLlama/TinyLlama-1.1B-Chat-v1.0)...




SmolLM loaded successfully!


In [42]:
import pandas as pd
import json
import ipywidgets as widgets
import os , sys
from IPython.display import display

# Output widget for capturing logs
output_widget = widgets.Output()

# File path input widget
file_path_input = widgets.Text(
    placeholder="Enter file path (e.g., /path/to/dataset.csv)",
    description="📂 File:"
)
upload_button = widgets.Button(description="Upload & Process")
display(file_path_input, upload_button, output_widget)  # Display widgets

# Function to handle file processing
def handle_file_upload(_):
    with output_widget:
        output_widget.clear_output()  # Clear previous output
        
        file_path = file_path_input.value.strip()
        if not os.path.exists(file_path):
            print(f"❌ File not found: {file_path}")
            return

        print(f"\n✅ Uploaded file: {file_path}")
        sys.stdout.flush()

        # Process dataset
        process_dataset(file_path)

# Bind button click to function
upload_button.on_click(handle_file_upload)

# Function to process dataset
def process_dataset(file_path):
    with output_widget:
        try:
            df = pd.read_csv(file_path)
            
            # Keep only relevant columns
            required_columns = ['description', 'essay']
            df = df[required_columns].dropna()
            
            print("\n✅ Dataset successfully loaded and filtered")
            sys.stdout.flush()
            
            # ✅ Train-Validation Split (80% Train, 20% Validation)
            df = df.sample(frac=1, random_state=42)  # Shuffle dataset
            train_size = int(0.8 * len(df))
            train_df, val_df = df[:train_size].copy(), df[train_size:].copy()

            print(f"\n📊 Dataset split: {len(train_df)} train / {len(val_df)} validation")
            
            # Preprocess dataset
            train_df = preprocess_for_llm(train_df)
            val_df = preprocess_for_llm(val_df)

            # Save processed datasets
            train_path = "train_dataset.json"
            val_path = "val_dataset.json"
            train_df.to_json(train_path, orient="records", lines=True)
            val_df.to_json(val_path, orient="records", lines=True)

            print(f"\n✅ Processed datasets saved: `{train_path}` (Train), `{val_path}` (Validation)")

        except Exception as e:
            print(f"❌ Error processing file: {e}")

# Preprocessing (LLM-based)
def preprocess_for_llm(df):
    df['description'] = df['description'].apply(lambda x: x.lower() if isinstance(x, str) else x)
    df['essay'] = df['essay'].apply(lambda x: x.lower() if isinstance(x, str) else x)
    
    max_length = 512
    df['description'] = df['description'].apply(lambda x: x[:max_length] if isinstance(x, str) else x)
    df['essay'] = df['essay'].apply(lambda x: x[:max_length] if isinstance(x, str) else x)
    
    print("\n✨ Preprocessed Dataset Sample:")
    display(df.head())
    
    return df


Text(value='', description='📂 File:', placeholder='Enter file path (e.g., /path/to/dataset.csv)')

Button(description='Upload & Process', style=ButtonStyle())

Output()

In [43]:
import torch
import optuna
import ipywidgets as widgets
from IPython.display import display
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
import os

In [44]:
# Suggested default parameters based on the selected model
model_recommendations = {
    "Llama 3": {"learning_rate": 2e-5, "batch_size": 8, "num_train_epochs": 3, "weight_decay": 0.01},
    "Mistral": {"learning_rate": 3e-5, "batch_size": 16, "num_train_epochs": 4, "weight_decay": 0.02},
    "SmolLM": {"learning_rate": 5e-5, "batch_size": 8, "num_train_epochs": 3, "weight_decay": 0.01},
}
# Set selected model parameters (assuming 'selected_model' is defined earlier)
selected_model_params = model_recommendations.get(selected_model, model_recommendations["Llama 3"])


In [45]:
# Define processed dataset paths
train_path = "train_dataset.json"
val_path = "val_dataset.json"

# Ensure datasets are processed before loading
if os.path.exists(train_path) and os.path.exists(val_path):
    dataset = load_dataset("json", data_files={"train": train_path, "validation": val_path})


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [48]:
# Tokenization function
tokenized_dataset = dataset.map(lambda x: tokenizer(x["essay"], truncation=True, padding="max_length", max_length=512), batched=True, remove_columns=["essay"])

print("Tokenized dataset features:", tokenized_dataset["train"].column_names)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


Map:   0%|          | 0/1788 [00:00<?, ? examples/s]

Map:   0%|          | 0/447 [00:00<?, ? examples/s]

Tokenized dataset features: ['description', 'input_ids', 'attention_mask']


In [49]:
import ipywidgets as widgets
from IPython.display import display

def on_confirm_selection(button):
    global final_params
    final_params = {
        "learning_rate": learning_rate_widget.value,
        "batch_size": batch_size_widget.value,
        "num_train_epochs": epochs_widget.value,
        "weight_decay": weight_decay_widget.value,
    }
    print(f"Confirmed Hyperparameters: {final_params}")

# User input widgets for parameter selection
learning_rate_widget = widgets.FloatText(value=selected_model_params["learning_rate"], description="Learning Rate:")
batch_size_widget = widgets.Dropdown(options=[2, 4, 8, 16, 24, 32, 64], value=selected_model_params["batch_size"], description="Batch Size:")
epochs_widget = widgets.IntSlider(min=2, max=50, value=selected_model_params["num_train_epochs"], description="Epochs:")
weight_decay_widget = widgets.FloatText(value=selected_model_params["weight_decay"], description="Weight Decay:")

# Confirmation button
confirm_button = widgets.Button(description="Confirm Selection", button_style='success')
confirm_button.on_click(on_confirm_selection)

display(learning_rate_widget, batch_size_widget, epochs_widget, weight_decay_widget, confirm_button)

FloatText(value=5e-05, description='Learning Rate:')

Dropdown(description='Batch Size:', index=2, options=(2, 4, 8, 16, 24, 32, 64), value=8)

IntSlider(value=3, description='Epochs:', max=50, min=2)

FloatText(value=0.01, description='Weight Decay:')

Button(button_style='success', description='Confirm Selection', style=ButtonStyle())

In [50]:
import optuna

# Function to check if user-selected parameters match recommended ones
def user_selected_defaults():
    return (
        final_params["learning_rate"] == selected_model_params["learning_rate"] and
        final_params["batch_size"] == selected_model_params["batch_size"] and
        final_params["num_train_epochs"] == selected_model_params["num_train_epochs"] and
        final_params["weight_decay"] == selected_model_params["weight_decay"]
    )

# Function to optimize hyperparameters using Optuna
def objective(trial):
    """Hyperparameter tuning objective function using real model validation loss."""
    
    # Suggest hyperparameters within reasonable ranges
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 5e-4, log=True)
    batch_size = trial.suggest_int("batch_size", 4, 32, step=4)
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 10)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)

    # Update training arguments dynamically
    training_args.learning_rate = learning_rate
    training_args.per_device_train_batch_size = batch_size
    training_args.num_train_epochs = num_train_epochs
    training_args.weight_decay = weight_decay

    # Create a new trainer instance with updated hyperparameters
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset.get("validation", None),  # Ensure validation dataset is passed
        data_collator=data_collator,
    )

    # Train for a single epoch and evaluate
    trainer.train()
    eval_results = trainer.evaluate()
    
    eval_loss = eval_results["eval_loss"]  # Get real validation loss
    print(f"Trial {trial.number}: Eval Loss = {eval_loss}, Params = {trial.params}")

    return eval_loss  # Minimize real validation loss


# Run Optuna only if user-selected parameters match the recommended ones
if user_selected_defaults():
    print("✅ User selected default hyperparameters. Running Optuna optimization...")
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=10)  # Run Optuna with real model evaluation
        
        # Get best hyperparameters
    best_params = study.best_params
    final_params = best_params
        
    print("Best Hyperparameters Found:", final_params)
else:
    print("⚠ User customized hyperparameters. Skipping Optuna optimization.")
    print("📌 Customized Hyperparameters:", final_params )
    print("Started Training")
    training_args = TrainingArguments(
        output_dir="./fine_tuned_model",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=final_params["learning_rate"],
        per_device_train_batch_size=final_params["batch_size"],
        num_train_epochs=final_params["num_train_epochs"],
        weight_decay=final_params["weight_decay"],
        save_total_limit=1,
        logging_dir="./logs",
        logging_steps=100,
        fp16=True,  # Disable fp16 if unstable  # Mixed precision training if GPU is available
        report_to="none",
        gradient_checkpointing=True,
        gradient_accumulation_steps=4  
    )
        # Define the trainer
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset.get("validation", None),  # Ensure validation dataset is passed
        data_collator=data_collator,
    )
    trainer.train()

⚠ User customized hyperparameters. Skipping Optuna optimization.
📌 Customized Hyperparameters: {'learning_rate': 5e-05, 'batch_size': 2, 'num_train_epochs': 2, 'weight_decay': 0.01}
Started Training


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
1,3.0314,3.516302


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


In [80]:
import math
import torch
from evaluate import load


def compute_perplexity(eval_loss):
    return math.exp(eval_loss)

# Get validation loss from the Trainer
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = compute_perplexity(eval_loss)

print(f"Validation Loss: {eval_loss:.4f}")
print(f"Perplexity (PPL): {perplexity:.4f}")

Validation Loss: 3.5163
Perplexity (PPL): 33.6597


In [81]:
import json
import torch
from evaluate import load

# Load evaluation metrics
bleu = load("bleu")
rouge = load("rouge")

# Load validation dataset
val_data = []
with open("val_dataset.json", "r", encoding="utf-8") as f:
    for line in f:
        val_data.append(json.loads(line.strip()))

# Extract inputs and references
sample_inputs = [item["essay"] for item in val_data]
references = [[item["description"]] for item in val_data]  # Ensure correct BLEU format

# Function to generate text
def generate_text(input_text):
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")  # Move input to GPU
    model.to("cuda")  # Move model to GPU
    
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=50)  # Set max_new_tokens instead
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Generate predictions
predictions = [generate_text(text) for text in sample_inputs]

# Compute BLEU score
bleu_score = bleu.compute(predictions=predictions, references=references)

# Compute ROUGE score
rouge_score = rouge.compute(predictions=predictions, references=[ref[0] for ref in references])

# Print scores
print(f"BLEU Score: {bleu_score['bleu']:.4f}")
print(f"ROUGE Score: {rouge_score}")


BLEU Score: 0.0032
ROUGE Score: {'rouge1': np.float64(0.09195957306972727), 'rouge2': np.float64(0.010081981236581005), 'rougeL': np.float64(0.06903890994649176), 'rougeLsum': np.float64(0.06906421498886611)}


In [82]:
def calculate_token_accuracy(predictions, references, tokenizer):
    total_tokens = 0
    correct_tokens = 0

    for pred, ref in zip(predictions, references):
        pred_tokens = tokenizer.tokenize(pred)  # Tokenize prediction
        ref_tokens = tokenizer.tokenize(ref[0])  # Tokenize reference (flatten list)

        # Compute number of matching tokens
        matches = sum(1 for p, r in zip(pred_tokens, ref_tokens) if p == r)

        correct_tokens += matches
        total_tokens += len(ref_tokens)

    # Avoid division by zero
    accuracy = (correct_tokens / total_tokens) * 100 if total_tokens > 0 else 0
    return accuracy

# Compute Token-Level Accuracy
token_accuracy = calculate_token_accuracy(predictions, references, tokenizer)

# Print the result
print(f"Token-Level Accuracy: {token_accuracy:.2f}%")

Token-Level Accuracy: 1.19%


In [57]:
import json
import torch
from evaluate import load

# Load the original pre-trained model (before fine-tuning)
original_model,original_tokenizer  = load_model(selected_model)

# Load evaluation metrics
bleu = load("bleu")
rouge = load("rouge")

# Load validation dataset
val_data = []
with open("val_dataset.json", "r", encoding="utf-8") as f:
    for line in f:
        val_data.append(json.loads(line.strip()))

# Extract inputs and references
sample_inputs = [item["essay"] for item in val_data]
references = [[item["description"]] for item in val_data]  # Ensure correct BLEU format

# Function to generate text
def generate_text(input_text):
    inputs = original_tokenizer(input_text, return_tensors="pt").to("cuda")  # Move input to GPU
    original_model.to("cuda")  # Move model to GPU
    
    with torch.no_grad():
        output = original_model.generate(**inputs, max_new_tokens=50)  # Set max_new_tokens instead
    return original_tokenizer.decode(output[0], skip_special_tokens=True)

# Generate predictions
predictions = [generate_text(text) for text in sample_inputs]

# Compute BLEU score
bleu_score = bleu.compute(predictions=predictions, references=references)

# Compute ROUGE score
rouge_score = rouge.compute(predictions=predictions, references=[ref[0] for ref in references])

# Print scores
print(f"BLEU Score: {bleu_score['bleu']:.4f}")
print(f"ROUGE Score: {rouge_score}")


Loading SmolLM (TinyLlama/TinyLlama-1.1B-Chat-v1.0)...




SmolLM loaded successfully!
BLEU Score: 0.0035
ROUGE Score: {'rouge1': np.float64(0.09475974087888797), 'rouge2': np.float64(0.010959319707140786), 'rougeL': np.float64(0.07189118712644443), 'rougeLsum': np.float64(0.07405909765944121)}


In [84]:
from huggingface_hub import HfApi

api = HfApi()


In [85]:
import ipywidgets as widgets
from IPython.display import display
from huggingface_hub import create_repo

def create_hf_repo(button):
    hf_token = token_input.value
    repo_name = repo_input.value
    
    if not hf_token or not repo_name:
        output.clear_output()
        with output:
            print("Please enter both HF Token and Repo Name.")
        return
    
    try:
        create_repo(repo_name, token=hf_token, private=False, exist_ok=True, repo_type="model")
        output.clear_output()
        with output:
            print(f"Repository '{repo_name}' created successfully!")
    except Exception as e:
        output.clear_output()
        with output:
            print(f"Error: {e}")

token_input = widgets.Password(
    description='HF Token:',
    placeholder='Enter your Hugging Face token'
)

repo_input = widgets.Text(
    description='Repo Name:',
    placeholder='Enter repository name'
)

confirm_button = widgets.Button(
    description="Confirm",
    button_style='success',
    tooltip="Click to create repository"
)

output = widgets.Output()

confirm_button.on_click(create_hf_repo)

display(token_input, repo_input, confirm_button, output)


Password(description='HF Token:', placeholder='Enter your Hugging Face token')

Text(value='', description='Repo Name:', placeholder='Enter repository name')

Button(button_style='success', description='Confirm', style=ButtonStyle(), tooltip='Click to create repository…

Output()

In [88]:
model.push_to_hub(repo_input.value, token=HF_TOKEN)
tokenizer.push_to_hub(repo_input.value, token=HF_TOKEN)

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tomoe007/test/commit/c4203675e3d23e4079e2269e924c9d26fab1c9f4', commit_message='Upload tokenizer', commit_description='', oid='c4203675e3d23e4079e2269e924c9d26fab1c9f4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tomoe007/test', endpoint='https://huggingface.co', repo_type='model', repo_id='tomoe007/test'), pr_revision=None, pr_num=None)

In [79]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = repo_input.value

# Load model and tokenizer
tokenizer = tokenizer
model = model

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Example usage
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt")

# Move inputs to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}

# Generate output
outputs = model.generate(**inputs)

# Decode and print
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Hello, how are you? i’m a student at the university of california, loma california.
