# Fine-tuning a model

In [None]:
# NOTE(review): `query` is not used anywhere in this cell and looks like an
# accidental IDE auto-import; it also makes the notebook depend on sympy.
# Confirm and remove.
from sympy.polys.polyconfig import query
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Load the pretrained checkpoint with a sequence-classification head, plus the
# tokenizer that belongs to the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# NOTE(review): `dataset` is not defined in the visible cells. Presumably it is
# a DatasetDict with "train"/"test" splits and a "text" column (both splits are
# indexed below) - confirm and load it explicitly before this cell.
#
# Pad and truncate every example to one fixed length: no tokenizer or collator
# is handed to the Trainer, so ragged token lists could not be stacked into a
# batch, and over-long texts would exceed the model's maximum input length.
# batched=True lets the tokenizer process many rows per call.
dataset = dataset.map(
    lambda batch: tokenizer(batch["text"], padding="max_length", truncation=True),
    batched=True,
)

# Only the output directory is set; every other training option keeps its default.
training_args = TrainingArguments(
    output_dir="./results"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test']
)

trainer.train()
local_path = "./fine_tuned_model"

trainer.save_model(local_path)
# Store the tokenizer next to the weights so the directory can be reloaded on
# its own with from_pretrained(local_path).
tokenizer.save_pretrained(local_path)


# Text generation

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load GPT-2 and its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "the weather is so"

# Call the tokenizer directly (rather than .encode) so the attention mask is
# returned alongside the token ids.
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"]

# The pad token is set to the EOS token below, so generate() cannot infer the
# attention mask from the ids on its own; pass it explicitly.
output = model.generate(
    input_ids,
    attention_mask=inputs["attention_mask"],
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the single returned sequence back to text, dropping special tokens.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

generated_text

'the weather is so bad, I\'m going to go to the beach and swim."\n\nThe man, who asked'

In [7]:
# generate text from image
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image

# The processor and the model are loaded from the same checkpoint; the
# processor is used both to prepare the image and to decode the generated ids.
processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

image = Image.open("elephant.jpeg")

# Convert the image into PyTorch pixel tensors for the model.
pixel_values = processor(images=image, return_tensors="pt")["pixel_values"]

# Generate caption token ids, capped at 50 tokens.
generated_ids = model.generate(
    pixel_values=pixel_values,
    max_length=50,
)

# batch_decode returns one string per generated sequence.
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)

generated_caption[0]

preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/707M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, if both images and text are provided, the last token (EOS token) of the input_ids and attention_mask tensors will be removed. To test the new behavior, set `legacy=False`as a processor call argument.


model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

'a baby elephant walking through a grassy field.'

# Embeddings

In [8]:
# pip install sentence-transformers
from sentence_transformers import SentenceTransformer
# Checkpoint name and the text to embed.
model_name = "all-MiniLM-L6-v2"
sentence = "what are embeddings?"

embedder = SentenceTransformer(model_name)

# encode() is given a one-element list, so the result holds a single row.
embeddings = embedder.encode([sentence])

embeddings

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

array([[-1.73369551e-03, -8.55376348e-02, -1.51369497e-02,
        -1.05205299e-02,  2.52214298e-02,  7.31167570e-02,
        -2.82051302e-02,  4.29564709e-04,  7.04325959e-02,
        -2.81849056e-02,  2.75691524e-02,  3.37372161e-02,
         4.14264686e-02,  7.12580374e-03, -6.59378916e-02,
         2.29259152e-02,  5.05346842e-02,  6.68734089e-02,
        -8.28059837e-02,  2.64009293e-02, -1.94520783e-02,
        -1.40376315e-02, -6.77151047e-03, -7.95342475e-02,
         6.00138977e-02, -2.32633092e-02, -5.62363267e-02,
         4.81321849e-02,  7.38315284e-02, -3.44408788e-02,
         3.29989232e-02, -3.28794383e-02, -1.85335539e-02,
         5.60518503e-02, -5.12580276e-02,  1.09540708e-01,
         1.84110869e-02,  4.26630164e-03, -7.90110528e-02,
         4.40340163e-03,  1.67858284e-02,  1.99063402e-02,
        -3.60956863e-02,  5.55006899e-02,  6.40486628e-02,
        -3.67567390e-02, -2.81933956e-02, -4.05343622e-02,
        -1.05584592e-01,  2.39179581e-02, -3.28660868e-0

In [9]:
# Show the dimensions of the embeddings array computed for the single sentence.
embeddings.shape

(1, 384)

# Semantic search

In [None]:
from sentence_transformers import SentenceTransformer

# The search query and the checkpoint used to embed both corpus and query.
query = "what are the most recent wildlife articles?"
model_name = "all-MiniLM-L6-v2"

encoder = SentenceTransformer(model_name)

# NOTE(review): `documents` is not defined in the visible cells - presumably a
# list of strings to search over; confirm it is created before this cell runs.
document_embeddings = encoder.encode(documents)

# The query is wrapped in a list, mirroring how the documents are encoded.
query_embeddings = encoder.encode([query])



In [None]:
from sentence_transformers import util

# Search the document embeddings with the query embeddings, keeping the
# top 2 hits per query.
hits = util.semantic_search(
    query_embeddings,
    document_embeddings,
    top_k=2,
)