In [1]:
from transformers import AutoTokenizer, pipeline
import torch
from transformers import pipeline

In [2]:
model_id = "google/gemma-2b"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [4]:
test = tokenizer("Hi", return_tensors="pt")
print(test)

{'input_ids': tensor([[   2, 2151]]), 'attention_mask': tensor([[1, 1]])}


In [4]:
test = tokenizer("Hello, Jerry is a student of the university of Saarland. He studies Computer science.", return_tensors="pt")
print(test)


{'input_ids': tensor([[     2,   4521, 235269,  31656,    603,    476,   5913,    576,    573,
          13435,    576,  96808,   1445, 235265,   1315,   6438,  15963,   8042,
         235265]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [8]:
#Using the pipeline !

classifier = pipeline("sentiment-analysis")
classifier("In this course, we will teach you how to")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'POSITIVE', 'score': 0.9993911981582642}]

In [7]:

# getting the embeddings
from transformers import AutoModel
model = AutoModel.from_pretrained(model_id)
with torch.no_grad():  # Disable gradient calculations to save memory
    outputs = model(**test)



config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# outputs.last_hidden_state contains the embeddings for each token
embeddings = outputs.last_hidden_state
embeddings[0]

tensor([[ 0.3116, -0.6693,  0.1596,  ..., -0.1884,  0.2425,  0.1245],
        [-0.5034,  0.0232, -0.7929,  ...,  0.0115,  0.9474, -0.2379],
        [-0.2977,  0.4621,  0.2103,  ...,  0.5183,  1.1501,  0.0036],
        ...,
        [ 0.6380,  0.2522, -0.1038,  ..., -0.4299,  1.2579,  0.3517],
        [-0.3215, -0.1665,  0.0510,  ..., -0.1017,  0.9959,  0.8404],
        [-0.1254,  0.4106,  0.3781,  ...,  0.7781,  0.8971,  0.7292]])

In [None]:
from torch import tensor

example = "Hello, I am a student of the university of Saarland. I study Computer science."
output_example = "the output"

model_input = tokenizer(example, max_length=8, padding="max_length", truncation=True)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(output_example, max_length=8, padding="max_length", truncation=True)

model_input['labels'] = labels['input_ids']

#print(tokenizer.convert_ids_to_tokens(model_input["labels"]))
#print(model_input)
input = tensor(model_input['input_ids'])
print(tokenizer.decode(model_input['input_ids']))
print(type(tensor(model_input['input_ids'])))

#print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(example)))

### In the following cell, I will test the output of the tokenizer for the thesis model

In [12]:
### testing the tokenizer for the thesis model
from transformers import AutoTokenizer
from datasets import load_dataset
from torch import tensor

model_id = "google/gemma-2b"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [16]:
def tokenize_function(examples):
    instruction = "Complete the following software model by finding the missing part: "
    inputs = [instruction + inp for inp in examples['input']]
    targets = examples['output']
    max_length = 2000
    model_input = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, padding="max_length", truncation=True)

    model_input['labels'] = labels['input_ids']
    return model_input

In [17]:
# Load dataset
#train_dataset_url = "/Users/jerrytakou/University/Thesis/programming/thesis/datasets_for_fine_tuning/structural_removal_non_contiguous/processed_2000train.jsonl"
#test_dataset_url ="/Users/jerrytakou/University/Thesis/programming/thesis/datasets_for_fine_tuning/structural_removal_non_contiguous/processed_2000processed_2000/test.jsonl"
validation_dataset_url ="/Users/jerrytakou/University/Thesis/programming/thesis/datasets_for_fine_tuning/structural_removal_non_contiguous/processed_2000/validation.jsonl"


data_files = {
    #'train': train_dataset_url,
    #'test': test_dataset_url,
    'validation': validation_dataset_url
}

dataset = load_dataset('json', data_files=data_files)
validation_dataset = dataset['validation']

In [18]:
validation_data = validation_dataset.map(tokenize_function, batched=True)
input = tensor(validation_data['input_ids'])
print(tokenizer.decode(input[0]))


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

<eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos>

### In the following cell, I want to see the output of the model when it receives the tokenized input

In [23]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)

In [27]:
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
