<h1>Fine-tuning and Embeddings</h1>

<h3>1. Load the IMDB dataset:</h3>

In [1]:
from datasets import load_dataset

dataset = load_dataset('imdb')

<h3>2. Select a smaller subset of the dataset to apeed up this demonstration</h3>

In [2]:
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(200))  # Select 200 samples
small_test_dataset = dataset['test'].shuffle(seed=42).select(range(100))    # Select 100 samples

<h3>3. Preprocess the data</h3>

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)  # Limit to 128 tokens

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)

tokenized_test = small_test_dataset.map(tokenize_function, batched=True)

<h3>4. Set up a data collator for dynamic padding:</h3>

In [4]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

<h3>5. Load the Model</h3>

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(model_name)

<h3>6. Set up training arguments</h3>

In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Smaller batch size can help on CPU
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Reduce to 1 epoch
    weight_decay=0.01,
    logging_dir='./logs',
)

<h3>7. Initialize the trainer</h3>

In [7]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

<h3>8. Train the model</h3>

In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.529444


TrainOutput(global_step=25, training_loss=0.7088493347167969, metrics={'train_runtime': 53.0644, 'train_samples_per_second': 3.769, 'train_steps_per_second': 0.471, 'total_flos': 6623369932800.0, 'train_loss': 0.7088493347167969, 'epoch': 1.0})

<h3>9. Evaluate the Model</h3>

In [9]:
trainer.evaluate()

{'eval_loss': 0.52944415807724,
 'eval_runtime': 6.6935,
 'eval_samples_per_second': 14.94,
 'eval_steps_per_second': 1.942,
 'epoch': 1.0}

<h3>10. Save the Model</h3>

In [10]:
model.save_pretrained('./models/fine_tuned_model')

tokenizer.save_pretrained('./models/fine_tuned_model')

('./models/fine_tuned_model\\tokenizer_config.json',
 './models/fine_tuned_model\\special_tokens_map.json',
 './models/fine_tuned_model\\vocab.txt',
 './models/fine_tuned_model\\added_tokens.json',
 './models/fine_tuned_model\\tokenizer.json')

<h3>11. Loading and using the saved model</h3>

In [11]:
from transformers import pipeline

# Create the classifier
classifier = pipeline(task="sentiment-analysis", model="./models/fine_tuned_model")

# Classify the text

pos_text_example = tokenized_test[0]['text']
results = classifier(inputs=pos_text_example)
print(results)

neg_text_example = tokenized_test[2]['text']
results = classifier(inputs=neg_text_example)
print(results)

[{'label': 'POSITIVE', 'score': 0.9915721416473389}]
[{'label': 'NEGATIVE', 'score': 0.9846765398979187}]
