In [None]:
import transformers

Domain adaptation starts here.
The workflow below is adapted from the Hugging Face course, Chapter 7, Section 3 ("Fine-tuning a masked language model"): https://huggingface.co/course/chapter7/3?fw=pt

In [None]:
### Load the raw (uncleaned) text, kept as-is for the DistilBERT comparison
with open("Sun_Also_Rises.txt") as source_file:
    text2 = source_file.read()

# len() of a str counts characters, so this reports the character count
item_count = len(text2)
print('The text has {:,} separate items'.format(item_count))


In [None]:
# Split the raw text at a fixed character offset. Both slices share the same
# index so that no character is dropped at the boundary (slicing with
# [:k] and [k + 1:] would silently lose the character at position k).
split_index = 91462
first_half2 = text2[:split_index]
second_half2 = text2[split_index:]

In [None]:
# Persist the first slice of the raw text. The context manager guarantees the
# file is flushed and closed even if the write raises.
with open("first_half_unclean.txt", "w") as f:
    f.write(first_half2)

In [None]:
### Split the raw text into whitespace-separated words, the first step toward
### the column (CSV) format used for the DistilBERT training data.
### NOTE(review): the earlier comment here referred to "the first half", but this
### line splits the full `text2`, not `first_half2` — confirm which is intended.
test=text2.split()

In [None]:
# Number of words placed on each row.
n = 5
# Slice the word list into consecutive rows of n words; the final row may be
# shorter when len(test) is not a multiple of n.
result = [test[i:i + n] for i in range(0, len(test), n)]
# Show a short preview instead of printing every row, which would flood the
# notebook output.
print(f"{len(result):,} rows of up to {n} words")
print(result[:5])

In [None]:
### Write one chunk of words per CSV row under a single "text" column, which is
### the column name the tokenization step reads (and then removes) later on.
import csv
# newline="" is what the csv module requires so it can control line endings itself.
with open("SAR_unclean.csv", "w", newline="") as f:
    wr = csv.writer(f)
    # Header row: without it the first data row would be taken as column names.
    wr.writerow(["text"])
    # Join each chunk into one field; the writer quotes any field that contains
    # a comma, so punctuation in the text cannot split a row into extra columns.
    wr.writerows([" ".join(row)] for row in result)

In [None]:
from transformers import AutoModelForMaskedLM

# Checkpoint name shared by the model and the tokenizer loaded from it.
model_checkpoint = "distilbert-base-uncased"
# Load the checkpoint with a masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import AutoTokenizer

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
from transformers import DataCollatorWithPadding

# NOTE(review): `data_collator` is rebound to a DataCollatorForLanguageModeling
# further down before the Trainer is built, so this padding collator appears to
# be unused — confirm whether this cell is still needed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import torch

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the training arguments below set
# push_to_hub=True, which needs an authenticated session.
notebook_login()


In [None]:
# Report the loaded model's size in millions of parameters.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
# The BERT figure is a hard-coded literal for comparison; it is not computed here.
print(f"'>>> BERT number of parameters: 110M'")

In [None]:
### Sample sentence with one [MASK] token, used below to sanity-check the model
text = "This is a lovely [MASK]."

In [None]:
### Sanity check: ask the pretrained DistilBERT to fill in the [MASK] token
inputs = tokenizer(text, return_tensors="pt")
model_output = model(**inputs)
token_logits = model_output.logits

# Positions (along the sequence axis) where the input holds the [MASK] id
is_mask = inputs["input_ids"] == tokenizer.mask_token_id
mask_positions = torch.where(is_mask)
mask_token_index = mask_positions[1]

# Vocabulary logits at the masked position(s) of the first (only) sequence
mask_token_logits = token_logits[0, mask_token_index, :]

# Keep the five highest-scoring vocabulary ids for the first [MASK]
top_candidates = torch.topk(mask_token_logits, 5, dim=1)
top_5_tokens = top_candidates.indices[0].tolist()

# Print the sentence once per candidate, with the candidate substituted in
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

In [None]:
from datasets import load_dataset
# Load the CSV written earlier as a dataset; its rows are accessed later
# through the "train" key. The bare `ds` displays a summary.
ds = load_dataset('csv', data_files="SAR_unclean.csv")
ds

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Args:
        examples: Batch mapping that provides the raw strings under "text".

    Returns:
        The tokenizer's output for the batch. When the tokenizer is a fast
        one, a "word_ids" entry is added holding the word ids of every
        sequence in the batch.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # word_ids() is only requested when the tokenizer reports is_fast
        return encoded

    batch_len = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(batch_len)]
    return encoded


# batched=True hands tokenize_function a batch of rows per call; the raw
# "text" column is removed so only the tokenizer outputs remain.
tokenized_datasets = ds.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

In [None]:
# Number of tokens per training chunk; read by group_texts below.
chunk_size = 128

In [None]:
def group_texts(examples, chunk_len=None):
    """Concatenate a batch of tokenized examples and re-split it into equal chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists.
            Must contain an "input_ids" column.
        chunk_len: Number of items per chunk. When omitted, the notebook-level
            `chunk_size` is used, so existing single-argument callers
            (e.g. `Dataset.map`) behave as before.

    Returns:
        A dict with the same columns, each holding chunks of exactly
        `chunk_len` items, plus a "labels" column that is a copy of
        "input_ids". A trailing remainder shorter than `chunk_len` is dropped.
    """
    from itertools import chain

    if chunk_len is None:
        chunk_len = chunk_size

    # Flatten every column into one long list. chain.from_iterable does this
    # in linear time, whereas sum(lists, []) re-copies the accumulator on
    # every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # The first column's flattened length is used for all columns.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of chunks; the short tail is discarded.
    total_length = (total_length // chunk_len) * chunk_len
    result = {
        k: [t[i : i + chunk_len] for i in range(0, total_length, chunk_len)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Apply group_texts batch-wise to produce the chunked rows (with "labels").
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked language modelling, configured with mlm_probability=0.15.
# This rebinds `data_collator`, replacing the DataCollatorWithPadding instance
# created in an earlier cell; this is the collator handed to the Trainer.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
### These sizes depend on the number of grouped rows available.
### They are passed to train_test_split below as integers (904 + 100 = 1004 rows
### in total); presumably they must not exceed the number of rows in
### lm_datasets["train"] — adjust them if the input text changes.
train_size = 904
test_size = 100

In [None]:
## Split the grouped rows into a train and a test set of the sizes chosen above.
## seed=42 is passed so the split can be reproduced.
first_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
first_dataset

In [None]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch: log once per pass over the training
# set. Clamped to at least 1 because a training set smaller than batch_size
# would otherwise yield 0, which is not a usable logging interval.
logging_steps = max(1, len(first_dataset["train"]) // batch_size)
# Last path component of the checkpoint name (not used in the arguments below).
model_name = model_checkpoint.split("/")[-1]

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="DistilBERT_Hemingway_SAR",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=False,
    logging_steps=logging_steps,
)

In [None]:
from transformers import Trainer

In [None]:
### Be sure to get your git-lfs in place first or warnings will appear
### Build the Trainer from the model, the training arguments, the train/test
### split and the masked-language-modelling collator defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=first_dataset["train"],
    eval_dataset=first_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [None]:
### Baseline: perplexity of the pretrained model, before any fine-tuning
import math

eval_results = trainer.evaluate()
# Perplexity is the exponential of the evaluation loss
perplexity = math.exp(eval_results['eval_loss'])
print(f">>> Perplexity: {perplexity:.2f}")

In [None]:
### actually run the trainer
trainer.train()

In [None]:
## Same measurement again, now on the fine-tuned model
eval_results = trainer.evaluate()
# Perplexity is the exponential of the evaluation loss
perplexity = math.exp(eval_results['eval_loss'])
print(f">>> Perplexity: {perplexity:.2f}")

In [None]:
### Saving a model
from transformers import BertModel

In [None]:
# Save the model to the same directory name used as output_dir in the
# training arguments.
trainer.save_model("./DistilBERT_Hemingway_SAR")

In [None]:
# Save the tokenizer files alongside the model in the same directory.
tokenizer.save_pretrained("./DistilBERT_Hemingway_SAR")