<a href="https://colab.research.google.com/github/HofstraDoboli/TextMining/blob/main/encoder_classification_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install -U "transformers[torch]" datasets evaluate accelerate

In [2]:
# Text classification on the 20 Newsgroups dataset (loaded through scikit-learn)
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
from datasets import Dataset

# Fetch the predefined train and test subsets; the `remove` argument asks the
# loader to drop headers, footers and quotes from every post.
train_data    = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), shuffle=True)
test_all_data = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), shuffle=True)

# Wrap the raw texts and their integer targets in Hugging Face Datasets
# with two columns: "text" and "label_id".
train_ds    = Dataset.from_dict({"text": train_data.data, "label_id": train_data.target})
test_all_ds = Dataset.from_dict({"text": test_all_data.data, "label_id": test_all_data.target})

In [7]:
# Quick look at the object returned by fetch_20newsgroups, one item per output line:
#   1. its public attributes (data = texts, target = numeric labels)
#   2. the largest numeric label (train_data.target_names holds the readable names)
#   3. the first document's text
#   4. the number of training samples
#   5. the container type used for the texts
print(
    dir(train_data),
    max(train_data.target),
    train_data.data[0],
    len(train_data.data),
    type(train_data.data),
    sep="\n",
)

['DESCR', 'data', 'filenames', 'target', 'target_names']
19
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.
11314
<class 'list'>


In [3]:
# Cut the original test subset in half: one half becomes the validation set,
# the other the final test set. A fixed seed keeps the split repeatable.
test_all_split = test_all_ds.train_test_split(seed=42, test_size=0.5)
val_ds, test_ds = (test_all_split[part] for part in ("train", "test"))

# Report how many samples ended up in each half
print(
    f"Validation set size: {len(val_ds)}",
    f"Testing set size: {len(test_ds)}",
    sep="\n",
)

Validation set size: 3766
Testing set size: 3766


In [4]:
# Collect the human-readable class names and count them
labels = train_data.target_names
num_labels = len(labels)

# Show the names first, then the count, each on its own line
print(labels, num_labels, sep="\n")

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
20


In [5]:
# Two-way lookup between integer class ids and class names.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}


In [None]:
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoConfig, TrainingArguments, Trainer
import numpy as np
import evaluate

MODEL_NAME = "distilbert/distilbert-base-uncased"   # <-- replace with another encoder checkpoint, e.g. "google-bert/bert-base-uncased", "albert-base-v2", "distilbert/distilbert-base-cased", "roberta-base", "microsoft/deberta-v3-base", etc.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True) # use_fast = request the fast (Rust-backed) tokenizer implementation

# Configure the model for this task: one output per newsgroup (num_labels),
# with the id <-> name mappings stored on the config so predictions can be
# reported with readable label names.
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=num_labels)
config.id2label = id2label
config.label2id = label2id
print(config)

In [None]:
# NOTE: max_length is defined but NOT passed to the tokenizer below, so it has
# no effect on truncation as the code stands.
max_length = 256


def preprocess(data):
    """Tokenize the "text" field of a batch.

    `data` is indexed with the key "text" and the result is handed to the
    module-level `tokenizer` with truncation=True. No explicit max_length is
    given, so the truncation limit is left to the tokenizer's own default.
    Returns whatever the tokenizer call returns.
    """
    batch_texts = data["text"]
    return tokenizer(batch_texts, truncation=True)

# Tokenize every split in batched mode. No columns are removed, so the raw
# "text" column stays next to the new tokenizer outputs.
train_token, val_token, test_token = (
    split.map(preprocess, batched=True)
    for split in (train_ds, val_ds, test_ds)
)

In [9]:
# Rename the target column from "label_id" to "labels".
# The rename is skipped for a split that no longer has a "label_id" column, so
# re-running this cell is harmless instead of raising on the missing column.
train_token, val_token, test_token = (
    split.rename_column("label_id", "labels") if "label_id" in split.column_names else split
    for split in (train_token, val_token, test_token)
)

In [10]:
# Display the tokenized validation split (its columns and number of rows)
val_token

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 3766
})

In [16]:
# For the first validation example, show the first and last 20 entries of the
# token ids, followed by the first and last 20 entries of the attention mask.
print(
    val_token['input_ids'][0][:20],
    val_token['input_ids'][0][-20:],
    val_token['attention_mask'][0][:20],
    val_token['attention_mask'][0][-20:],
    sep="\n",
)

[101, 1011, 1008, 1011, 1011, 1011, 1011, 2521, 2013, 2108, 1000, 7463, 2041, 1010, 1000, 1996, 24131, 2024, 2579, 1010]
[2515, 1996, 3793, 3073, 3350, 1029, 1996, 18707, 1997, 1996, 8771, 1997, 1996, 24404, 18774, 2072, 1010, 3839, 1997, 102]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# 4) Metrics
import numpy as np
import evaluate

# Load the four metric objects used by compute_metrics
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy plus macro- and weighted-averaged precision, recall and F1.

    Parameters
    ----------
    eval_pred
        A pair that unpacks into ``(logits, labels)``. If ``logits`` is a
        tuple, its first element is used (assumed to hold the class scores,
        as in the Hugging Face example scripts).

    Returns
    -------
    dict
        Keys, in order: ``accuracy``, ``precision_macro``, ``recall_macro``,
        ``f1_macro``, ``precision_weighted``, ``recall_weighted``,
        ``f1_weighted``.
    """
    logits, labels = eval_pred
    # Some models return several prediction arrays packed in a tuple; argmax
    # over such a tuple would be meaningless, so keep only the first entry.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
    }
    # Same three metrics under both averaging schemes; the loop order keeps the
    # key order identical to the hand-written version (macro first, then weighted).
    for average in ("macro", "weighted"):
        for name, metric in (("precision", precision), ("recall", recall), ("f1", f1)):
            result = metric.compute(predictions=preds, references=labels, average=average)
            scores[f"{name}_{average}"] = result[name]
    return scores


In [22]:
import torch
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Directory where all checkpoints, logs, and the final model are saved
    output_dir = "out",

    # How often to evaluate on the validation set
    # "epoch" -> run evaluation after each full pass over the training data
    eval_strategy = "epoch",

    # When to save model checkpoints
    # "epoch" -> save after each epoch; can also be "steps" to save every N steps
    save_strategy="epoch",

    # Initial learning rate for the optimizer
    learning_rate=2e-5,   # typical for BERT/DeBERTa/RoBERTa fine-tuning

    # Training and evaluation batch sizes (per device, not total)
    per_device_train_batch_size = 32,
    per_device_eval_batch_size  = 64,

    # Number of epochs (full dataset passes)
    num_train_epochs = 3,

    # Weight decay (regularization on model parameters)
    # helps prevent overfitting
    weight_decay = 0.01,

    # Automatically reload the best model checkpoint at the end of training
    load_best_model_at_end = True,

    # Which metric to monitor for best-model selection
    # Should match one of the keys returned by `compute_metrics`
    metric_for_best_model = "f1_macro",

    # Use 16-bit (half-precision) floating point only when a CUDA GPU is present.
    # Hard-coding True makes this cell fail on CPU-only runtimes.
    fp16 = torch.cuda.is_available(),

    # Whether to push checkpoints to the Hugging Face Hub automatically
    push_to_hub = False,

    # Disable logging to Weights & Biases and other services
    report_to = "none",
)

# Fine-tuning, evaluation, and inference

In [18]:
# Build a padding collator around the tokenizer. The texts were tokenized
# without padding, so examples must be padded when they are grouped into batches.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
# load the model
# Mark the task as single-label classification on the shared config before the
# model is built from it.
config.problem_type = "single_label_classification"
# Load the MODEL_NAME checkpoint with this config. The message printed below
# lists the classifier weights that are newly initialized rather than loaded.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# 6) Trainer
from transformers import Trainer, AutoTokenizer

trainer = Trainer(
    # The model to fine-tune
    # This should be an instance of a model class such as AutoModelForSequenceClassification
    model = model,

    # All hyperparameters and training settings
    # This includes learning rate, batch size, number of epochs, fp16, etc.
    args = training_args,

    # Dataset used for training
    # Should be a Hugging Face Dataset (or DatasetDict["train"])
    train_dataset = train_token,

    # Dataset used for evaluation
    # The Trainer automatically runs evaluation at the intervals defined in TrainingArguments
    eval_dataset = val_token,

    # Tokenizer associated with your model
    # Ensures that new data or predictions can be encoded/decoded correctly
    # Use processing_class instead of tokenizer in newer versions of transformers
    processing_class = tokenizer,

    # Padding collator created earlier in the notebook; passing it explicitly
    # makes the batching behaviour visible instead of leaving that object unused
    data_collator = data_collator,

    # Function that computes evaluation metrics (accuracy, f1, etc.)
    # The Trainer calls this after each evaluation step and logs the results
    compute_metrics = compute_metrics,
)

In [21]:
# 7) Train
# Runs fine-tuning with the settings in training_args. The per-epoch table and
# the returned TrainOutput are displayed below as the cell's result.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Weighted,Recall Weighted,F1 Weighted
1,No log,1.013361,0.703133,0.684195,0.685142,0.677552,0.698435,0.703133,0.69496
2,0.677800,1.025231,0.704992,0.693955,0.687472,0.684617,0.706855,0.704992,0.699998
3,0.485300,1.013008,0.708444,0.697068,0.692654,0.692173,0.709277,0.708444,0.70624


TrainOutput(global_step=1062, training_loss=0.5712700859975006, metrics={'train_runtime': 278.2826, 'train_samples_per_second': 121.97, 'train_steps_per_second': 3.816, 'total_flos': 4471704649125120.0, 'train_loss': 0.5712700859975006, 'epoch': 3.0})

In [19]:
# Change into the Google Drive course folder (the model is saved by a later cell,
# using a path relative to this directory) and list its contents.
# NOTE(review): assumes Google Drive is already mounted at /content/drive; no
# mount step is visible in this notebook - confirm before Run All.
%cd '/content/drive/MyDrive/TextMining'
%ls

/content/drive/MyDrive/TextMining
 baseline_sample_imdb_nb_svm.ipynb      labels_train.pickle
 bert_sentiment_class.ipynb             movie_clustering_agglomerative.ipynb
 [0m[01;34mBestModel_20news[0m/                      movie_clustering.ipynb
 bulletin_hofstra_extract_html.ipynb    movie_clustering_large.ipynb
 counter_movies_F24.npz                 movie_clustering_notebook.ipynb
 count_vectorizer_movies_F24.pkl        movie_clustering_notebook_start.ipynb
 count_vect_top20                       movie_embed.npy
 count_vect_top20.pkl                   movie_feedback_retrieval.ipynb
 [01;34mDataSets[0m/                              naive_bayes_20_news.ipynb
 demo_spacy.ipynb                       naive_bayes_sentiment_class.ipynb
 encoder_classification_example.ipynb   naive_bayes_start.ipynb
 example_tf_idf.py                      notebook1_nlp_intro.ipynb
 faiss.ipynb                            page_rank.ipynb
 feedback_retrieval_class.ipynb         probabilistic.ipynb
 file_

In [28]:
# Save the Trainer's current model under BestModel_20news/best_model, relative
# to the current working directory (changed by the %cd cell above).
trainer.save_model("BestModel_20news/best_model")

In [20]:
# List the files written by the save step
%ls "BestModel_20news/best_model"

config.json        special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


In [21]:
# load saved model
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Rebind `model` and `tokenizer` to the objects loaded from the saved directory.
# NOTE(review): this only rebinds the notebook-level names; the existing
# `trainer` object keeps the model it was constructed with.
model = AutoModelForSequenceClassification.from_pretrained("BestModel_20news/best_model")
tokenizer = AutoTokenizer.from_pretrained("BestModel_20news/best_model")

In [None]:
# Evaluate on the held-out test split. This uses the model held by `trainer`;
# rebinding the `model` variable in another cell does not change it.
eval_test = trainer.evaluate(test_token)

In [25]:
# Display the test-set metrics dictionary
eval_test

{'eval_loss': 1.0916513204574585,
 'eval_model_preparation_time': 0.0019,
 'eval_accuracy': 0.6999468932554435,
 'eval_precision_macro': 0.6922633096542422,
 'eval_recall_macro': 0.6871934053776215,
 'eval_f1_macro': 0.6867454716417745,
 'eval_precision_weighted': 0.7037363387876138,
 'eval_recall_weighted': 0.6999468932554435,
 'eval_f1_weighted': 0.6990686497621589,
 'eval_runtime': 9.4774,
 'eval_samples_per_second': 397.367,
 'eval_steps_per_second': 6.225}

In [30]:
from transformers import pipeline

# Wrap the in-memory model and tokenizer in a text-classification pipeline
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

# Other sample inputs to try:
#text = "Carlos Alcaraz stays perfect at ATP Finals with thrilling win over Taylor Fritz."
#text = "Carolina Hurricanes defenseman Charles Alexis Legault is recovering from surgery to repair lacerations to his hand from a skate blade, the team said Tuesday."
text = "What should i buy, a PC or a mac?"

# Classify the sentence and print the predicted label with its score
result = classifier(text)
print(result)


Device set to use cuda:0


[{'label': 'comp.sys.mac.hardware', 'score': 0.6038538813591003}]


In [None]:

# 8) Save final model
# Writes the Trainer's model to "final_model", relative to the current working
# directory (which an earlier %cd cell changed).
# NOTE(review): the Trainer's model was already saved to
# BestModel_20news/best_model earlier; this looks like a second copy - confirm it is needed.
trainer.save_model("final_model")

In [29]:
# Display the id -> label-name mapping for reference when reading predictions
id2label

{0: 'alt.atheism',
 1: 'comp.graphics',
 2: 'comp.os.ms-windows.misc',
 3: 'comp.sys.ibm.pc.hardware',
 4: 'comp.sys.mac.hardware',
 5: 'comp.windows.x',
 6: 'misc.forsale',
 7: 'rec.autos',
 8: 'rec.motorcycles',
 9: 'rec.sport.baseball',
 10: 'rec.sport.hockey',
 11: 'sci.crypt',
 12: 'sci.electronics',
 13: 'sci.med',
 14: 'sci.space',
 15: 'soc.religion.christian',
 16: 'talk.politics.guns',
 17: 'talk.politics.mideast',
 18: 'talk.politics.misc',
 19: 'talk.religion.misc'}