<a href="https://colab.research.google.com/github/HofstraDoboli/TextMining/blob/main/decoder_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Decoder example for text classification - 20K news

In [1]:
!PYTORCH_CUDA_ALLOC_CONF = max_split_size_mb:128 # or expandable_segments:True

In [None]:
!pip install transformers
!pip install datasets --upgrade
!pip install transformers[torch]
!pip install evaluate
!pip install accelerate

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/TextMining/


In [3]:
# text classification reuters dataset
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
from datasets import Dataset

#train_data    = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), shuffle=True)
test_all_data = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), shuffle=True)

#train_ds    = Dataset.from_dict({"text": train_data.data, "label_id": train_data.target})
test_all_ds = Dataset.from_dict({"text": test_all_data.data, "label": test_all_data.target})

In [4]:
text_lengths = [len(text) for text in test_all_ds['text']]

average_length = sum(text_lengths) / len(text_lengths)
min_length = min(text_lengths)
max_length = max(text_lengths)

print(f"Average text length: {average_length:.2f}")
print(f"Minimum text length: {min_length}")
print(f"Maximum text length: {max_length}")

Average text length: 1096.86
Minimum text length: 0
Maximum text length: 158791


In [7]:
# Split the test_all data into validation and testing data
test_all_split = test_all_ds.train_test_split(test_size=0.5, seed=42)

val_ds  = test_all_split['train']
test_ds = test_all_split['test']

# Print the sizes of the new sets
print(f"Validation set size: {len(val_ds)}")
print(f"Testing set size: {len(test_ds)}")

Validation set size: 3766
Testing set size: 3766


In [None]:
def truncate_text(example):
    example['text'] = example['text'][:3000]
    return example

test_ds = test_ds.map(truncate_text)
print(f"Example of truncated text: {test_ds['text'][0][:100]}...")

In [9]:
# detect all labels
labels = test_all_data.target_names
num_labels = len(labels)
print(labels)
print(num_labels)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
20


In [10]:
id2label = {i: name for i, name in enumerate(labels)}
label2id = {name: i for i, name in enumerate(labels)}

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from difflib import get_close_matches # function that gets the string with the closest match
import math, sys, os
import torch

MODEL = "nvidia/Llama-3.1-Minitron-4B-Width-Base" # "google/gemma-3-1b-it" # "google/flan-t5-small"   # try "google/flan-t5-base"
device = 0 if torch.cuda.is_available() else -1

# load tokenizer + model (adjust load flags depending on memory)
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, padding_side = 'left')

# ensure pad token exists for the pipeline
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Try to load in fp16 on GPU; fallback to CPU if not available
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL,
        device_map="auto",
        dtype= torch.float16,    # requires CUDA + GPU with half precision support
    )
except Exception as e:
    print("FP16 load failed, trying default dtype:", e)
    model = AutoModelForCausalLM.from_pretrained(MODEL, device_map="auto")

# Create a text-generation pipeline (spe)
gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

print(device)

In [5]:
out = gen('How are you doing: Answer')
print(out)

[{'generated_text': "How are you doing: Answered\n\nWhat are some of the things you like to do for fun? What are some of the things you don't like to do? How do you like to spend your time off from school? When you are home on a day off, what are you likely to be doing? How about on the weekend? What do you like to do on the weekend? What do you like to do at night? How about in your free time? How do you like to spend your time when you are not at school or work?\n\nIf you could do anything, what would you want to do? What is the most interesting thing you have ever done? What is the most unusual thing you have ever done? What do you think is the most interesting thing someone has ever done? What is the strangest thing you have ever done? What is the most unusual thing you have ever seen? What is the most unusual thing you have ever eaten? What is the most unusual thing you have ever worn? What is the most unusual thing you have ever done, but you want to do again? What is the most un

In [None]:
import gc
gc.collect()

print("CUDA device:", torch.cuda.current_device())
print(torch.cuda.memory_summary(device=torch.cuda.current_device(), abbreviated=False))
# dtype check
import itertools
print("param dtype:", next(iter(model.parameters())).dtype)

In [13]:
gen_kwargs = dict(
    max_new_tokens = 5,
    do_sample = False,    # greedy
    temperature=0.0,      # deterministic
    top_k=10,              # effectively argmax
    top_p=0.9,
    eos_token_id = None,  # let pipeline handle end; you will need to strip results
    stop_sequence = '\n',
    #max_length = 2048,      # Add max_length to truncate long inputs - does not work with both
    truncation = True,
    padding = True,
    use_cache = False
)

In [14]:
def build_prompt_zero_shot(text, labels):
    labels_str = ", ".join(labels)   # short comma-separated label names
    prompt = (
        "You are a news classifier. Choose exactly one topic from the list below "
        "and respond with exactly that label (nothing else).\n\n"
        f"Labels: {labels_str}\n\n"
        f"Text: {text}\n"
        "Answer:"
    )
    return prompt

prompt = build_prompt_zero_shot(test_ds['text'][0], labels)
#print(prompt)
output = gen(prompt,**gen_kwargs)
print(output[0]['generated_text'])
print('\n\n')
print(output[0]['generated_text'].split('Answer:')[1])

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


You are a news classifier. Choose exactly one topic from the list below and respond with exactly that label (nothing else).

Labels: alt.atheism, comp.graphics, comp.os.ms-windows.misc, comp.sys.ibm.pc.hardware, comp.sys.mac.hardware, comp.windows.x, misc.forsale, rec.autos, rec.motorcycles, rec.sport.baseball, rec.sport.hockey, sci.crypt, sci.electronics, sci.med, sci.space, soc.religion.christian, talk.politics.guns, talk.politics.mideast, talk.politics.misc, talk.religion.misc

Text: 

My experience is when they pound their fists on your back it means "slow down".

Seriously, concentrate on being very smooth, and you will make her
experience much more enjoyable.  Even a normal upshift causes your
passenger to bob, so I ease off the throttle before pulling in the
clutch to eliminate this.  It's more work, but your passenger will
appreciate it!  Also, I've found that using more rear brake than normal
helps keep the bike from diving as much during routine stops, which
makes it much eas

In [15]:
# 6) helper to map output text -> canonical label
def map_to_label(generated_text, label_list):
    # basic cleaning
    s = generated_text.strip().strip('"').strip("'")
    # try exact case-insensitive match
    for lab in label_list:
        if s.lower() == lab.lower():
            return lab
    # try contains any label token
    for lab in label_list:
        if lab.lower() in s.lower():
            return lab
    # fuzzy fallback
    matches = get_close_matches(s, label_list, n=1, cutoff=0.5)
    if matches:
      return matches[0]
    else:
      return None

pred_label = map_to_label(output[0]['generated_text'].split('Answer:')[1], labels)
print(pred_label)

comp.os.ms-windows.misc


In [17]:
import time
def print_mem(tag=""): # displays the current memory usage
    dev = torch.cuda.current_device()
    print(f"[{time.strftime('%H:%M:%S')}] {tag} allocated={torch.cuda.memory_allocated(dev)/(1024**3):.3f} GiB "
          f"reserved={torch.cuda.memory_reserved(dev)/(1024**3):.3f} GiB max={torch.cuda.max_memory_allocated(dev)/(1024**3):.3f} GiB")


In [None]:

preds = []
true_labels = []
batch_size = 4 # This batch_size will now be used by the pipeline internally

# Limit the dataset to the first 32 examples for this evaluation run, as per original code.
eval_dataset_subset = test_ds # range(32))
# Extract true labels for the subset
true_labels.extend([label for label in eval_dataset_subset['label']])

num_batches = math.ceil(len(eval_dataset_subset) / batch_size)

for i in range(num_batches):
    print_mem(f"before batch {i}")
    print('Batch', i+1)
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, len(eval_dataset_subset))
    batch =    eval_dataset_subset[start_idx:end_idx]
    prompts = [build_prompt_zero_shot(text, labels) for text in batch["text"]]

    outs = gen(prompts, **gen_kwargs, batch_size=batch_size)

    for output_per_prompt in outs:
        full_generated_text = output_per_prompt[0]["generated_text"]
        if "Answer:" in full_generated_text:
            gen_part = full_generated_text.split("Answer:", 1)[1].strip()
            pred_label = map_to_label(gen_part, labels)
            preds.append(pred_label if pred_label is not None else "UNMAPPED")
        else:
            print("Answer not found in the generated text")
            preds.append("UNMAPPED") # Append UNMAPPED if answer not found

    print_mem(f"after gen batch {i}")
    del outs, prompts, batch
    torch.cuda.empty_cache()
    gc.collect()
    print_mem(f"after cleanup batch {i}")


In [22]:
print(len(test_ds['text'][40]))
# print the length of each text in test_ds between index 40 and 50
print([len(test_ds['text'][i]) for i in range(40, 50)])

226
[226, 91, 0, 158791, 380, 172, 579, 76, 754, 0]


In [None]:
!nvidia-smi -a

In [20]:
print("allocated:", torch.cuda.memory_allocated()/(1024**3), "GiB")
print("reserved: ", torch.cuda.memory_reserved()/(1024**3), "GiB")
print("max_alloc: ", torch.cuda.max_memory_allocated()/(1024**3), "GiB")

allocated: 8.413580417633057 GiB
reserved:  8.462890625 GiB
max_alloc:  9.120858669281006 GiB


In [22]:
print(preds)
print(len(preds))
true_named_labels = [labels[i] for i in true_labels]
print(true_named_labels)

['comp.os.ms-windows.misc', 'UNMAPPED', 'comp.os.ms-windows.misc', 'comp.graphics', 'UNMAPPED', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'alt.atheism', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'UNMAPPED', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'UNMAPPED', 'comp.sys.ibm.pc.hardware', 'rec.sport.baseball', 'comp.sys.ibm.pc.hardware', 'comp.sys.ibm.pc.hardware', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'alt.atheism', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.os.ms-windows.misc', 'comp.sys.mac.hardware', 'comp.os.ms-windows.misc', 'UNMAPPED', 'com

In [23]:
# compute the acuracy of the classifier
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(true_named_labels, preds)
# compute accuracy per class
report = classification_report(true_named_labels, preds)

print(report)

                          precision    recall  f1-score   support

                UNMAPPED       0.00      0.00      0.00         0
             alt.atheism       0.15      0.10      0.12       158
           comp.graphics       0.61      0.42      0.50       198
 comp.os.ms-windows.misc       0.06      0.72      0.11       210
comp.sys.ibm.pc.hardware       0.24      0.29      0.26       181
   comp.sys.mac.hardware       0.82      0.27      0.41       197
          comp.windows.x       0.50      0.02      0.04       197
            misc.forsale       0.00      0.00      0.00       201
               rec.autos       0.08      0.07      0.08       184
         rec.motorcycles       1.00      0.02      0.04       188
      rec.sport.baseball       0.12      0.14      0.13       186
        rec.sport.hockey       0.78      0.10      0.18       205
               sci.crypt       1.00      0.01      0.02       193
         sci.electronics       1.00      0.01      0.02       210
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
import pandas as pd

# Ensure true_named_labels corresponds to the evaluated subset
true_named_labels_subset = [labels[i] for i in eval_dataset_subset['label']]

# Create a DataFrame from the true and predicted labels
predictions_df = pd.DataFrame({
    'True Label': true_named_labels_subset,
    'Predicted Label': preds
})

# Save the DataFrame to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

print("Predictions saved to 'predictions.csv'")
print(predictions_df.head())

Predictions saved to 'predictions.csv'
         True Label          Predicted Label
0   rec.motorcycles  comp.os.ms-windows.misc
1       alt.atheism                 UNMAPPED
2      misc.forsale  comp.os.ms-windows.misc
3     comp.graphics            comp.graphics
4  rec.sport.hockey                 UNMAPPED


In [None]:
# 8) quick display of some examples
for i in range(10):
    print("TEXT:", test_ds['text'][i][:100].replace("\n"," "))
    print("PRED:", preds[i])
    print("GOLD:", labels[true_labels[i]])
    print("---")

TEXT:   My experience is when they pound their fists on your back it means "slow down".  Seriously, concen
PRED: rec.sport.baseball
GOLD: rec.motorcycles
---
TEXT: : In article <C5Mw03.9qr@darkside.osrhe.uoknor.edu>, bil@okcforum.osrhe.edu : > I'd say that what on
PRED: misc.forsale
GOLD: alt.atheism
---
TEXT: 
PRED: alt.atheism
GOLD: misc.forsale
---
TEXT:     The alt.* hierarchie is created for 2 purposes: 1. For groups which do not fit under the comp.* 
PRED: comp.graphics
GOLD: comp.graphics
---
TEXT: Again I assume this is not just flame bait by Roger, but actually a truly held opinion.     Thanks. 
PRED: comp.sys.ibm.pc.hardware
GOLD: rec.sport.hockey
---
TEXT: I.D. Benham, on the Wed, 21 Apr 1993 17:11:39 GMT wibbled: : Hi, :    I'm now in the market for buyi
PRED: comp.sys.ibm.pc.hardware
GOLD: rec.motorcycles
---
TEXT:  Robert McElwaine is the authoritative source of scientific data on Internet. He can be reached alt.
PRED: alt.atheism
GOLD: sci.space
---
TEXT: Elias Davidsson