<h1>HuggingFace demo</h1>

In [1]:
! nvidia-smi

Thu May  1 10:51:13 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce 940MX           Off |   00000000:01:00.0 Off |                  N/A |
| N/A   48C    P8             N/A /  200W |       5MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
! pip install transformers



In [3]:
import torch

<h1>Hugging Face Tasks</h1>

<h3>I. NLP Tasks</h3>

In [4]:
from transformers import pipeline

In [None]:
"""
1. Text Classification: Assign a catergory to a piece of text.
Sentiment Analysis
Topic Classification
"""

classifier = pipeline('text-classification')

"""
2. Token Classification: Assigning labels to individual tokens in a sequence.
Named Entity Recognition (NER)
Part-of-Speech Tagging
"""
token_classifier = pipeline('token-classification')

"""
3. Question Answering: Extracting an answer from a given context based on a question
"""
question_answerer = pipeline('question-answering')

"""
4. Text Generation: Generating text based on a given prompt.
Language Modeling
Story Generation
"""
text_generator = pipeline('text-generation')

"""
5. Summarization: Condensing long documents into shorter summaries
"""
summarier = pipeline("summarization")

"""
6. Translation: Translating text from one language to another.
"""
text2text_generator = pipeline('text2text-generation')

"""
7. Fill-Mask: Predicting the masked token in a sequence
"""
fill_mask = pipeline('fill-mask')

"""
8. Feature Extraction: Extracting hidden states of features from text.
"""
feature_extractor = pipeline('feature_extraction')

"""
9. Sentence Similarity: Measuring the similarity between two sentences
"""
sentence_similarity = pipeline('sentence-similarity')

<h3>II. Computer Vision Tasks</h3>

In [None]:
"""
1. Image Classification: Classifying the main content of an image.
"""
image_classifier = pipeline('image-classification')

"""
2. Object Detection: Identifying objects within an image and their bounding boxes.
"""
object_detector = pipeline('object-detection')

"""
3. Image Segmentation: Sementing different parts of an image into classes.
"""
image_segmenter = pipeline('image-segmentation')

"""
4. Image Generation: Generating images from textual descriptions (using DALL-E or similar models).
"""

<h3>III. Speech Processing TASKS</h3>

In [None]:
"""
1. Utomatic Speech Recognition (ASR): Convering spoken language into text.
"""
speech_recognizer = pipeline("automatic-speech-recognition")
"""
2. Speech Translation: Translationg spoken language from one language to another.
3. Audio Classification: Classifying audio signals into predefined catergories.
"""

<h3>IV. Multimodal TASKS</h3>

In [None]:
"""
1. Image Captioning: Generating a textual description of an image.
"""
image_captioner = pipeline('image-to-text')

"""
2. Visual Question Answering (VQA): Answering questions about the content of an image.
"""

<h3>V. Other TASKS</h3>

In [None]:
"""
1. Table Question Answering: Answering questions based on tabular data.
"""
table_qa = pipeline("table-question-answering")

"""
2. Document Question Answering: Extracting answers from documents like PDFs.
"""
doc_qa = pipeline("document-question-answring")

"""
3. Time Series Forecasting: Predicting futures values in time series data(not directly supported in the main)
"""

<h3>NLP Tasks</h3>
1. Sentiment Analysis

In [1]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis')

result = classifier('I very very happy with my love')

print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998840093612671}]


In [2]:
pipeline(task="sentiment-analysis", model="facebook/bart-large-mnli")("Everyday lots of LLMs papers are publised about LLMs Evaluation. Lots of them Looks very Promising. I am not sure if we can actually evaluate LLMs. There is still lots to do. Don't you think.")

[{'label': 'neutral', 'score': 0.9725003242492676}]

<h1>Fine tunning IMDB</h1>

In [7]:
! pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [1]:
from datasets import load_dataset
dataset = load_dataset('imdb')

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 187257.42 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 188776.83 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 185854.46 examples/s]


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

<h4>Preprocess Data</h4>

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')



In [5]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 25000/25000 [00:17<00:00, 1441.96 examples/s]
Map: 100%|██████████| 25000/25000 [00:15<00:00, 1651.60 examples/s]
Map: 100%|██████████| 50000/50000 [00:34<00:00, 1457.43 examples/s]


In [6]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})