## Using Hugging Face Hub

In [1]:
from huggingface_hub import HfApi

In [2]:
# Create the Hub API client; the next cell uses it to query model listings.
api = HfApi()

In [3]:
# Preview only a handful of models. Calling list_models() without a limit and
# wrapping it in list() walks the whole Hub catalogue, which is slow and floods
# the cell output with every ModelInfo record.
list(api.list_models(limit=5))

[ModelInfo(id='perplexity-ai/r1-1776', author=None, sha=None, created_at=datetime.datetime(2025, 2, 18, 0, 13, 5, tzinfo=datetime.timezone.utc), last_modified=None, private=False, disabled=None, downloads=6438, downloads_all_time=None, gated=None, gguf=None, inference=None, inference_provider_mapping=None, likes=1369, library_name=None, tags=['safetensors', 'deepseek_v3', 'custom_code', 'base_model:deepseek-ai/DeepSeek-R1', 'base_model:finetune:deepseek-ai/DeepSeek-R1', 'license:mit', 'region:us'], pipeline_tag=None, mask_token=None, card_data=None, widget_data=None, model_index=None, config=None, transformers_info=None, trending_score=1369, siblings=None, spaces=None, safetensors=None, security_repo_status=None),
 ModelInfo(id='deepseek-ai/DeepSeek-R1', author=None, sha=None, created_at=datetime.datetime(2025, 1, 20, 3, 46, 7, tzinfo=datetime.timezone.utc), last_modified=None, private=False, disabled=None, downloads=4351443, downloads_all_time=None, gated=None, gguf=None, inference=No

## Searching the Hub from code
The Hugging Face Hub provides a convenient web interface for searching for models and learning more about them. At times, however, it is handier to do the same thing without leaving your development environment. Fortunately, Hugging Face also provides a Python package, `huggingface_hub`, that lets you find models through code.



In [3]:
from huggingface_hub import HfApi

api = HfApi()

# Ask the Hub for at most five models matching the "text-classification"
# filter string, ordered by download count from highest to lowest
# (direction=-1 means descending).
models = api.list_models(filter="text-classification", sort="downloads", direction=-1, limit=5)

# Materialise the result so it can be indexed.
modelList = [*models]

# Show the first (most-downloaded) entry, or a notice when nothing matched.
top_hit = modelList[0] if modelList else "No models found for text classification."
print(top_hit)


ModelInfo(id='cross-encoder/ms-marco-MiniLM-L-6-v2', author=None, sha=None, created_at=datetime.datetime(2022, 3, 2, 23, 29, 5, tzinfo=datetime.timezone.utc), last_modified=None, private=False, disabled=None, downloads=12310426, downloads_all_time=None, gated=None, gguf=None, inference=None, inference_provider_mapping=None, likes=74, library_name='transformers', tags=['transformers', 'pytorch', 'jax', 'safetensors', 'bert', 'text-classification', 'license:apache-2.0', 'autotrain_compatible', 'endpoints_compatible', 'region:us'], pipeline_tag='text-classification', mask_token=None, card_data=None, widget_data=None, model_index=None, config=None, transformers_info=None, trending_score=None, siblings=None, spaces=None, safetensors=None, security_repo_status=None)


## Saving a Model locally

In [3]:
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

modelId = "distilbert-base-uncased-finetuned-sst-2-english"
save_directory = f"models/{modelId}"

# Load the checkpoint together with its sequence-classification head.
# Plain AutoModel would keep only the base encoder, so the saved copy could not
# be reloaded later as a working sentiment classifier.
model = AutoModelForSequenceClassification.from_pretrained(modelId)

# The tokenizer has to be stored alongside the weights; the cells below load
# both the model and the tokenizer from this same directory.
tokenizer = AutoTokenizer.from_pretrained(modelId)

# Save the model and tokenizer to a local directory
model.save_pretrained(save_directory=save_directory)
tokenizer.save_pretrained(save_directory)


## Using the model on a single text

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import os

# Define paths
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
save_directory = f"models/{model_id}"

# A usable local copy needs the model config AND the tokenizer files.
# Testing only that the directory exists is not enough: a directory written by
# model.save_pretrained() alone contains no tokenizer, and loading the
# tokenizer from it would fail.
required_files = ("config.json", "tokenizer_config.json")
has_local_copy = all(
    os.path.isfile(os.path.join(save_directory, file_name))
    for file_name in required_files
)

if not has_local_copy:
    print(f"Model directory not found. Downloading model {model_id}...")

    # Create the target directory if it doesn't exist
    os.makedirs(save_directory, exist_ok=True)

    # Download model and tokenizer, then persist both for later runs.
    # The freshly downloaded objects are used directly below, so there is no
    # need to read them back from disk a second time.
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    model.save_pretrained(save_directory)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.save_pretrained(save_directory)

    print("Model and tokenizer downloaded successfully!")
else:
    print(f"Using locally saved model from {save_directory}")

    # Load model and tokenizer from local directory
    model = AutoModelForSequenceClassification.from_pretrained(save_directory)
    tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Create a sentiment analysis pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Perform sentiment analysis
text = "Initially, I thought of Hugging Face is just a model hosting platform, but as I explored further, I realized it was so much more."
result = classifier(text)
print(result)

Using locally saved model from models/distilbert-base-uncased-finetuned-sst-2-english


Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.8700191378593445}]


## Inspecting datasets
The datasets on Hugging Face vary widely in size, content, and features. It is therefore worth inspecting a dataset before committing to loading it into your environment.

Let's inspect the `raw_meta_All_Beauty` configuration of the "McAuley-Lab/Amazon-Reviews-2023" dataset.

In [14]:
# Bring in the builder factory
from datasets import load_dataset_builder

# Repository and configuration to inspect
dataset_repo = "McAuley-Lab/Amazon-Reviews-2023"
dataset_config = "raw_meta_All_Beauty"

# Construct the builder for that repository/configuration pair
dataset_builder = load_dataset_builder(dataset_repo, dataset_config)

# Pull the feature schema out of the builder's info object
builder_info = dataset_builder.info
dataset_info = builder_info.features

In [21]:
# Print the free-text description stored in the builder's info object.
print(dataset_builder.info.description)

Amazon Review 2023 is an updated version of the Amazon Review 2018 dataset.
This dataset mainly includes reviews (ratings, text) and item metadata (desc-
riptions, category information, price, brand, and images). Compared to the pre-
vious versions, the 2023 version features larger size, newer reviews (up to Sep
2023), richer and cleaner meta data, and finer-grained timestamps (from day to 
milli-second).

This is a subset for items in domain: All_Beauty.


In [20]:
# Print the feature schema (column name -> declared type) from the builder's info.
print(dataset_builder.info.features)

{'main_category': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'average_rating': Value(dtype='float64', id=None), 'rating_number': Value(dtype='int64', id=None), 'features': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'description': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'price': Value(dtype='string', id=None), 'images': Sequence(feature={'hi_res': Value(dtype='string', id=None), 'large': Value(dtype='string', id=None), 'thumb': Value(dtype='string', id=None), 'variant': Value(dtype='string', id=None)}, length=-1, id=None), 'videos': Sequence(feature={'title': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None), 'user_id': Value(dtype='string', id=None)}, length=-1, id=None), 'store': Value(dtype='string', id=None), 'categories': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'details': Value(dtype='string', id=None), 'parent_asin': Value(dtype='string', id=None

In [16]:
from huggingface_hub import HfApi

api = HfApi()

# Request ten datasets ordered by download count, highest first
# (direction=-1 means descending).
top_datasets = api.list_datasets(sort="downloads", direction=-1, limit=10)

# Collect just the repository ids and print them as a single list.
dataset_ids = [entry.id for entry in top_datasets]
print(dataset_ids)

['huggingface/documentation-images', 'Symato/cc', 'AquaV/genshin-voices-separated', 'hf-doc-build/doc-build-dev', 'hf-doc-build/doc-build', 'huggingchat/models-logo', 'm-a-p/FineFineWeb', 'agents-course/course-images', 'hf-internal-testing/transformers_circleci_workflow_runs', 'open-cn-llm-leaderboard/requests']


In [9]:
from huggingface_hub import list_datasets

# Fetch only the first 10 entries. Materialising list(list_datasets()) without
# a limit would page through every dataset on the Hub just to slice off ten.
datasets = list(list_datasets(limit=10))  # Convert generator to a list
print([d.id for d in datasets])  # Print the 10 dataset names


['facebook/natural_reasoning', 'Congliu/Chinese-DeepSeek-R1-Distill-data-110k', 'SynthLabsAI/Big-Math-RL-Verified', 'FreedomIntelligence/medical-o1-reasoning-SFT', 'allenai/olmOCR-mix-0225', 'open-r1/OpenR1-Math-220k', 'SakanaAI/AI-CUDA-Engineer-Archive', 'open-thoughts/OpenThoughts-114k', 'fka/awesome-chatgpt-prompts', 'arcinstitute/opengenome2']


## Using the model on multiple texts at a time with a pipeline

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset_builder, load_dataset
import pandas as pd
import os

# Define the paths here as well so this cell does not depend on variables
# left behind by an earlier cell (it must work after Restart & Run All).
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
save_directory = f"models/{model_id}"

# A usable local copy needs the model config AND the tokenizer files; an
# existing directory alone does not guarantee that both were saved.
required_files = ("config.json", "tokenizer_config.json")
has_local_copy = all(
    os.path.isfile(os.path.join(save_directory, file_name))
    for file_name in required_files
)

if not has_local_copy:
    print(f"Model directory not found. Downloading model {model_id}...")

    # Create the target directory if it doesn't exist
    os.makedirs(save_directory, exist_ok=True)

    # Download model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    model.save_pretrained(save_directory)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.save_pretrained(save_directory)

    print("Model and tokenizer downloaded successfully!")
else:
    print(f"Using locally saved model from {save_directory}")

    # Actually load what the message above announces, instead of silently
    # reusing whatever `model`/`tokenizer` an earlier cell happened to define.
    model = AutoModelForSequenceClassification.from_pretrained(save_directory)
    tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Create a sentiment analysis pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Load the Amazon Reviews dataset
# NOTE(review): 'raw_meta_All_Beauty' is described as a subset "for items" and
# exposes item-metadata columns (title, price, store, ...), so the fallback
# heuristic below ends up scoring product titles rather than review text.
# To analyse real reviews, point this at the dataset's review configuration
# (confirm the exact config name on the dataset card).
print("Loading dataset...")
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", 'raw_meta_All_Beauty')

# Typically reviews are in the 'train' split, but let's check what's available
print("Available splits:", dataset.keys())

# Prefer a 'train' split, otherwise fall back to the first available one
split_name = 'train' if 'train' in dataset else list(dataset.keys())[0]

# Take up to 10 rows; min() keeps select() valid on splits shorter than 10
sample_size = min(10, len(dataset[split_name]))
reviews_subset = dataset[split_name].select(range(sample_size))

# Check available columns to find the review text column
print("Available columns:", reviews_subset.column_names)

# First row as a dict (empty when the split has no rows); used for sampling
# column values without re-reading each column from the dataset.
first_row = reviews_subset[0] if sample_size else {}

# Usually review text is in columns like 'reviewText', 'review_text', or similar
text_column_candidates = ['reviewText', 'review_text', 'review', 'text']
text_column = next(
    (column for column in text_column_candidates if column in reviews_subset.column_names),
    None,
)

if text_column is None:
    # If none of our guesses match, take the first column whose first value is
    # a string longer than 20 characters
    for column in reviews_subset.column_names:
        sample_value = first_row.get(column)
        if isinstance(sample_value, str) and len(sample_value) > 20:
            text_column = column
            break

if text_column is None:
    print("Could not find a suitable text column. Available columns are:", reviews_subset.column_names)
    # Show a sample of each column
    for column in reviews_subset.column_names:
        print(f"Sample of '{column}': {first_row.get(column)}")
else:
    print(f"Using '{text_column}' as the review text column")

    reviews = list(reviews_subset[text_column])

    # Some datasets might have None values or non-string types; only valid,
    # non-empty strings are sent to the model.
    valid_positions = [
        position for position, review in enumerate(reviews)
        if review and isinstance(review, str)
    ]

    # Classify all valid texts in one pipeline call. truncation=True keeps
    # over-long texts from exceeding the model's maximum input length.
    predictions = (
        classifier([reviews[position] for position in valid_positions], truncation=True)
        if valid_positions
        else []
    )
    prediction_by_position = dict(zip(valid_positions, predictions))

    # Report every row in its original order, including the skipped ones
    results = []
    for i, review in enumerate(reviews):
        prediction = prediction_by_position.get(i)
        if prediction is None:
            print(f"Review {i+1}: Invalid review text (None or non-string)")
            print("-" * 50)
            continue

        # Truncate long reviews for display
        preview = review[:100] + "..." if len(review) > 100 else review
        results.append({
            'review_idx': i,
            'text': preview,
            'sentiment': prediction['label'],
            'score': prediction['score']
        })
        print(f"Review {i+1}: {preview}")
        print(f"Sentiment: {prediction['label']}, Score: {prediction['score']:.4f}")
        print("-" * 50)

    # Create a DataFrame for easier analysis
    results_df = pd.DataFrame(results, columns=['review_idx', 'text', 'sentiment', 'score'])
    if results_df.empty:
        # Without this guard the summary would report NaN for an empty frame
        print("\nNo valid review texts were found, so there is nothing to summarise.")
    else:
        print("\nSummary of sentiment analysis:")
        print(results_df['sentiment'].value_counts())
        print("\nAverage confidence score:", results_df['score'].mean())

Device set to use cpu


Using locally saved model from models/distilbert-base-uncased-finetuned-sst-2-english
Loading dataset...
Available splits: dict_keys(['full'])
Available columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author']
Using 'title' as the review text column
Review 1: Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)
Sentiment: NEGATIVE, Score: 0.9533
--------------------------------------------------
Review 2: Yes to Tomatoes Detoxifying Charcoal Cleanser (Pack of 2) with Charcoal Powder, Tomato Fruit Extract...
Sentiment: NEGATIVE, Score: 0.9818
--------------------------------------------------
Review 3: Eye Patch Black Adult with Tie Band (6 Per Pack)
Sentiment: NEGATIVE, Score: 0.9753
--------------------------------------------------
Review 4: Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4D Imitation Eyebrow Tattoos, 4D Hair-like