In [None]:
# NLP with Hugging Face Transformers - Comprehensive Tutorial
# This notebook demonstrates various NLP tasks using the Transformers library

import warnings
warnings.filterwarnings('ignore')

# Install required packages if not already installed
try:
    import transformers
    import torch
    import tensorflow as tf
    print("All packages are already installed!")
except ImportError as e:
    print(f"Missing package: {e}")
    print("Please run: pip install transformers torch tensorflow")
    
print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"TensorFlow version: {tf.__version__}")

In [None]:
# Additional imports for visualization and utilities
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime

print("Additional libraries imported successfully!")
print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## Sentiment Analysis

In [None]:
from transformers import pipeline

# Initialize sentiment analysis pipeline with explicit model
print("Initializing Sentiment Analysis Pipeline...")
try:
    classifier = pipeline(
        "sentiment-analysis", 
        model="distilbert-base-uncased-finetuned-sst-2-english",
        return_all_scores=False
    )
    print("Sentiment Analysis pipeline initialized successfully!")
except Exception as e:
    print(f"Error initializing pipeline: {e}")
    raise

In [None]:
# Test single sentence sentiment analysis
test_sentence = 'What a wonderful day'
result = classifier(test_sentence)
print(f"Input: '{test_sentence}'")
print(f"Result: {result}")
print(f"Confidence: {result[0]['score']:.4f}")

# Let's also test some edge cases
edge_cases = [
    "This is okay, I guess",  # neutral
    "I love this so much!",   # very positive
    "This is terrible and awful",  # very negative
    ""  # empty string
]

print("\nTesting edge cases:")
for text in edge_cases:
    if text:  # Skip empty strings
        try:
            result = classifier(text)
            print(f"'{text}' -> {result[0]['label']} ({result[0]['score']:.4f})")
        except Exception as e:
            print(f"'{text}' -> Error: {e}")
    else:
        print("'(empty string)' -> Skipped")

### Passing Multiple prompts to classifier as list gives output in dictionary

In [None]:
# Batch processing multiple texts
prompts = [
    'I hate this book',
    'I am almost done with my Masters, I have learnt a lot',
    'I love to spend time with my cat'
]

print("Processing multiple texts:")
results = classifier(prompts)

# Create a structured analysis
analysis_data = []
for i, (prompt, result) in enumerate(zip(prompts, results)):
    analysis_data.append({
        'Text': prompt,
        'Label': result['label'],
        'Confidence': result['score'],
        'Length': len(prompt.split())
    })

# Display results in a DataFrame for better visualization
df_results = pd.DataFrame(analysis_data)
print(df_results)

# Visualize the results
plt.figure(figsize=(12, 6))

# Plot 1: Confidence scores
plt.subplot(1, 2, 1)
colors = ['red' if label == 'NEGATIVE' else 'green' for label in df_results['Label']]
plt.bar(range(len(df_results)), df_results['Confidence'], color=colors, alpha=0.7)
plt.title('Sentiment Analysis Confidence Scores')
plt.xlabel('Text Index')
plt.ylabel('Confidence Score')
plt.xticks(range(len(df_results)), [f'Text {i+1}' for i in range(len(df_results))])

# Plot 2: Text length vs confidence
plt.subplot(1, 2, 2)
plt.scatter(df_results['Length'], df_results['Confidence'], 
           c=colors, alpha=0.7, s=100)
plt.title('Text Length vs Confidence')
plt.xlabel('Number of Words')
plt.ylabel('Confidence Score')

plt.tight_layout()
plt.show()

### Using pre-trained models from HuggingFace
I am using it as high level helper and not importing the model directly

In [None]:
# Advanced Emotion Detection with RoBERTa
print("Initializing Emotion Detection Pipeline...")

try:
    emotion_pipe = pipeline(
        "text-classification", 
        model="SamLowe/roberta-base-go_emotions",
        return_all_scores=True  # Get all emotion probabilities
    )
    print("Emotion detection pipeline initialized successfully!")
except Exception as e:
    print(f"Error initializing emotion pipeline: {e}")
    emotion_pipe = None

In [None]:
if emotion_pipe:
    # Test single emotion detection
    test_text = 'How ungrateful can you be?'
    emotion_results = emotion_pipe(test_text)
    
    print(f"Input: '{test_text}'")
    print(f"Top emotion: {emotion_results[0]['label']} (confidence: {emotion_results[0]['score']:.4f})")
    
    # Show top 5 emotions
    print("\nTop 5 emotions detected:")
    for i, result in enumerate(emotion_results[:5]):
        print(f"{i+1}. {result['label']}: {result['score']:.4f}")
    
    # Visualize emotion distribution
    if len(emotion_results) >= 5:
        top_emotions = emotion_results[:10]  # Top 10 emotions
        labels = [r['label'] for r in top_emotions]
        scores = [r['score'] for r in top_emotions]
        
        plt.figure(figsize=(12, 6))
        plt.barh(labels, scores, alpha=0.7)
        plt.title(f'Emotion Distribution for: "{test_text}"')
        plt.xlabel('Confidence Score')
        plt.tight_layout()
        plt.show()
else:
    print("Emotion pipeline not available")

In [11]:
prompt=['I cannot let it happen','I am feeling loved','I will always be with you','I am going to learn it']
pipe(prompt)

[{'label': 'disapproval', 'score': 0.6340829133987427},
 {'label': 'love', 'score': 0.9565239548683167},
 {'label': 'caring', 'score': 0.8449496626853943},
 {'label': 'optimism', 'score': 0.39558449387550354}]

In [19]:
classifier_summerize=pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [16]:
classifier_summerize('Amidst towering structures and bustling activity, a dynamic tapestry of diverse cultures weaves together, creating an atmosphere that pulsates with life. Tall skyscrapers reach for the sky, reflecting ambition and innovation that characterize this urban landscape. The aroma of diverse cuisines wafts through the air, inviting passersby to explore a world of flavors. Hidden parks provide serene retreats, where nature and tranquility offer respite from the urban buzz. Artistic murals adorn walls, telling stories of rich history and dynamic present. Street performers captivate audiences, turning sidewalks into stages for impromptu performances. Every corner seems to harbor a secret, a unique story waiting to be discovered by those who venture off the beaten path. As day turns to night, the skyline transforms into a glittering spectacle, lights dancing in reflection on glass facades. In this landscape of contrasts and constant motion, energy is infectious, leaving an indelible imprint on all who become part of its ever-evolving narrative.')

[{'summary_text': ' Amidst towering structures and bustling activity, a dynamic tapestry of diverse cultures weaves together . Tall skyscrapers reach for the sky, reflecting ambition and innovation that characterize this urban landscape . Hidden parks provide serene retreats, where nature and tranquility offer respite from the urban buzz .'}]

In [23]:
classifier_translate=pipeline('translation_en_to_fr')

No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [25]:
classifier_translate("It is a good day")

[{'translation_text': "C'est une bonne journée"}]

In [26]:
classifier_generate=pipeline("text-generation")

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [27]:
classifier_generate("Today I am feeling sleepy because")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Today I am feeling sleepy because I have to do my job, but you know my job is the rest of my life.\n\nMy problem is I can't take the risk and the risk isn't there because you know I am a professional."}]

In [28]:
classifier_unmask=pipeline("fill-mask")

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [37]:
classifier_unmask("The doctor is unable to make it today because, <mask> car broke down")

[{'score': 0.5158199071884155,
  'token': 39,
  'token_str': ' his',
  'sequence': 'The doctor is unable to make it today because, his car broke down'},
 {'score': 0.22296088933944702,
  'token': 69,
  'token_str': ' her',
  'sequence': 'The doctor is unable to make it today because, her car broke down'},
 {'score': 0.056190524250268936,
  'token': 5,
  'token_str': ' the',
  'sequence': 'The doctor is unable to make it today because, the car broke down'},
 {'score': 0.04049336165189743,
  'token': 127,
  'token_str': ' my',
  'sequence': 'The doctor is unable to make it today because, my car broke down'},
 {'score': 0.03273928165435791,
  'token': 49,
  'token_str': ' their',
  'sequence': 'The doctor is unable to make it today because, their car broke down'}]

In [38]:
classifier_ner=pipeline('ner', grouped_entities=True)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
classifier_ner('My name is Sylvain and I work at Hugging Face in Brooklyn.')

[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [41]:
classifier_ques_ans=pipeline('question-answering')

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [47]:
classifier_ques_ans(question='When is my class?', context='Today the class is scheduled in EDU 115 at six thirty evening. Make sure to submit assignment by 10pm.')

{'score': 0.5736246109008789,
 'start': 43,
 'end': 61,
 'answer': 'six thirty evening'}