<a href="https://colab.research.google.com/github/InduwaraGayashan001/Generative-AI/blob/main/HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!pip install transformers

# Sentiment Analysis

In [None]:
from transformers import pipeline

In [None]:
# No model is passed, so the pipeline falls back to its default sentiment checkpoint.
pipeline("sentiment-analysis")("I am happy")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9998801946640015}]

In [None]:
# Score a two-sentence input with the default sentiment checkpoint.
# Adjacent string literals are concatenated, and the explicit trailing space keeps
# the sentences apart. A backslash-newline inside a single literal would instead
# fuse them into "...Evaluation.Lots...".
pipeline(task="sentiment-analysis")(
    "Everyday lots of LLMs papers are published about LLm Evaluation. "
    "Lots of them looks very promising"
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.997160792350769}]

In [None]:
# facebook/bart-large-mnli is a natural-language-inference checkpoint. Loaded under
# the "sentiment-analysis" task it would answer with NLI classes rather than a
# sentiment. The zero-shot-classification task is the usage its model card shows,
# and it lets us supply the sentiment labels ourselves.
# Adjacent literals with an explicit space keep the two sentences from fusing.
pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")(
    "Everyday lots of LLMs papers are published about LLm Evaluation. "
    "Lots of them looks very promising",
    candidate_labels=["positive", "negative"],
)

# Batch Sentiment Analysis

In [None]:
# Create the pipeline in this cell so it runs on a fresh kernel: the name
# `sentiment_pipeline` was not defined anywhere earlier in the notebook.
# The checkpoint is the one the earlier cells report as the task default.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Passing a list scores every sentence and returns one result dict per input.
text = ["I love this movie!", "This movie is okay.", "I hate this movie."]
sentiment_pipeline(text)

[{'label': 'POSITIVE', 'score': 0.9998775720596313},
 {'label': 'POSITIVE', 'score': 0.9998407363891602},
 {'label': 'NEGATIVE', 'score': 0.9996869564056396}]

In [None]:
# Same task, but with an explicitly chosen emotion-classification checkpoint.
classifier = pipeline(
    task="sentiment-analysis",
    model="SamLowe/roberta-base-go_emotions",
)

In [None]:
# Score the same three reviews with the emotion classifier (one label per input).
text = [
    "I love this movie!",
    "This movie is okay.",
    "I hate this movie.",
]
classifier(text)

[{'label': 'love', 'score': 0.946032702922821},
 {'label': 'approval', 'score': 0.8260558843612671},
 {'label': 'anger', 'score': 0.7839570641517639}]

# Text Generation

In [None]:
# Ask the task's default text-generation model for two continuations of one prompt.
text_generator = pipeline(task="text-generation")
generated_text = text_generator(
    "Today is a rainy day in london",
    num_return_sequences=2,
    truncation=True,
)
print(generated_text)

# Question Answering

In [None]:
# Question answering with the task's default model: the answer is looked up
# in the supplied context passage.
qa_model = pipeline(task="question-answering")

context = "Paris is the capital of France."
question = "What is the capital of France?"

qa_model(context=context, question=question)

# Tokenization

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer,DistilBertForSequenceClassification

In [None]:
# Load the tokenizer and the model explicitly, then hand both objects to the pipeline
# instead of passing a checkpoint name.
model_name2 = "nlptown/bert-base-multilingual-uncased-sentiment"
my_tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
my_model2 = AutoModelForSequenceClassification.from_pretrained(model_name2)

# Note: this rebinds `classifier`, replacing the go_emotions pipeline created earlier.
classifier = pipeline(
    task="sentiment-analysis",
    tokenizer=my_tokenizer2,
    model=my_model2,
)
res = classifier("I love this movie")
print(res)

In [None]:
# Load the bert-base-uncased tokenizer and split a sample sentence into tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# `text` is rebound here: it previously held the list of movie reviews.
text = "I was so not happy with the Barbie movie"
# tokenize() only splits the string; ids are produced in the following cells.
tokens = tokenizer.tokenize(text)
print(tokens)


In [None]:
# Map each token string to its vocabulary id. No special tokens are added here;
# compare with the direct tokenizer call in the next cell.
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

[1045, 2001, 2061, 2025, 3407, 2007, 1996, 22635, 3185]


In [None]:
# Calling the tokenizer on the raw text does the whole encoding in one step.
# The printed result holds input_ids, token_type_ids and attention_mask, and the
# ids are the ones from the manual conversion wrapped in 101 ... 102
# (BERT's [CLS] and [SEP] markers).
encoded_input = tokenizer(text)
print(encoded_input)

{'input_ids': [101, 1045, 2001, 2061, 2025, 3407, 2007, 1996, 22635, 3185, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Turn the id sequence back into text. The printed sentence is lower-cased
# ("barbie"), i.e. the original casing is not recovered.
decoded_output = tokenizer.decode(input_ids)
print(decoded_output)

i was so not happy with the barbie movie


# Fine Tuning IMDB

## Load and prepare the dataset

In [None]:
!pip install -U datasets fsspec


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver 

In [None]:
# Load the IMDB review dataset by name. The result is a DatasetDict with
# train, test and unsupervised splits (displayed in the next cell).
from datasets import load_dataset
dataset = load_dataset('imdb')

In [None]:
# Display the splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Preprocess the data

In [None]:
from transformers import AutoTokenizer

# This rebinds `tokenizer`; it is the same checkpoint used in the tokenization demo.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: a batch as passed by ``Dataset.map(..., batched=True)``;
            only its "text" entry is read.

    Returns:
        The tokenizer output for the batch, truncated and padded to the
        tokenizer's maximum length.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length")


# Applied to every split of the DatasetDict, although only 1000-row slices of
# train and test are used for training and evaluation below.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

In [None]:
# Display the splits after tokenization: input_ids, token_type_ids and
# attention_mask now sit alongside the original text and label columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [None]:
# Inspect the first training example (raw text plus the tokenizer's fields).
tokenized_datasets['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for a short demonstration run (a single epoch).
training_args = TrainingArguments(
    # where checkpoints and other run artifacts are written
    output_dir='./results',
    # optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # run evaluation at the end of each epoch
    eval_strategy='epoch',
    # empty list: no logging integrations are requested
    report_to=[],
)

In [None]:
from transformers import AutoModelForSequenceClassification , Trainer

# BERT encoder with a freshly initialised 2-class classification head
# (the load message lists the classifier weights as newly initialised).
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# The IMDB splits are stored ordered by label, so taking the first 1000 rows
# as-is would give subsets containing a single class. Shuffle with a fixed seed
# first so the small subsets contain both classes and the run is reproducible.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'].shuffle(seed=42).select(range(1000)),
    eval_dataset=tokenized_datasets['test'].shuffle(seed=42).select(range(1000)),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

In [None]:
# Fine-tune the model with the Trainer configured above (one epoch on the training subset).
trainer.train()

## Evaluation

In [None]:
# Evaluate on the Trainer's eval_dataset and print the returned metrics.
results = trainer.evaluate()
print(results)

## Save the Model

In [None]:
# Write the model and the tokenizer to the same directory so both can be reloaded
# from it later. `model` is the object that was handed to the Trainer above.
model.save_pretrained('./fine_tuned_model')
# Last expression: the cell displays the tuple of tokenizer files that were written.
tokenizer.save_pretrained('./fine_tuned_model')

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

# Arxiv Project

In [None]:
!pip install arxiv

Collecting arxiv
  Downloading arxiv-2.2.0-py3-none-any.whl.metadata (6.3 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.2.0-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=40d4aabb21a68e36eba216d97659fcdf20c5bb458c3105cac32f169fe526c1e0
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packag

In [None]:
import arxiv
import pandas as pd

# Query to fetch AI papers: the 10 most recently submitted matches.
# NOTE(review): the multi-word terms are unquoted; confirm the arXiv API parses
# them as the intended phrases.
query = 'ai OR artificial Intelligence OR machine learning'
search = arxiv.Search(query = query,
                      max_results = 10,
                      sort_by = arxiv.SortCriterion.SubmittedDate)

# Search.results() is deprecated and emits a warning, so results are fetched
# through a Client instead.
client = arxiv.Client()

# Keep only the fields used later: one dict (one dataframe row) per paper.
papers = [
    {
        'published': result.published,
        'title': result.title,
        'abstract': result.summary,
        'categories': result.categories,
    }
    for result in client.results(search)
]

# Convert to dataframe
df = pd.DataFrame(papers)
df.head()

  for result in search.results():


Unnamed: 0,published,title,abstract,categories
0,2025-06-05 17:59:58+00:00,VideoMathQA: Benchmarking Mathematical Reasoni...,Mathematical reasoning in real-world video set...,[cs.CV]
1,2025-06-05 17:59:55+00:00,Inference-Time Hyper-Scaling with KV Cache Com...,Inference-time scaling trades efficiency for i...,"[cs.LG, cs.CL]"
2,2025-06-05 17:59:55+00:00,Why LLM Safety Guardrails Collapse After Fine-...,Recent advancements in large language models (...,"[cs.CR, cs.CL, cs.LG]"
3,2025-06-05 17:59:54+00:00,ContentV: Efficient Training of Video Generati...,Recent advances in video generation demand inc...,[cs.CV]
4,2025-06-05 17:59:51+00:00,Refer to Anything with Vision-Language Prompts,Recent image segmentation models have advanced...,"[cs.CV, cs.AI]"


In [None]:
# Summarise the first fetched abstract with the task's default summarization model.
summarizer = pipeline(task="summarization")
summarization_result = summarizer(df.loc[0, 'abstract'])
print(summarization_result)