# Inference to generate to **heading, Summary** and **Tags**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets pandas rouge_score

import pandas as pd
from datasets import Dataset, load_metric
from transformers import LEDTokenizer, LEDForConditionalGeneration
import torch

# load tokenizer and model (fine-tuned)





Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m 

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Input ids are automatically padded from 512 to 1024 to be a multiple of `config.attention_window`: 1024


Generated Heading: ['Virat Kohli']
Generated Summary: ['Virat Kohli']
Generated Tags: ['Virat Kohli']


In [86]:
tokenizer = LEDTokenizer.from_pretrained("/content/drive/MyDrive/checkpoint-60")
model1 = LEDForConditionalGeneration.from_pretrained("/content/drive/MyDrive/checkpoint-60").to("cuda").half()


In [87]:
# sample data
sample_paragraph = """In recent years, artificial intelligence (AI) has made significant strides in various
industries, revolutionizing the way we live and work. From autonomous vehicles to
personalized recommendations, AI-powered solutions have enhanced efficiency
and productivity. However, with these advancements come concerns about ethics,
privacy, and the future of employment. As AI continues to evolve, it is essential to
strike a balance between innovation and responsibility."""
df_test = pd.DataFrame([sample_paragraph], columns=['Paragraph'])
df_test = Dataset.from_pandas(df_test)


In [88]:
# Function to generate headings, summaries, and tags
def generate_text(batch):
    inputs_dict = tokenizer(batch["Paragraph"], padding="max_length", max_length=512, return_tensors="pt", truncation=True)
    input_ids = inputs_dict.input_ids.to("cuda")

    attention_mask = inputs_dict.attention_mask.to("cuda")
    global_attention_mask = torch.zeros_like(attention_mask)
    predicted_heading_ids = model.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, max_length=64)
    batch["generated_heading"] = tokenizer.batch_decode(predicted_heading_ids, skip_special_tokens=True)

    # Generate summary
    predicted_summary_ids = model.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, max_length=150)
    batch["generated_summary"] = tokenizer.batch_decode(predicted_summary_ids, skip_special_tokens=True)

    # Generate tags (comma-separated)
    # Increased max_length to allow the model to generate additional tokens for tags
    predicted_tags_ids = model.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, max_length=20, num_beams=5, no_repeat_ngram_size=2, length_penalty=2.0, early_stopping=True)
    tags = tokenizer.batch_decode(predicted_tags_ids, skip_special_tokens=True)

    # Return a list of tags instead of a comma-separated string
    batch["generated_tags"] = tags

    return batch

In [59]:
result = df_test.map(generate_text, batched=True, batch_size=2)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [74]:


print("Generated Heading:", result["generated_heading"])
print("Generated Summary:", result["generated_summary"])
print("Generated Tags:", result["generated_tags"])

Generated Heading: ['Advancements and ethics in artificial intelligence']
Generated Summary: ['The rapid advancements in artificial intelligence (AI) and its impact on various industries. While AI has brought numerous benefits, such as increased efficiency and personalization, it also raises ethical concerns, privacy issues, and potential job displacement.']
Generated Tags: ['Focus on innovation and responsibility.']


we arent getting expected tags so lets do with bert text generation

In [61]:
from transformers import BertTokenizer, BertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import Trainer, TrainingArguments

# Load Polish Bert Tokenizer

In [90]:
model = BertForMaskedLM.from_pretrained('dkleczek/bert-base-polish-uncased-v1')

Some weights of the model checkpoint at dkleczek/bert-base-polish-uncased-v1 were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Data loading and pre-processing

In [63]:
max_length = 0
with open('/content/dataset.txt', 'r', encoding='utf-8') as f: #dataset includes common sentences
    for line in f:
        length = len(line.strip()) #normal preprocessing step
        if length > max_length:
            max_length = length


In [64]:
##seq length
with open('/content/dataset.txt', 'r', encoding='utf-8') as f:
    for line in f:
        tokens = tokenizer(line.strip(), padding='max_length', truncation=True, max_length=128)
        print(len(tokens['input_ids']))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128

# creating dataset

In [65]:
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="/content/dataset.txt",
    block_size=max_length, # Set the block_size to 128
)




# train args

In [66]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_total_limit=2,
    learning_rate=2e-5,
    warmup_steps=500,
    logging_dir='./logs',
    logging_steps=500,
    save_strategy='epoch',
)

In [67]:
#train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

In [68]:

model.save_pretrained('./fine_tuned_bert_model')

In [73]:
model.eval()

# Function to generate tags by masking words in the input paragraph
def generate_tags(paragraph):
    # Tokenize the input paragraph
    inputs = tokenizer(paragraph, return_tensors="pt", max_length=128, truncation=True, padding="max_length")
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Mask a token for prediction (e.g., the first token after [CLS])
    input_ids[0, 1] = tokenizer.mask_token_id

    # Convert some tokens to [MASK] for masked token prediction
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

    # Predict masked tokens
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # Get the top 5 predictions for the masked tokens
    mask_token_logits = logits[0, mask_token_index, :]
    top_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

    # Convert token IDs to words and return them as potential tags
    predicted_tokens = [tokenizer.decode([token]) for token in top_tokens]
    return predicted_tokens

paragraph = """In recent years, artificial intelligence (AI) has made significant strides in various industries, revolutionizing the way we live and work."""

tags = generate_tags(paragraph)
print("Generated Tags:", tags)

Generated Tags: ['AI', 'automation', 'privacy']


# LETS GO FOR FINAL INFERENCE

In [91]:
# Function to generate headings, summaries, and tags

def generate_text(batch):
    inputs_dict = tokenizer(batch["Paragraph"], padding="max_length", max_length=512, return_tensors="pt", truncation=True)
    input_ids = inputs_dict.input_ids.to("cuda")

    attention_mask = inputs_dict.attention_mask.to("cuda")
    global_attention_mask = torch.zeros_like(attention_mask)
    predicted_heading_ids = model1.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, max_length=64)
    batch["generated_heading"] = tokenizer.batch_decode(predicted_heading_ids, skip_special_tokens=True)

    # Generate summary
    predicted_summary_ids = model1.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, max_length=150)
    batch["generated_summary"] = tokenizer.batch_decode(predicted_summary_ids, skip_special_tokens=True)



    return batch

def generate_tags(paragraph):
    #tokenize
    inputs = tokenizer(paragraph, return_tensors="pt", max_length=128, truncation=True, padding="max_length")
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # mask a token for prediction (e.g., the first token after [CLS])
    input_ids[0, 1] = tokenizer.mask_token_id

    # convert some tokens to [MASK] for masked token prediction
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

    # predict masked tokens
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    #top 5 predictions for the masked tokens
    mask_token_logits = logits[0, mask_token_index, :]
    top_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

    # convert token IDs to words and return them as potential tags
    predicted_tokens = [tokenizer.decode([token]) for token in top_tokens]
    return predicted_tokens

sample_paragraph = """In recent years, artificial intelligence (AI) has made significant strides in various
industries, revolutionizing the way we live and work. From autonomous vehicles to
personalized recommendations, AI-powered solutions have enhanced efficiency
and productivity. However, with these advancements come concerns about ethics,
privacy, and the future of employment. As AI continues to evolve, it is essential to
strike a balance between innovation and responsibility."""
df_test = pd.DataFrame([sample_paragraph], columns=['Paragraph'])
df_test = Dataset.from_pandas(df_test)
tags = generate_tags(sample_paragraph)
result = df_test.map(generate_text, batched=True, batch_size=2)

print("Generated Heading:", result["generated_heading"])
print("Generated Summary:", result["generated_summary"])
print("Generated Tags:", tags)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Generated Heading: ['Advancements and ethics in artificial intelligence']
Generated Summary: ['The rapid advancements in artificial intelligence (AI) and its impact on various industries. While AI has brought numerous benefits, such as increased efficiency and personalization, it also raises ethical concerns, privacy issues, and potential job displacement.']
Generated Tags: ['AI', 'automation', 'privacy']
