In [None]:
!pip install transformers==4.31.0  datasets peft trl evaluate rouge_score
# bitsandbytes==0.40.0

Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.11.1-py3-none-any.whl.metadata (12 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9

In [None]:
!pip install --upgrade transformers datasets torch


Collecting transformers
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.31.0
    Uninstalling transformers-4.31.0:
      Successfully uninstalled transformers-4.31.0
Successfully installed tokenizers-0.19.1 transformers-4.44.2


In [None]:
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
)
import evaluate
import numpy as np


In [None]:
# Fetch CNN/DailyMail (config "3.0.0") and rename the text columns in a single pass:
# "article" -> "document", "highlights" -> "summary".
cnn_dailymail_ds = load_dataset("cnn_dailymail", "3.0.0").rename_columns(
    {"article": "document", "highlights": "summary"}
)



In [None]:

# Keep only the first 15% of the training split to shorten fine-tuning.
# NOTE: the cell overwrites cnn_dailymail_ds['train'], so running it a second time
# shrinks the split again; reload the dataset before re-running.
TRAIN_FRACTION = 0.15
train_size = int(TRAIN_FRACTION * len(cnn_dailymail_ds['train']))
train_subset = cnn_dailymail_ds['train'].select(range(train_size))
half_train_dataset = train_subset  # legacy name, kept in case a later cell still uses it

# Replace the full train split with the subset
cnn_dailymail_ds['train'] = train_subset

# Verify the new sizes
print(cnn_dailymail_ds)

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 43066
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11490
    })
})


In [None]:
# Display the splits, their columns and row counts.
cnn_dailymail_ds

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 43066
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11490
    })
})

In [None]:
from evaluate import load
# Load the ROUGE metric through the `evaluate` library.
metric = load("rouge")

In [None]:
# Load the base FLAN-T5 checkpoint. No quantization config is passed to
# from_pretrained: the 8-bit BitsAndBytes setup below is commented out.
model_name = "google/flan-t5-base"
# bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)  # Quantization to 8-bit
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [None]:
from datasets import concatenate_datasets


def _tokenize_column(column):
    """Tokenize one text column of the combined train + test splits.

    The tokenizer is called with truncation=True and no explicit max_length, so the
    tokenizer's own limit applies; the recorded run reports 512 for both columns.
    """
    combined = concatenate_datasets([cnn_dailymail_ds["train"], cnn_dailymail_ds["test"]])
    return combined.map(
        lambda batch: tokenizer(batch[column], truncation=True),
        batched=True,
        remove_columns=["document", "summary"],
    )


# Longest tokenized article: used as the truncation/padding limit for model inputs.
tokenized_inputs = _tokenize_column("document")
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Longest tokenized summary: used as the truncation/padding limit for labels.
tokenized_targets = _tokenize_column("summary")
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/54556 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/54556 [00:00<?, ? examples/s]

Max target length: 512


In [None]:
def tokenize_function(examples, padding="max_length"):
    """Tokenize a batch of articles and reference summaries for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with a "document" list (articles) and a "summary"
            list (reference summaries).
        padding: Accepted for backward compatibility only. It does not change how
            sequences are padded: both tokenizer calls always pad to the longest
            sequence in the batch.

    Returns:
        The tokenizer output for the prompted articles with an added "labels" entry
        holding the summary token ids, where every padding id is replaced by -100.
    """
    inputs = [
        "Summarize the following news article:\n\n" + doc + "\nSummary:"
        for doc in examples["document"]
    ]
    model_inputs = tokenizer(
        inputs, max_length=max_source_length, padding="longest", truncation=True
    )

    # Tokenize summaries (labels)
    labels = tokenizer(
        text_target=examples["summary"], max_length=max_target_length, padding="longest", truncation=True
    )

    # The labels are always padded above, so the pad ids must always be masked.
    # Previously the masking only ran when `padding == "max_length"`, which left pad
    # ids in the labels for any other argument value even though padding was still applied.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs


In [None]:
# Tokenize every split and drop all raw columns ("document", "summary", "id") in the
# same pass. The raw dataset itself is left untouched, so this cell can be re-run
# without failing on an already-removed column.
tokenized_dataset = cnn_dailymail_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=cnn_dailymail_ds["train"].column_names,
)

Map:   0%|          | 0/43066 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized splits; each should expose input_ids, attention_mask and labels.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 43066
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})

In [None]:
# Print the first tokenized training example as a sanity check.
print(tokenized_dataset["train"][0])

{'input_ids': [12198, 1635, 1737, 8, 826, 1506, 1108, 10, 301, 24796, 4170, 6, 2789, 41, 18844, 61, 1636, 8929, 16023, 2213, 4173, 6324, 12591, 15, 11391, 592, 12, 3, 9, 2196, 3996, 1755, 770, 8785, 591, 11039, 770, 61, 13462, 38, 3, 88, 5050, 507, 30, 2089, 6, 68, 3, 88, 10419, 7, 8, 540, 751, 31, 17, 4061, 3, 9, 10783, 30, 376, 5, 4173, 6324, 12591, 15, 38, 8929, 16023, 16, 96, 15537, 651, 16023, 11, 8, 5197, 13, 8, 12308, 121, 304, 8, 19142, 13, 29517, 6710, 343, 7, 300, 8, 296, 6, 8, 1021, 7556, 845, 3, 88, 65, 150, 1390, 12, 9030, 17, 449, 112, 1723, 550, 30, 1006, 2948, 6, 3281, 11, 17086, 2251, 5, 96, 196, 278, 31, 17, 515, 12, 36, 80, 13, 273, 151, 113, 6, 38, 1116, 38, 79, 919, 14985, 8247, 805, 1452, 3, 9, 3805, 2100, 443, 1232, 42, 424, 1126, 976, 3, 88, 1219, 46, 3746, 2772, 49, 2283, 48, 847, 5, 96, 196, 278, 31, 17, 317, 27, 31, 195, 36, 1989, 28887, 5, 96, 634, 378, 27, 114, 2611, 33, 378, 24, 583, 81, 335, 7051, 1636, 1335, 11, 3190, 7, 11, 5677, 7, 535, 486, 14985, 632

In [None]:
import nltk
# Download NLTK's "punkt" resource (reported as already up to date in the recorded run).
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Presumably the resource sent_tokenize relies on — TODO confirm for the installed NLTK version.
nltk.download("punkt")

# ROUGE metric used by compute_metrics below
metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one sentence
    per line, because rougeLSum expects a newline after each sentence.

    Returns:
        A (preds, labels) tuple of lists of reformatted strings.
    """
    def _one_sentence_per_line(texts):
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Compute ROUGE scores (scaled to 0-100) and the mean generated length.

    Args:
        eval_preds: A (predictions, labels) pair of token-id arrays; predictions
            may be wrapped in a tuple, in which case its first element is used.

    Returns:
        Dict of the ROUGE results multiplied by 100 and rounded to 4 decimals,
        plus "gen_len", the mean number of non-padding tokens per prediction.
    """
    generated, references = eval_preds
    if isinstance(generated, tuple):
        generated = generated[0]

    # -100 is not a real token id, so swap it for the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    generated = np.where(generated != -100, generated, pad_id)
    references = np.where(references != -100, references, pad_id)

    pred_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)
    ref_texts = tokenizer.batch_decode(references, skip_special_tokens=True)
    pred_texts, ref_texts = postprocess_text(pred_texts, ref_texts)

    scores = metric.compute(predictions=pred_texts, references=ref_texts, use_stemmer=True)
    result = {}
    for name, value in scores.items():
        result[name] = round(value * 100, 4)

    result["gen_len"] = np.mean([np.count_nonzero(row != pad_id) for row in generated])
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from transformers import DataCollatorForSeq2Seq

# Label positions padded by the collator get this id so the loss skips them.
label_pad_token_id = -100

# Batch collator: pads each batch, rounding padded lengths up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [None]:
output_dir = './fine_tuned_cnn_output_FINAL'

# Mixed-precision choice: the recorded run with fp16=True logged a training loss of 0.0,
# no validation loss and identical ROUGE scores at every epoch, i.e. the model did not
# learn. Use bf16 where the GPU supports it and fall back to full precision otherwise.
# NOTE(review): full precision needs more memory; lower the batch size if it does not fit.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=3e-5,
    optim="adafactor",
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=250,
    evaluation_strategy="epoch",
    save_strategy="epoch",      # must match evaluation_strategy for load_best_model_at_end
    save_total_limit=3,
    load_best_model_at_end=True,
    fp16=False,
    bf16=use_bf16,
    predict_with_generate=True,  # generate summaries during evaluation so ROUGE can be computed
    generation_max_length=200,   # cap on generated length at evaluation time (not tied to the label length)
)




In [None]:
# Re-display the tokenized splits before building the trainer.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 43066
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})

In [None]:
# Evaluate (and pick the best checkpoint) on the validation split; the test split is
# kept unseen so it can still give an unbiased final score.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Run fine-tuning; the returned TrainOutput (global step, training loss, runtime metrics) is displayed.
trainer.train()


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0,,35.7812,15.675,25.7587,33.142,57.324978
2,0.0,,35.7812,15.675,25.7587,33.142,57.324978


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0,,35.7812,15.675,25.7587,33.142,57.324978
2,0.0,,35.7812,15.675,25.7587,33.142,57.324978
3,0.0,,35.7812,15.675,25.7587,33.142,57.324978


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=5385, training_loss=0.0, metrics={'train_runtime': 7587.924, 'train_samples_per_second': 17.027, 'train_steps_per_second': 0.71, 'total_flos': 8.84692804215767e+16, 'train_loss': 0.0, 'epoch': 3.0})

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# This cell uses the in-memory `model` and `tokenizer`.
# To start from saved artifacts instead, load them as follows:
# model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# Qualitative check on the first few test articles
num_examples = 10
sampled_dataset = cnn_dailymail_ds["test"].select(range(num_examples))

separator = "\n" + "="*80 + "\n"

for number, example in enumerate(sampled_dataset, start=1):
    article = example["document"]

    # Same instruction template that was used to build the training inputs
    prompt = f"Summarize the following news article:\n\n{article}\nSummary:"

    # Encode the prompt and place it on the model's device
    input_ids = tokenizer.encode(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_source_length,
    ).to(model.device)

    # Beam-search generation
    generated = model.generate(
        input_ids,
        max_length=max_target_length,
        num_beams=5,
        early_stopping=True,
    )
    generated_summary = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Show article, model output and reference side by side
    print(f"\nExample {number}:")
    print("Input Article:")
    print(article)
    print("\nGenerated Summary:")
    print(generated_summary)
    print("\nOriginal Summary:")
    print(example["summary"])
    print(separator)



Example 1:
Input Article:
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking 

In [None]:
# After training, save the model in .safetensors format together with the tokenizer
# files, so the directory can be reloaded on its own with from_pretrained().
model.save_pretrained("FINAL_MODEL_SAFETENSORS", safe_serialization=True)
tokenizer.save_pretrained("FINAL_MODEL_SAFETENSORS")


In [None]:
from transformers import TFAutoModelForSeq2SeqLM

# Load the PyTorch weights saved under "FINAL_MODEL_SAFETENSORS" into a TensorFlow model.
temp_model = TFAutoModelForSeq2SeqLM.from_pretrained("FINAL_MODEL_SAFETENSORS", from_pt=True)  # from_pt=True to load PyTorch model into TF



All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
pip install safetensors




In [None]:
import textwrap

In [None]:
from safetensors.torch import safe_open
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# The tokenizer comes from the base checkpoint on the Hub.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The model weights come from the directory on the mounted Google Drive.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "/content/drive/MyDrive/FlanT5_fine_tune_summerization"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



In [None]:

text = "Artificial Intelligence (AI) has seen tremendous growth in recent years, impacting various industries such as healthcare, finance, and education. AI technologies, including machine learning, natural language processing, and computer vision, are being used to solve complex problems more efficiently. In healthcare, AI assists in diagnosing diseases, developing personalized treatments, and streamlining administrative tasks. Meanwhile, in finance, AI enhances fraud detection, algorithmic trading, and risk management. The education sector is also benefiting from AI-driven tools that provide personalized learning experiences and automate grading processes. Despite these advancements, AI still faces challenges, such as ethical concerns, data privacy issues, and the need for explainability in decision-making processes. As AI continues to evolve, addressing these challenges will be critical to ensuring its responsible and widespread adoption across industries."

# Wrap the text in the instruction template used when the training inputs were built;
# without it (and with a stray leading ":") the recorded run produced a question
# instead of a summary.
prompt = "Summarize the following news article:\n\n" + text + "\nSummary:"
inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)

# Generate predictions; an explicit length budget replaces the short library default.
outputs = model.generate(**inputs, max_length=200, num_beams=5, early_stopping=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


What are the challenges of AI?


In [None]:
text = """US Army boats, which carried out the temporary Gaza pier mission earlier this year, are poorly maintained and largely unprepared to meet the military’s growing mission in the Pacific, a new government oversight report said this week.

The Government Accountability Office released a report on Wednesday that concluded there are “wide-ranging” issues facing Army watercraft, which limit the Army’s ability “to meet mission requirements in the Indo-Pacific theater where the need for Army watercraft is most pronounced.”

Despite Army policy requiring the vessels to be at least at a 90% mission capable rate — meaning the vessels are ready to perform their mission — the boats currently have a less than 40% capable rate this year. Overall, the fleet of watercraft has dropped by nearly half since 2018, going from 134 vessels to 70 as of May this year, in part due to divestment of vessels in 2018 and 2019.

“Army officials stated that these low mission capable rates, along with the smaller size of the watercraft fleet after divestment, hinder operational readiness and the ability to meet mission requirement,” the report said. “Army officials also stated that with such low rates, usually fewer than half the vessels in the fleet are available at any given time.”

The Army’s watercraft came under significant scrutiny this year during the troubled temporary pier mission meant to increase humanitarian aid flow to Palestinians in Gaza. CNN reported in June that the vessels responsible for the temporary pier — called the Joint Logistics Over the Shore, or JLOTS —  were not well resourced or maintained by the Army.

“Army boats have not been ready, capable, or in a mindset they’ll have to do something dangerous or in the real world … for decades now,” a retired warrant officer and former chief engineer on Army watercraft told CNN at the time.


After the temporary pier mission ended — the pier was only operational for roughly 20 days in total with a cost of $230 million — the military requested civilian contractor support in bringing the Army’s vessels back to the US. A defense official said at the time that it’s more cost effective and safer to have them transported back that way.

One vessel is currently being transported back to the US while two others are expected to be loaded for transport this weekend, Army spokeswoman Cynthia Smith said on Friday. Another vessel is undergoing “routine maintenance” which is expected to be completed next week; that vessel is anticipated to be underway by the end of October, Smith said.

Smith said that the Army is “actively” working to address gaps in the watercraft’s capability as a whole, and prioritizing improving the current fleet while also “investing in a modernized fleet to meet the needs of the 2040 force.”

Col. Dave Butler, a spokesman for Army Chief of Staff Gen. Randy George, told CNN that the Army is also looking at possibly replacing the existing fleet of Army watercraft with autonomous vessels in the future.

“What we see is the oil industry and other shipping industries are doing this already, we see that happening all around the world,” Butler said. “There’s no reason the Army shouldn’t be thinking that way … leaders from down at ship level all the way to the Pentagon are looking at this and determining the best way to deploy our forces.”

US soldiers stand next to one of two US Army vessels that ran aground in Israel's coastal city of Ashdod on May 25, 2024. The US military said four of its vessels, supporting a temporary pier built to deliver aid to Gaza by sea, had run aground in heavy seas.
US soldiers stand next to one of two US Army vessels that ran aground in Israel's coastal city of Ashdod on May 25, 2024. The US military said four of its vessels, supporting a temporary pier built to deliver aid to Gaza by sea, had run aground in heavy seas. Oren Ziv/AFP/Getty Images
The GAO report released this week says that “significant maintenance challenges” have contributed to the vessels’ low state of readiness, exacerbated by “aging vessels, supply shortages, and obsolete parts.”

The report includes one example of a Landing Craft Utility vessel which has been under maintenance since 2018. While the vessel was originally scheduled to be repaired and operational by January 2021, it was delayed by at least three years. The Army “had to revise the contract seven times due to the expanded scope of work” after discovering more than 40% of the boat’s hull required “significant unplanned repairs.”

“The expanded scope of work added further delays and costs, exceeding the initial maintenance estimate by over $1.2 million,” the GAO report said.

In another example included in the report, the GAO says the Army identified in 2010 “safety concerns” with the ramp on an LCU, used for loading and unloading people or equipment.

“Despite the risk of catastrophic failure and loss of life, the Army did not replace bow ramp components essential for safety,” the report says, adding that one ramp fell off a LCU vessel in 2022 “in open seas” near Japan. The Army did not appear to act until a briefing from the GAO in 2023, more than 10 years after concerns were first identified. An inspection of all LCUs that followed the GAO briefing that year found roughly one-third of the vessels “failed the inspection and were pending repairs.”

The report includes a response from Army Secretary Christine Wormuth, who says the Army is “actively pursuing a holistic approach to mitigate the gaps in Army watercraft capability and capacity.” In regards to the mission needs in the Pacific specifically, Wormuth said Army Futures Command is working with US Army Pacific and Indo-Pacific Command to address concerns about the watercraft’s mission readiness.

The GAO report also said the Army is considering leasing civilian watercraft to bolster its existing fleet and moving all of its watercraft to the Pacific. Butler also said the Army was actively talking to Congress about leasing civilian vessels, and even hosted representatives recently in Hawaii on the Army watercraft to discuss the benefits of leasing.

The Army established a governing board in February this year to help provide oversight of the watercraft, though the GAO found that as of May, the board hadn’t yet started talking steps on key responsibilities of its oversight role, like establishing how information would be distributed to stakeholders or how frequently the board would meet.

Ultimately, the Army watercraft is “how the ground force, the Army, gets to war,” Butler said.

“Maybe the future fleet is all autonomous, we just don’t know,” he said. “This is all stuff we’re looking at in terms of trying to modernize the way we move people, weapons, and equipment.”

"""

# Tokenize and generate summary.
# The article is wrapped in the same instruction template used for the training inputs
# and for the test-set samples, and the tensors are moved to the model's device.
# NOTE(review): inputs longer than 512 tokens are cut at the end, so the tail of a long
# article (and the trailing "Summary:" cue) may be dropped.
prompt = "Summarize the following news article:\n\n" + text.strip() + "\nSummary:"
inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)
outputs = model.generate(**inputs, max_length=200, min_length=100, num_beams=5, length_penalty=2.0, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(summary)


at the end of the month, Army spokeswoman Cynthia Smith said on Friday, adding that one vessel is currently being transported back to the US and another vessel is undergoing “routine maintenance” which is expected to be completed by the end of the month. The Army’s watercraft, which carried out the temporary Gaza pier mission earlier this year, are poorly maintained and largely unprepared to meet the military’s growing mission in the Pacific, a new government oversight report said this week.


In [None]:
text = """
Climate change is one of the most pressing issues of our time, with widespread impacts on the environment, human health, and the global economy. The scientific community overwhelmingly agrees that human activities, particularly the burning of fossil fuels, have accelerated the natural process of climate change by releasing large amounts of greenhouse gases, such as carbon dioxide, into the atmosphere.

The effects of climate change are already evident in rising global temperatures, melting polar ice caps, and increasing sea levels. Extreme weather events, such as hurricanes, floods, droughts, and wildfires, are becoming more frequent and severe. These changes have devastating effects on ecosystems, biodiversity, and agriculture. The loss of biodiversity, in turn, threatens food security and increases the vulnerability of human populations to diseases and natural disasters.

Moreover, climate change has significant economic implications. As temperatures rise and weather patterns become more unpredictable, industries such as agriculture, fishing, and tourism are facing new challenges. For instance, droughts and floods disrupt crop production, leading to food shortages and higher prices. Coastal communities are at risk due to rising sea levels, which could displace millions of people and cause billions of dollars in property damage. Additionally, health-related costs are expected to increase as heatwaves and pollution exacerbate respiratory and cardiovascular diseases.

Efforts to combat climate change have gained momentum in recent years, with international agreements such as the Paris Agreement setting ambitious goals to limit global warming. Governments, corporations, and individuals are being urged to reduce their carbon footprints by adopting renewable energy sources, improving energy efficiency, and implementing sustainable practices. However, transitioning to a low-carbon economy requires substantial investments in technology, infrastructure, and education. Developing countries, in particular, face additional challenges in balancing economic growth with environmental sustainability.

While significant progress has been made, the pace of change is still too slow to avoid the most severe consequences of climate change. Global leaders are now focusing on mitigation strategies, such as reducing greenhouse gas emissions, and adaptation strategies, which involve adjusting to the changes that are already inevitable. The next few decades will be crucial in determining whether humanity can rise to the challenge of climate change and safeguard the planet for future generations.
"""

# Tokenize and generate summary with adjusted parameters.
# The text is wrapped in the instruction template used for the training inputs.
prompt = "Summarize the following news article:\n\n" + text.strip() + "\nSummary:"
inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)
outputs = model.generate(
    **inputs,
    max_length=100,      # Maximum summary length, in tokens
    min_length=50,       # Minimum length in tokens; kept below max_length so the model
                         # can end the summary itself instead of being cut at the limit
    num_beams=7,         # Number of beams for beam search
    length_penalty=1.5,  # Encourage longer summaries
    early_stopping=True
)

# Decode and print the summary
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(summary)


Climate change is one of the most pressing issues of our time, with widespread impacts on the environment, human health, and the global economy. The scientific community overwhelmingly agrees that human activities, particularly the burning of fossil fuels, have accelerated the natural process of climate change by releasing large amounts of greenhouse gases, such as carbon dioxide, into the atmosphere. The effects of climate change are already evident in rising global temperatures, melting polar ice caps, and increasing sea levels.


In [None]:
text = """
Remote work has seen a significant rise in popularity, especially following the global pandemic that forced many companies to adopt new ways of operating. One of the most prominent benefits of remote work is flexibility. Employees have the ability to work from anywhere, eliminating the need for long commutes and allowing for a better work-life balance. This flexibility can lead to increased productivity, as workers can create a personalized environment that suits their preferences and minimizes distractions.

For companies, remote work can result in reduced overhead costs. Without the need for large office spaces, companies can save on rent, utilities, and other office-related expenses. Additionally, it allows businesses to tap into a broader talent pool. By removing geographical barriers, companies can hire employees from different cities, states, or even countries, promoting diversity and inclusion in the workplace.

However, remote work also comes with its own set of challenges. One of the main concerns is communication. Without face-to-face interactions, maintaining clear and effective communication among team members can be difficult. Virtual meetings and email communication may not always convey the full context of a discussion, leading to misunderstandings. Building team camaraderie and maintaining company culture are also challenging in a remote setup, as spontaneous interactions and social events are limited.

Another issue is the potential for burnout. While remote work offers flexibility, it can also blur the boundaries between work and personal life. Many remote workers find themselves working longer hours or being unable to disconnect from work, leading to increased stress and burnout. Additionally, some employees may feel isolated or disconnected from their teams, impacting their mental well-being.

In conclusion, remote work offers many benefits, including flexibility, cost savings, and access to a wider talent pool. However, it also presents challenges, particularly in communication, maintaining work-life balance, and fostering team connections. As companies continue to navigate the future of work, finding the right balance between remote and in-office work may be key to long-term success.
"""

# Tokenize and generate summary with adjusted parameters.
# The text is wrapped in the instruction template used for the training inputs.
prompt = "Summarize the following news article:\n\n" + text.strip() + "\nSummary:"
inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)
outputs = model.generate(
    **inputs,
    max_length=150,      # Maximum summary length, in tokens
    min_length=100,      # Minimum summary length, in tokens (not words)
    num_beams=7,         # Number of beams for beam search
    length_penalty=1.5,  # Encourage longer summaries
    early_stopping=True
)

# Decode the summary
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Wrap the text to 50 columns for readability
wrapped_text = textwrap.fill(summary, width=50)

# Print the wrapped text
print(wrapped_text)


The benefits of remote work include flexibility,
cost savings, and access to a wider talent pool.
The challenges of remote work include
communication, work-life balance, and fostering
team connections. Find the right balance between
remote and in-office work as companies continue to
navigate the future of work, especially following
the global pandemic that forced many companies to
adopt new ways of operating. Read on to learn how
to balance the benefits of remote work with the
challenges of in-office work and in-office work.


In [None]:
text = """Jane Collins is a 34-year-old software engineer living in Seattle, Washington. She’s known for her curiosity, problem-solving skills, and love for learning. Jane grew up in a small town in Oregon and always had a passion for technology. As a child, she would spend hours tinkering with computers, taking them apart and putting them back together just to understand how they worked.

After completing her degree in computer science, Jane began her career as a junior developer at a small startup. Over the years, she honed her skills in coding, particularly in Python and JavaScript, and quickly rose through the ranks. Today, she works at a leading tech company, developing cutting-edge software solutions for clients worldwide. Her ability to write clean, efficient code has earned her recognition from her peers, and she often mentors junior developers, sharing her knowledge and expertise.

Beyond her professional life, Jane is an advocate for diversity in tech. She’s actively involved in initiatives that aim to bring more women and underrepresented groups into the technology industry. Jane frequently speaks at conferences and participates in workshops that encourage young girls to pursue careers in STEM fields.

In her free time, Jane enjoys hiking and exploring the Pacific Northwest’s natural beauty. On weekends, you can often find her on a trail, surrounded by lush forests or at the foot of a mountain, appreciating nature’s tranquility. She also has a creative side, dabbling in photography and painting, capturing moments from her adventures and turning them into art.

Despite her busy schedule, Jane values balance and mindfulness. She practices yoga regularly and believes in the importance of taking time for self-care. Her friends describe her as driven yet compassionate, always willing to lend a hand and inspire those around her to achieve their best."""


# Beam-search settings for this summary. Both length bounds are counted in
# generated tokens, not words.
generation_kwargs = dict(
    num_beams=7,          # beams explored in parallel
    early_stopping=True,  # finish once enough complete beams exist
    length_penalty=1.5,   # > 0 biases beam scoring toward longer outputs
    min_length=100,       # lower bound on summary length (tokens)
    max_length=150,       # upper bound on summary length (tokens)
)

# Encode the profile as PyTorch tensors, clipping the input to 512 tokens.
inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
outputs = model.generate(**inputs, **generation_kwargs)

# Only one sequence is returned; turn it back into plain text.
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Re-flow to 50 columns so the summary reads well in the cell output.
wrapped_text = textwrap.fill(summary, width=50)
print(wrapped_text)


Jane grew up in a small town in Oregon and always
had a passion for technology. As a child, she
would spend hours tinkering with computers, taking
them apart and putting them back together just to
understand how they worked. Jane began her career
as a junior developer at a small startup. Over the
years, she honed her skills in coding,
particularly in Python and JavaScript, and quickly
rose through the ranks. Jane frequently speaks at
conferences and participates in workshops that
encourage young girls to pursue careers in STEM
fields. In her free time, Jane enjoys hiking and
exploring the Pacific Northwest’s natural beauty.


In [None]:
text = """Lebanon’s foreign minister said he was disappointed with US President Joe Biden’s remarks at the United Nations on the escalating conflict between Israel and Hezbollah, adding he believes the United States “is the only country that can really make a difference” in the region.

Biden only briefly mentioned the widening of fighting in the Middle East in his final speech to the UN General Assembly, which came as cross-border strikes killed hundreds.

“It was not strong. It is not promising, and it would not solve this problem,” Foreign Minister Abdallah Bou Habib said of Biden’s speech during a virtual event hosted by the Carnegie Endowment for International Peace in New York on Tuesday.

Referring to the war in Gaza and fears of a wider regional conflict, Bou Habib said Israel’s government “cannot survive except with wars,” and he also criticized the killing of civilians in Lebanon.

An estimated half a million people have been displaced in Lebanon, Bou Habib added.

Last week, Israel made the safe return of residents to northern Israel a war goal following nearly a year of cross-border exchanges of fire with Hezbollah.

Biden’s comments: “Too many on each side of the Israeli-Lebanon border remain displaced,” he said. “Full-scale war is not in anyone’s interest.”

While acknowledging “the situation has escalated,” he said: “a diplomatic solution is still possible.”"""

# Summarize `text` with deterministic beam search.
#
# The earlier `temperature=0.1` argument was removed: temperature only
# rescales logits when sampling is enabled (`do_sample=True`). With plain beam
# search it is ignored, and recent transformers releases warn about it.
# NOTE(review): assumes the model's generation config leaves `do_sample` at its
# default (False) - confirm. To experiment with sampling, pass `do_sample=True`
# together with `temperature`.
#
# `min_length` is deliberately not passed here, so whatever minimum the model's
# own generation config specifies applies.
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
outputs = model.generate(
    **inputs,
    max_length=150,      # upper bound on summary length, in tokens
    num_beams=7,         # number of beams for beam search
    length_penalty=1.5,  # > 0 biases beam scoring toward longer outputs
    early_stopping=True,
)

# Decode the single returned sequence back to plain text.
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Re-flow to 50 columns so the summary reads well in the cell output.
wrapped_text = textwrap.fill(summary, width=50)
print(wrapped_text)


Abdallah Bou Habib said he was disappointed by
Biden’s remarks on the escalating conflict between
Israel and Hezbollah


In [None]:

text = """Hezbollah on Tuesday said it targeted a naval base on Israel’s northern Mediterranean coast that houses an elite Israeli naval commando unit.

The Shayetet 13 unit that operates out of the Atlit naval base carries out “strategic sabotage activity across enemy lines, including damaging vital structures during war time and the enemy’s seaports,” according to the Israeli military.

Earlier Tuesday, the Israel Defense Forces said sirens were heard in the area of Atlit, south of the city of Haifa. “No injuries were reported,” the IDF said. It did not say whether the naval base was targeted.

Hezbollah has been trading fire with Israel since Monday, when Israel launched strikes across Lebanon that killed more than 500 people, including women and children.The Israeli military says it is targeting the Iran-backed militant group.

Earlier Tuesday, the IDF also said it carried out another round of “extensive strikes” on Hezbollah targets in Lebanon.

Hezbollah previously said it fired multiple rocket barrages into northern Israel overnight through Tuesday, saying it struck the Ramat David Airbase, Megiddo Airfield and the Amos base, all located in the vicinity of the town of Afula, northern Israel.

"""

# Summarize `text` with deterministic beam search.
#
# The earlier `temperature=1` argument was removed: temperature only rescales
# logits when sampling is enabled (`do_sample=True`), so "adjusting" it here
# could never change the output.
# NOTE(review): assumes the model's generation config leaves `do_sample` at its
# default (False) - confirm. To experiment with sampling, pass `do_sample=True`
# together with `temperature`.
#
# `min_length` is deliberately not passed here, so whatever minimum the model's
# own generation config specifies applies.
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
outputs = model.generate(
    **inputs,
    max_length=150,      # upper bound on summary length, in tokens
    num_beams=7,         # number of beams for beam search
    length_penalty=1.5,  # > 0 biases beam scoring toward longer outputs
    early_stopping=True,
)

# Decode the single returned sequence back to plain text.
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Re-flow to 50 columns so the summary reads well in the cell output.
wrapped_text = textwrap.fill(summary, width=50)
print(wrapped_text)


Israel Defense Forces said sirens were heard in
the area of Atlit, south of Haifa.


In [None]:
text ="""
Challenges in tackling PM2.5 Crisis
Nipon Poapongsakorn, Kamphol Pantakua, and Suthipat Ratchakorn

Chiang Mai has repeatedly been named the world’s most air-polluted city this year. Not exactly a title to be envious about.The city was hit hard by PM2.5 ultra-fine dust on March 31. IQair.com, an air quality reporting website, reported Chiang Mai had an air quality index of 320 with an ultra-fine PM2.5 dust level of 269.2 microgrammes per cubic metre (μg/m3), far beyond the World Health Organization (WHO)’s “safe” limit of 25 μg/m3.
According to the World Bank, PM2.5 dust caused damages of 210 billion baht in 1990. This jumped to 870 billion baht in 2013. Meanwhile, a 2019 study by one economist  found the financial burden from PM2.5 on households in Bangkok and its satellite provinces amounted to 436 billion baht a year.
The impact on human health is clearly the most serious concern. The State of Global Air report blamed toxic haze for 32,200 deaths in 2019. Greenpeace (Thailand) recently linked the ultra-fine dust to 29,000 premature deaths in 36 provinces — making it a much bigger threat than the death toll from traffic accidents, narcotics and homicides.
Fumes from car exhausts, especially diesel engines, and traffic congestion are the major causes of air pollution in Bangkok and other big cities. As of January, there were 3.2 million diesel engine pick-up trucks in Bangkok, or 27.5% of the 11.6 million registered vehicles in the capital. Other causes of toxic dust are the fossil fuels used in power plants, as well as waste burning and household activities. On the other hand, air pollution in rural areas is associated with the open burning of major cash crops such as sugarcane and maize. Due to farm labor shortages and expensive machinery, burning is still seen as the cheapest method. The situation gets worse with the spread of wildfires, intensified by the dryness during this season.
It is a deep-seated structural problem that prevents the government from resolving the haze issue. To begin with, the panel tasked to oversee the PM2.5 crisis operates as an ad hoc body without any consistency.  Most of the budget allocated to tackling haze is concentrated within departments in the central administration but not the local agencies. To give a clear picture, it could be said that of a four-baht request, the Budget Bureau would only hand over one baht to be shared among a group of agencies instead of cooperating and streamlining their tasks.
This greatly affects how the money can cascade down into localities. What happens is the provincial budget keeps dropping — from 48.6 billion baht in 2018 to 19.6 billion baht in 2020 — compared to the cluster budget, which was halved from 16.3 billion baht in 2018 to 8.4 billion baht in 2020. The budget structure for both the provinces and clusters for 2024 is the same as that in 2020.
The government must solve these problems at their core by tackling the inadequate state regulations, ineffective bureaucracy and isolated management. The government cannot stick with its same old centrality; different departments must be given authority to have “departmentocracy”. The government must look at haze as a chronic health and environmental threat, instead of “seasonal air pollution” that will be dissipated over time when the rain comes.
To deal with it in a more effective and focused manner, Thailand needs better quality data on airshed areas. The government must fund research to learn more about the volume and movement of this kind of air pollution. Armed with this information, provinces that fall under the same airshed could work together, and ensure the appropriate budget allocation. Cooperation should also be fostered with neighbouring countries and Thai businesses that promote monocrops like maize or sugarcane in Cambodia, Laos, and Myanmar.
For the agricultural sector, the government should provide subsidies to farmers keen to turn to agricultural machinery, so they can stop burning waste on their farms. Studies and increased development should be promoted for alternative plants that may create new income-generating jobs.
In the long term, the government must strive to establish a mandatory carbon market so that local people will be incentivised to protect the forest and make money from carbon trading. The government must also have the courage to collect a carbon tax, based on the social cost from human activities, so that the national and local governments will have sufficient funds to mitigate the impact and provide remedies to affected parties and areas.
For the transport sector, the government must be more ambitious in aiming for a higher and tougher environmental benchmark. It must launch a policy and plan to have all diesel engines phased out by imposing more diesel tax and replacing these with cleaner electric vehicles (EV) or hydrogen car engines. This means owners must be granted incentives and subsidies to make the transition to cleaner engines.
_________________________________________________
Poapongsakorn, N., Pantakua, K., & Ratchakom, S. (2023). Challenges in tackling the PM2.5 Crisis. Policy analyses from the Thailand Development Research Institute (TDRI).
"""


import warnings  # stdlib; used only to flag a clipped input below

# Encode the article as PyTorch tensors, clipping the input to a fixed budget.
max_input_tokens = 512
inputs = tokenizer(
    text, return_tensors="pt", max_length=max_input_tokens, truncation=True
)

# A long article is clipped silently by `truncation=True`, and the summary then
# reflects only the part that survived. If the encoded length hits the cap,
# say so explicitly instead of presenting a partial summary as a full one.
if inputs["input_ids"].shape[-1] >= max_input_tokens:
    warnings.warn(
        f"Input reached the {max_input_tokens}-token cap and was probably "
        "truncated; the summary only covers the part of the text that fit."
    )

outputs = model.generate(
    **inputs,
    max_length=350,      # upper bound on summary length, in tokens
    min_length=150,      # lower bound on summary length, in tokens (not words)
    num_beams=7,         # number of beams for beam search
    length_penalty=1.5,  # > 0 biases beam scoring toward longer outputs
    early_stopping=True,
)

# Decode the single returned sequence back to plain text.
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Re-flow to 50 columns so the summary reads well in the cell output.
wrapped_text = textwrap.fill(summary, width=50)
print(wrapped_text)
