In [None]:
!pip install transformers datasets pandas torch

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments


# Load the CSV file
df = pd.read_csv('datasets.csv')

# Convert the DataFrame to a Hugging Face Dataset. Kept under its own name so
# `dataset` below only ever refers to the train/test split.
full_dataset = Dataset.from_pandas(df)

# Split the dataset into training (90%) and validation (10%) sets.
# A fixed seed makes the split identical on every Restart & Run All.
dataset = full_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split

# Load dataset
def load_data(csv_path, test_size=0.1, random_state=42):
    """Read a summarization CSV and split it into train/validation lists.

    Args:
        csv_path: Path to a CSV file containing 'text' and 'summary' columns.
        test_size: Fraction of rows held out for validation.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible between runs.

    Returns:
        A tuple ``(train_texts, val_texts, train_summaries, val_summaries)``
        of Python lists.

    Raises:
        KeyError: If the CSV lacks a 'text' or 'summary' column.
    """
    df = pd.read_csv(csv_path)

    required = ['text', 'summary']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required column(s): {missing}")

    # Only the two columns that are actually used decide whether a row is
    # usable; a NaN in an unrelated column must not discard training data.
    df = df.dropna(subset=required)

    train_texts, val_texts, train_summaries, val_summaries = train_test_split(
        df['text'].tolist(),
        df['summary'].tolist(),
        test_size=test_size,
        random_state=random_state,
    )
    return train_texts, val_texts, train_summaries, val_summaries

# Convert to Hugging Face Dataset
def convert_to_hf_dataset(texts, summaries, tokenizer, max_length=512,
                          max_target_length=150, prefix="summarize: "):
    """Tokenize source texts and target summaries into a Hugging Face Dataset.

    Args:
        texts: List of source documents (strings).
        summaries: List of reference summaries, aligned with ``texts``.
        tokenizer: Tokenizer used for both sources and targets.
        max_length: Maximum (padded/truncated) length of the source sequence.
        max_target_length: Maximum (padded/truncated) length of the target.
        prefix: Task prefix prepended to every source text. It defaults to the
            same "summarize: " prefix that ``summarize`` adds at inference
            time, so training and inference inputs have the same format.

    Returns:
        A Dataset with 'input_ids', 'attention_mask' and 'labels' columns.
    """
    sources = [prefix + text for text in texts]
    encodings = tokenizer(sources, truncation=True, padding='max_length', max_length=max_length)
    targets = tokenizer(summaries, truncation=True, padding='max_length', max_length=max_target_length)

    # Padding positions in the labels are set to -100 so the loss ignores
    # them; otherwise the model is also trained to predict pad tokens, which
    # makes the reported loss look better than it really is.
    pad_id = tokenizer.pad_token_id
    labels = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in targets['input_ids']
    ]

    dataset = HFDataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    })
    return dataset

# Load model and tokenizer
def load_model(model_name="t5-small"):
    """Load a pretrained T5 model and its tokenizer.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained`` for
            both the tokenizer and the model. Defaults to "t5-small".

    Returns:
        A tuple ``(model, tokenizer, device)`` where ``device`` is the CUDA
        device when one is available and the CPU otherwise; the model has
        already been moved to that device.
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    return model, tokenizer, device

# Train model
def train_model(train_dataset, val_dataset, model, tokenizer,
                save_dir="./summarization_model"):
    """Fine-tune ``model`` with the Trainer API and save it with its tokenizer.

    Args:
        train_dataset: Tokenized dataset used for training.
        val_dataset: Tokenized dataset evaluated at the end of every epoch.
        model: Model to fine-tune (updated in place).
        tokenizer: Tokenizer saved next to the model so the output directory
            can be reloaded on its own.
        save_dir: Directory that receives the final model and tokenizer.
    """
    import inspect

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy` and reject the old spelling, while older releases only
    # know the old one. Use whichever keyword the installed version accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in accepted:
        strategy_key = "eval_strategy"
    else:
        strategy_key = "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        save_steps=500,
        save_total_limit=2,
        logging_dir="./logs",
        **{strategy_key: "epoch"},
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()

    # Persist model and tokenizer side by side.
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

# Summarization function
def summarize(text, model, tokenizer, device, max_length=150):
    """Generate a summary of ``text`` with beam search.

    Args:
        text: Document to summarize.
        model: Sequence-to-sequence model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the model lives on; input tensors are moved to it.
        max_length: Maximum length of the generated sequence.

    Returns:
        The decoded summary string with special tokens removed.
    """
    input_text = "summarize: " + text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Move input tensors to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Generate in eval mode so dropout cannot perturb the output, then put the
    # model back into whatever mode the caller had it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model.generate(**inputs, max_length=max_length, num_beams=4, early_stopping=True)
    finally:
        model.train(was_training)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# End-to-end run: load the CSV, fine-tune the model, then summarize one sample.
if __name__ == "__main__":
    # CSV read by load_data, which uses its 'text' and 'summary' columns.
    csv_path = "/content/datasets.csv"  # Update with your file path
    train_texts, val_texts, train_summaries, val_summaries = load_data(csv_path)

    # The tokenizer is needed before the raw strings can be tokenized below.
    model, tokenizer, device = load_model()
    train_dataset = convert_to_hf_dataset(train_texts, train_summaries, tokenizer)
    val_dataset = convert_to_hf_dataset(val_texts, val_summaries, tokenizer)

    # Trains the model and writes model + tokenizer to ./summarization_model.
    train_model(train_dataset, val_dataset, model, tokenizer)

    # Smoke test: summarize a short hand-written paragraph with the tuned model.
    test_text = "The Hugging Face Transformers library provides an easy-to-use interface for working with state-of-the-art natural language processing models.It supports a wide range of tasks, including text classification, question answering, and text summarization.With just a few lines of code, you can fine-tune pre-trained models on your own dataset and deploy them for inference."
    summary = summarize(test_text, model, tokenizer, device)
    print("Generated Summary:", summary)



Epoch,Training Loss,Validation Loss
1,0.503,0.218897
2,0.2672,0.20122
3,0.2401,0.191538
4,0.221,0.18572
5,0.2212,0.186989


Generated Summary: the Hugging Face Transformers library provides an easy-to-use interface for working with state-of-the-art natural language processing models. It supports a wide range of tasks, including text classification, question answering, and text summarization.


In [None]:
# Write the fine-tuned model and its tokenizer to the same directory; the
# cell's output is the tuple of tokenizer files that were written.
save_dir = "./summarization_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./summarization_model/tokenizer_config.json',
 './summarization_model/special_tokens_map.json',
 './summarization_model/spiece.model',
 './summarization_model/added_tokens.json')