In [1]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers import pipeline
from datasets import Dataset, DatasetDict
import evaluate
import wandb
import numpy as np

Load the fine-tuned model checkpoint and the validation dataset.

In [3]:
# Local checkpoint directory to evaluate (the path name suggests a 10-epoch
# fine-tuning run on the full dataset).
checkpoint = "T5_model_full_dataset_10epoch_training/checkpoint-26500"

# Weights and tokenizer are both read from the same checkpoint directory.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Summarization pipeline built on the loaded objects and pinned to the CPU.
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    device="cpu",
)

In [5]:
# dataset
# Directory that holds the exported CSV splits (a relative path, so it is
# resolved against the notebook's working directory).
data_path = ".."

def make_dataset(dataframe):
    """Build a `Dataset` holding only the "dialogue" and "summary" columns.

    Args:
        dataframe: pandas DataFrame containing at least the "dialogue" and
            "summary" columns; every other column is dropped.

    Returns:
        A `datasets.Dataset` created from those two columns.

    Raises:
        KeyError: if either column is missing from `dataframe`.
    """
    data = dataframe[["dialogue", "summary"]]
    # Drop the pandas index explicitly: with the default setting a
    # non-RangeIndex (e.g. after filtering or shuffling rows) is stored as an
    # extra `__index_level_0__` column in the resulting dataset.
    return Dataset.from_pandas(data, preserve_index=False)
    
# Only the validation split is loaded in this cell; the sample table logged
# further down reads nothing but dataset_dict["validation"].
data_val = pd.read_csv(
    os.path.join(data_path, "output_validation.csv")
)
dataset_val = make_dataset(data_val)

dataset_dict = DatasetDict({"validation": dataset_val})

### Log final results

In [10]:
# Re-attach to the existing W&B run with this id (presumably the fine-tuning
# run) so the sample table is logged into it. `resume="must"` makes wandb
# raise if that run cannot be found instead of quietly starting a new,
# unrelated run; the boolean `resume=True` form is documented as deprecated
# in recent wandb releases.
run = wandb.init(id="0239ihaa", project="tg-summarizer", resume="must")




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114486462126176, max=1.0…

In [18]:
from tqdm.auto import tqdm

# Number of validation examples to summarize and log. It is clamped to the
# split size below so that a validation set with fewer rows does not make
# `select` raise IndexError.
n_samples = 10
validation_split = dataset_dict["validation"]
validation_data = validation_split.select(
    range(min(n_samples, len(validation_split)))
)

# One row per example: source dialogue, reference summary, model output.
table = wandb.Table(columns=["Input Text", "Target Summary", "Generated Summary"])

for example in tqdm(validation_data):
    input_text = example["dialogue"]
    target_summary = example["summary"]

    # Beam-search generation; the result is indexed as a list holding one
    # dict with a "summary_text" entry.
    generated_summary = summarizer(
        input_text,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    table.add_data(input_text, target_summary, generated_summary[0]["summary_text"])

# Log the filled table to the W&B run opened earlier in the notebook.
run.log({"summarization_after_fine_tuning": table})



  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# End the active W&B run (per the wandb docs this also finishes uploading
# whatever was logged to it).
wandb.finish()

### Upload the model to the Hugging Face Hub

In [None]:
# Run once if `huggingface_hub` is missing (use %pip so the install targets this kernel):
# %pip install huggingface_hub

In [16]:
from huggingface_hub import notebook_login

# Renders the Hugging Face login widget in the cell output; authentication is
# needed before the push_to_hub calls that follow.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
# Target repository name on the Hub. No namespace prefix is given; the commit
# URL in the cell output shows it landing under the logged-in account.
public_name = "ru_t5_chat_sum"
# Push the model weights first, then the tokenizer files, to the same repo.
model.push_to_hub(public_name)
tokenizer.push_to_hub(public_name)


model.safetensors:   0%|          | 0.00/977M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/828k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Hacker1337/ru_t5_chat_sum/commit/559a923d461b1c371aa8a32b426347c86b93106f', commit_message='Upload tokenizer', commit_description='', oid='559a923d461b1c371aa8a32b426347c86b93106f', pr_url=None, pr_revision=None, pr_num=None)

## Upload the dataset to the Hugging Face Hub

In [5]:
# dataset
# Directory that holds the exported CSV splits (a relative path, so it is
# resolved against the notebook's working directory).
data_path = ".."

def make_dataset(dataframe):
    """Build a `Dataset` holding only the "dialogue" and "summary" columns.

    Args:
        dataframe: pandas DataFrame containing at least the "dialogue" and
            "summary" columns; every other column is dropped.

    Returns:
        A `datasets.Dataset` created from those two columns.

    Raises:
        KeyError: if either column is missing from `dataframe`.
    """
    data = dataframe[["dialogue", "summary"]]
    # Drop the pandas index explicitly: with the default setting a
    # non-RangeIndex (e.g. after filtering or shuffling rows) is stored as an
    # extra `__index_level_0__` column, which would then be uploaded with the
    # dataset.
    return Dataset.from_pandas(data, preserve_index=False)
    
# Read the three exported CSV splits from `data_path`.
csv_names = ("output_train.csv", "output_validation.csv", "output_test.csv")
data_train, data_val, data_test = (
    pd.read_csv(os.path.join(data_path, csv_name)) for csv_name in csv_names
)

# Wrap every frame as a Dataset, keeping the train / validation / test order.
dataset_train, dataset_val, dataset_test = map(
    make_dataset, (data_train, data_val, data_test)
)

# Bundle the splits under the names used when the dataset is pushed to the Hub.
dataset_dict = DatasetDict({
    "train": dataset_train,
    "validation": dataset_val,
    "test": dataset_test,
})

In [4]:
from huggingface_hub import notebook_login

# Renders the Hugging Face login widget in the cell output; authentication is
# needed before the dataset upload that follows.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
# Upload every split held in `dataset_dict` to the "Hacker1337/ru_dialogsum"
# dataset repository on the Hub.
dataset_dict.push_to_hub("Hacker1337/ru_dialogsum")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

In [9]:
from datasets import load_dataset

# Round-trip check: download the dataset from the Hub repository it was pushed
# to and display its splits as the cell output.
loaded_dataset = load_dataset("Hacker1337/ru_dialogsum") 
loaded_dataset

Downloading readme:   0%|          | 0.00/4.48k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/374k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/472k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['dialogue', 'summary'],
        num_rows: 500
    })
    test: Dataset({
        features: ['dialogue', 'summary'],
        num_rows: 1500
    })
})