In [None]:
import json
import re
from transformers import T5Tokenizer
# Load the T5 tokenizer for the "t5-small" checkpoint.
# NOTE(review): a later cell rebinds `tokenizer` via AutoTokenizer for the same
# checkpoint, so this instance is only used by cells executed before that one.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [12]:
# Read the corpus: one JSON object per line (JSON Lines format).
# Whitespace-only lines are skipped so that a stray blank line in the file
# does not abort the whole load with json.JSONDecodeError.
with open('papers.SSN.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [13]:
from datasets import Dataset, DatasetDict

In [None]:
# Sanity check: number of records parsed from the JSONL file.
len(data)

In [18]:
# Define target domains
target_domains = {"Sociology", "History", "Biology", "Geography", "Geology"}


def _clean_and_join(sentences):
    """Strip non-alphanumeric characters from each sentence and join with spaces.

    The exact nesting of the abstract/text fields is not visible from this
    notebook, so the helper accepts any of: a single string, a flat list of
    sentence strings, or a list of lists of sentence strings. Sentences are
    always separated by one space so that the last word of one sentence is
    never fused with the first word of the next.

    Args:
        sentences: str, list[str], or list[list[str]].

    Returns:
        One string containing every cleaned sentence, space-separated.
    """
    if isinstance(sentences, str):
        sentences = [sentences]

    cleaned = []
    for item in sentences:
        # A bare string is one sentence; anything else is treated as a group
        # of sentences (e.g. one section of the paper).
        group = [item] if isinstance(item, str) else item
        for sentence in group:
            # Keep only ASCII letters, digits and whitespace.
            cleaned.append(re.sub(r"[^a-zA-Z0-9\s]", "", sentence))
    return " ".join(cleaned)


# Initialize the parallel column lists (one element per selected paper)
paper_id, title, abstract, text, domain = [], [], [], [], []

# Keep only entries whose 'domain' field mentions at least one target domain
for entry in data:
    if 'domain' in entry and any(d in entry['domain'] for d in target_domains):
        paper_id.append(entry['paper_id'])
        title.append(entry['title'])

        # Abstract and body text go through the same cleaning routine so the
        # summarization targets and inputs are normalized consistently.
        abstract.append(_clean_and_join(entry['abstract']))
        text.append(_clean_and_join(entry['text']))

        domain.append(entry['domain'])

In [None]:
# Tokenize every cleaned document in a single call (no truncation or padding
# arguments are passed).
# NOTE(review): `tokens` is not referenced by any later cell visible here, and
# the model inputs are rebuilt inside preprocess_function — confirm this cell
# is still needed.
tokens = tokenizer(text)

In [22]:
# Trim every column to the length of the shortest one so that all columns
# line up row-for-row before they are combined into a dataset.
min_length = min(map(len, (paper_id, title, abstract, text, domain)))

paper_id, title, abstract, text, domain = [
    column[:min_length]
    for column in (paper_id, title, abstract, text, domain)
]

In [24]:
# Bundle the parallel column lists into one Dataset, one row per paper
ssn_dataset = Dataset.from_dict(
    dict(
        paper_id=paper_id,
        title=title,
        abstract=abstract,
        text=text,
        domain=domain,
    )
)

In [26]:
# Hold out 20% of the papers for evaluation. A fixed seed makes the shuffle,
# and therefore the train/test membership, reproducible across kernel restarts.
# The result replaces `ssn_dataset` with a dict-like object holding the
# "train" and "test" splits used by the later cells.
ssn_dataset = ssn_dataset.train_test_split(test_size=0.2, seed=42)

In [28]:
from transformers import AutoTokenizer

In [30]:
# Name of the pretrained checkpoint shared by the tokenizer, the data collator
# and the model loaded in later cells.
checkpoint = "t5-small"
# Rebinds `tokenizer`; from here on every cell uses this AutoTokenizer instance.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [32]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of papers for sequence-to-sequence training.

    Args:
        examples: batch mapping with a "text" column (source documents) and
            an "abstract" column (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        abstracts (truncated to 128 tokens).
    """
    # Prepend the task prefix to every source document before encoding.
    model_inputs = tokenizer(
        [prefix + doc for doc in examples["text"]],
        max_length=1024,
        truncation=True,
    )

    # Encode the reference summaries as targets.
    target_encoding = tokenizer(
        text_target=examples["abstract"],
        max_length=128,
        truncation=True,
    )
    model_inputs["labels"] = target_encoding["input_ids"]

    return model_inputs


In [None]:
# Apply preprocess_function to the dataset in batches (batched=True passes
# lists of examples to the function rather than single rows).
tokenized_ssn_dataset = ssn_dataset.map(preprocess_function, batched=True)

In [36]:
from transformers import DataCollatorForSeq2Seq
# Collator handed to the trainer below to assemble batches.
# NOTE(review): `model` receives the checkpoint name string ("t5-small"), not a
# loaded model object (the model is only instantiated in a later cell) —
# confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [38]:
import evaluate
# Load the ROUGE metric; compute_metrics calls rouge.compute(...) on decoded text.
rouge = evaluate.load("rouge")

In [40]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the generated ids.

    Returns:
        dict mapping every key returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-padding tokens per prediction) to a value rounded
        to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is used as an ignore/padding index and is not a valid vocabulary id,
    # so it must be swapped for the pad token before decoding. This applies to
    # the predictions as well as the labels: generated sequences of different
    # lengths can be padded with -100 when batches are concatenated, and
    # decoding a negative id fails.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained sequence-to-sequence model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Training configuration. Evaluation and checkpointing both happen once per
# epoch. For a quick smoke test, an earlier revision of this cell used
# max_steps=2 together with per-device batch sizes of 1.
training_args = Seq2SeqTrainingArguments(
    output_dir="./tmp_test",  # Output/checkpoint directory
    learning_rate=1e-5,
    num_train_epochs=10,  # Full passes over the training split
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Save a checkpoint at the end of every epoch
    save_total_limit=2,  # Keep only the 2 most recent models
    logging_strategy="steps",
    logging_steps=200,  # Log every 200 steps
    fp16=True,
    gradient_accumulation_steps=2,  # Effective batch size is doubled
    predict_with_generate=True,  # compute_metrics decodes the predicted token ids
    dataloader_num_workers=4,
)

# Wire the model, tokenized splits, collator and metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ssn_dataset["train"],
    eval_dataset=tokenized_ssn_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()

In [None]:
import os

from huggingface_hub import login

# Never hard-code access tokens in a notebook: they end up in version control
# and in every shared copy of the file. The token is read from the HF_TOKEN
# environment variable instead; when the variable is unset, login() receives
# None and falls back to its interactive prompt.
# NOTE: the token that was previously embedded in this cell must be treated as
# leaked — revoke it in the Hugging Face account settings and issue a new one.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Upload the trainer's model to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()

In [1]:
# Sample passage for a manual inference check; it starts with the same
# "summarize: " task prefix that preprocess_function prepends during training.
# NOTE(review): this rebinds `text`, which earlier cells use for the list of
# cleaned documents — re-running those cells afterwards will see a string.
text = "summarize: Ten settlements from the culture have been found.[1] The first six sites discovered were: the type site at Baodun in Xinjin County, the site at Mangcheng in Dujiangyan City, the site at Yufu in Wenjiang County, the site at Zizhu in Chongzhou, the site at Shuanghe in Chongzhou, and the site at Gucheng in Pi County. Yufucun is the second largest site associated with the Baodun culture. All of the settlements straddle the Min River. The settlement walls were covered with pebbles, a feature unique to the Baodun culture. The pottery from the culture share some similarities with Sanxingdui. The inhabitants lived in wattle and daub houses.[2] The earliest evidence for rice and foxtail millet agriculture in southwest China was discovered at the type site at Baodun.[1]"

In [None]:
from transformers import pipeline

# Build a summarization pipeline from the "rcook/tmp_test" model id
# (presumably the repository created by trainer.push_to_hub() — verify) and
# run it on the sample passage; the cell displays the pipeline's output.
summarizer = pipeline("summarization", model="rcook/tmp_test")
summarizer(text)

In [None]:
# Run a final evaluation pass with the trainer and print the returned metrics.
# NOTE(review): depends on `trainer` from the training cell still being alive
# in the kernel.
results = trainer.evaluate()
print(results)