In [50]:
import sys
!{sys.executable} -m pip install -q transformers nltk evaluate accelerate rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [1]:
import pandas as pd
# import string
import numpy as np
import transformers
import torch
from datasets import load_dataset

In [15]:
transformers.logging.set_verbosity_info()

## Clearing of text

In [42]:
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [43]:
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")

In [18]:
import csv
data = pd.read_csv("train.csv", nrows = 25000)

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           25000 non-null  object
 1   Description  25000 non-null  object
 2   Summary      25000 non-null  object
dtypes: object(3)
memory usage: 586.1+ KB


In [20]:
data.head()

Unnamed: 0,ID,Description,Summary
0,HONEUSHRD71328EXTERNALCZCZ,"Join a team recognized for leadership, innovat...","Achieves successful on-site installation, serv..."
1,HILTGLOBALHOT08QSYEXTERNALZHAPAC,为客人提供优质的服务，介绍水疗服务及水疗产品的知识。确保预订系统运作正常。 . 作为水疗中心...,希尔顿在全球 100 多个国家和地区拥有数以千计的酒店，提供无数令人愉悦的机会。 从敞开的大...
2,HIINGLOBALHOT08MXLEXTERNALENGLOBAL,In addition to performance of the essential fu...,The individual must possess the following know...
3,ACCOGLOBAL22022956ENGLOBAL,<html>.Overnight 5th Class Power Engineer. As ...,"As a certified 5th Class Power Engineer, you w..."
4,FIGLUSJR0225226EXTERNAL,<html>.Performs tasks to ensure compliance wit...,Performs tasks to ensure compliance with work ...


## Clean Data

Use map with datasets to have parallel processing and having faster data cleaning

In [21]:
def remove_htmltags(df):
  df['Description'] = df['Description'].str.replace(r'<[^<>]*>', '', regex=True)
  df['Summary'] = df['Summary'].str.replace(r'<[^<>]*>', '', regex=True)
  return df

data = remove_htmltags(data)

In [39]:
def remove_non_english_text(sent):
    return " ".join(w for w in nltk.wordpunct_tokenize(sent) if w.lower() in words)

In [22]:
def filter_non_english(df):
  df["Description"] = df['Description'].apply(remove_non_english_text)
  df["Summary"] = df['Summary'].apply(remove_non_english_text)
  return df

data = filter_non_english(data)


In [40]:
punctuations = "!"#$%&'()*+, -/:;<=>?@[\]^_`{|}~"

def remove_punctuations(text):
    return text.translate(str.maketrans("","", punctuations))

In [23]:
data["Description"] = data["Description"].apply(remove_punctuations)
data["Summary"] = data["Summary"].apply(remove_punctuations)

In [24]:
data = data.replace(r'^\s*$', np.nan, regex=True)

In [None]:
def clean_text_pipeline(df):
  df['Description'] = df['Description'].str.replace(r'<[^<>]*>', '', regex=True)
  df['Summary'] = df['Summary'].str.replace(r'<[^<>]*>', '', regex=True)

  df["Description"] = df['Description'].apply(remove_non_english_text)
  df["Summary"] = df['Summary'].apply(remove_non_english_text)

  df["Description"] = df["Description"].apply(remove_punctuations)
  df["Summary"] = df["Summary"].apply(remove_punctuations)

  df = df.replace(r'^\s*$', np.nan, regex=True)
  return df


In [25]:
data.isna().any()

ID             False
Description     True
Summary         True
dtype: bool

In [26]:
data.dropna(inplace=True)

In [27]:
data.isna().any()

ID             False
Description    False
Summary        False
dtype: bool

In [28]:
data.head()

Unnamed: 0,ID,Description,Summary
0,HONEUSHRD71328EXTERNALCZCZ,Join a team for leadership innovation and dive...,successful on site installation and repair of ...
2,HIINGLOBALHOT08MXLEXTERNALENGLOBAL,In addition to performance of the essential th...,The individual must possess the following know...
3,ACCOGLOBAL22022956ENGLOBAL,Overnight Class Power Engineer As a certified ...,As a certified Class Power Engineer you will p...
4,FIGLUSJR0225226EXTERNAL,to ensure compliance with work group and clien...,to ensure compliance with work group and clien...
5,COGRAU48055EXTERNALENAU,You find us working across all business at Liq...,As a Space Manager for Meat Bakery and you wil...


In [30]:
data[['Description', 'Summary']].to_csv("cleaned_train.csv", index=False)

In [2]:
data = load_dataset("csv", data_files="cleaned_train.csv", split="train", nrows=1000)

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
data

Dataset({
    features: ['Description', 'Summary'],
    num_rows: 1000
})

In [4]:
dataset = data.train_test_split(test_size=0.2, shuffle=True)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

In [21]:
# dataset = train_test_split(data, test_size=0.2, shuffle=True)
# train_dataset = dataset[0].reset_index(drop=True)
# val_dataset = dataset[1].reset_index(drop=True)


In [5]:
train_dataset.shape, val_dataset.shape

((800, 2), (200, 2))

## Training using huggingface

In [6]:
MODEL = 't5-small'
BATCH_SIZE = 16
NUM_PROCS = 4
EPOCHS = 10
OUT_DIR = 'results_t5base'
MAX_LENGTH = 512

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments , Trainer
# T5TokenizerFast,

In [9]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)
model = T5ForConditionalGeneration.from_pretrained(MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Ensure padding token is set
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token_id is None:
    raise ValueError("Padding token is not set.")

In [11]:
def preprocess_function(examples):
    inputs = [f"summarize: {article}" for article in examples['Description']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Set up the tokenizer for targets
    targets = [summary for summary in examples['Summary']]
    labels = tokenizer(
        text_target = targets,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
# Apply the function to the whole dataset
""" method allows for parallel processing. Apply custom function
    concurrently on multiple elements of the dataset, making it more efficient.
"""
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS,
    
)

tokenized_valid = val_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

In [13]:
tokenized_train.shape, tokenized_valid.shape

((800, 5), (200, 5))

In [14]:
tokenized_train

Dataset({
    features: ['Description', 'Summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})

In [16]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,

    num_train_epochs=EPOCHS,
    max_steps = -1, # if set will overwrite epochs
    dataloader_num_workers=2,

    ###### Memory optimization
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    gradient_checkpointing = False,
    bf16=False, # Needs Ampere
    fp16=False,
    # deepspeed=False,
    # fsdp=False,
    dataloader_pin_memory = True,

    ###### Better training
    lr_scheduler_type="linear", # check SchedulerType
    warmup_steps = 200,
    weight_decay=0.01,
    learning_rate=0.001,
    eval_steps=200,
    evaluation_strategy='epoch',

    # logging_dir=OUT_DIR,
    save_strategy='epoch',
    logging_steps=200,
    save_total_limit=1,
    # report_to='tensorboard',
    save_safetensors=True, # To save state_dicts instead of whole,
    # save_only_model= False,

    load_best_model_at_end = True,
    metric_for_best_model= "loss",
    greater_is_better=False,
    resume_from_checkpoint = False,
    use_cpu = False
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    # data_collator=lambda data: custom_collate_fn(data, tokenizer=tokenizer)
    # tokenizer=,
    # optimizers = (torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR),
    # callbacks=,
    # preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    # compute_metrics=compute_metrics
)

In [18]:
history = trainer.train()

  self.pid = os.fork()


Epoch,Training Loss,Validation Loss
1,No log,0.070008
2,No log,0.062498
3,No log,0.060557
4,0.097700,0.059633
5,0.097700,0.05787
6,0.097700,0.057828
7,0.097700,0.061982
8,0.031900,0.062516
9,0.031900,0.063743
10,0.031900,0.064537


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [19]:
import evaluate

In [33]:
test_df= pd.read_csv("test.csv", nrows=50)

In [34]:
test_df.head()

Unnamed: 0,ID,Description
0,KUNAGLOBALREQ91464EXTERNALHUHU,"With a keen interest in the development field,..."
1,BOMOGLOBALR220007643EXTERNALFRCA,<html>.Delivers exceptional service to BMO cus...
2,KSNEUS85616,The preferred candidate will be able to provid...
3,PRHEUSR1050648ENUSEXTERNAL,<br>The Care Management Representative functio...
4,KSNEUS34436,<p>As a Warehouse Associate you will be operat...


In [44]:
def clean_text_pipeline(df):
  df['Description'] = df['Description'].str.replace(r'<[^<>]*>', '', regex=True)
  df["Description"] = df['Description'].apply(remove_non_english_text)
  df["Description"] = df["Description"].apply(remove_punctuations)
  df["Description"] = df["Description"].replace(r'^\s*$', np.nan, regex=True)
  return df

test_df = clean_text_pipeline(test_df)

In [45]:
model_path = f"{OUT_DIR}/checkpoint-300"  # the path where you saved your model
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained("t5-small")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [46]:
def summarize_text(text):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='max_length'
    )

    # Generate the summary
    summary_ids = model.generate(
        inputs,
        max_length=50,
        num_beams=5,
        early_stopping=True
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [47]:
predictions = test_df["Description"].apply(summarize_text)

In [48]:
predictions

0     This development role will incorporate the ini...
1     Works collaboratively within the branch and wi...
2     The preferred candidate will be able to provid...
3     The Care Management Representative as a core t...
4     As a Warehouse Associate you will be operate d...
5     As a project leader will use their strong lead...
6     Responsible for providing primary support and ...
7                                        en da El ya da
8     As a Data Scientist at you actively shape the ...
9     ESSENTIAL OF THE ROLE and equipment safely and...
10    The estimate the time frame to be one month As...
11    The Tech will be responsible for Quality Assur...
12    As a Cook you would be responsible for and qua...
13    Join a team that and highly complex within You...
14    The Buyer customer support by working as a tea...
15    On site in Top of Excel report creation or sch...
16    This is an entry level position that will be u...
17    You will work closely with specialist Prod

In [54]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    # predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    # decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=eval_pred,
        references=eval_pred,
        use_stemmer=True,
        rouge_types=[
            'rougeL'
        ]
    )

    return result

    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # result["gen_len"] = np.mean(prediction_lens)

    # return {k: round(v, 4) for k, v in result.items()}

In [53]:
rouge.compute(predictions=predictions[:5], references=predictions[:5], use_stemmer=True)

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}

In [56]:
pred = [compute_metrics(x)["rougeL"] for x in predictions]

In [57]:
pred

[0.8823529411764706,
 0.8633093525179856,
 0.8663101604278075,
 0.8418367346938775,
 0.8135593220338984,
 0.8403755868544601,
 0.8613861386138614,
 0.7142857142857143,
 0.8278145695364238,
 0.8506493506493507,
 0.8571428571428571,
 0.8505747126436781,
 0.8355263157894737,
 0.821656050955414,
 0.8431372549019608,
 0.8347107438016529,
 0.8324607329842932,
 0.8536585365853658,
 0.8723404255319149,
 0.8666666666666667,
 0.8125,
 0.8461538461538461,
 0.8611111111111112,
 0.8446601941747572,
 0.8634361233480177,
 0.7802197802197802,
 0.8571428571428571,
 0.8333333333333334,
 0.8543689320388349,
 0.8620689655172413,
 0.8202247191011236,
 0.8525345622119815,
 0.84,
 0.8423645320197044,
 0.84,
 0.8585858585858586,
 0.8395061728395061,
 0.8591549295774648,
 0.8817204301075269,
 0.8402061855670103,
 0.8095238095238095,
 0.8636363636363636,
 0.8472222222222222,
 0.8489208633093526,
 0.8721804511278195,
 0.8468468468468469,
 0.8700564971751412,
 0.723404255319149,
 0.8421052631578947,
 0.8390243902