In [7]:
!pip install transformers
!pip install datasets
from transformers import AutoModelForSeq2SeqLM, T5ForConditionalGeneration,AutoTokenizer
from datasets import load_dataset, load_metric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

In [10]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
from datasets import load_dataset

In [6]:
path_file='/content/drive/MyDrive/generated_job_descriptions_and_linked_resumes.csv'
dataset_generated=load_dataset('csv', data_files=[path_file],split='train')

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-200750bfc16e08c0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-200750bfc16e08c0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


In [7]:
dataset_generated.features

{'Cover Letter': Value(dtype='string', id=None),
 'Job Description': Value(dtype='string', id=None),
 'Linked Resume': Value(dtype='string', id=None)}

In [8]:
dataset_generated = load_dataset('csv', data_files=path_file, split='train[:80%]')



In [9]:
dataset_generated

Dataset({
    features: ['Cover Letter', 'Job Description', 'Linked Resume'],
    num_rows: 273
})

In [10]:
dev_set = load_dataset('csv', data_files=path_file, split='train[81%:]')



In [11]:
dataset_generated

Dataset({
    features: ['Cover Letter', 'Job Description', 'Linked Resume'],
    num_rows: 273
})

In [12]:
def add_prefix(data):
    data['Job description and Linked Resume'] = "give me a cover letter based on the a job description and a resume. Job description:"+data['Job Description'] +"Resume:"+ data['Linked Resume']
    return data 

In [13]:
dataset_generated=dataset_generated.map(add_prefix, remove_columns=["Job Description",'Linked Resume'])

Map:   0%|          | 0/273 [00:00<?, ? examples/s]

In [14]:
dev_set=dev_set.map(add_prefix, remove_columns=["Job Description",'Linked Resume'])

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [15]:
# tokenized_inputs[0]

In [16]:
from datasets import concatenate_datasets


tokenized_inputs = dataset_generated.map(lambda x: tokenizer(x["Job description and Linked Resume"], truncation=True), batched=True
                                         , remove_columns=['Cover Letter', 'Job description and Linked Resume'])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = dataset_generated.map(lambda x: tokenizer(x["Cover Letter"], truncation=True), batched=True
                                          , remove_columns=['Cover Letter', 'Job description and Linked Resume'])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/273 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/273 [00:00<?, ? examples/s]

Max target length: 512


In [17]:
#for the dev set

tokenized_inputs_dev = dev_set.map(lambda x: tokenizer(x["Job description and Linked Resume"], truncation=True), batched=True
                                         , remove_columns=['Cover Letter', 'Job description and Linked Resume'])
max_source_length_dev = max([len(x) for x in tokenized_inputs_dev["input_ids"]])
print(f"Max source length: {max_source_length_dev}")

# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets_dev = dev_set.map(lambda x: tokenizer(x["Cover Letter"], truncation=True), batched=True
                                          , remove_columns=['Cover Letter', 'Job description and Linked Resume'])
max_target_length_dev = max([len(x) for x in tokenized_targets_dev["input_ids"]])
print(f"Max target length: {max_target_length_dev}")

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Max target length: 480


In [18]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = [ item for item in sample["Job description and Linked Resume"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["Cover Letter"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_dataset = dataset_generated.map(preprocess_function, batched=True, remove_columns=['Cover Letter', 'Job description and Linked Resume'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset.features)}")

Map:   0%|          | 0/273 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [19]:
dataset_generated

Dataset({
    features: ['Cover Letter', 'Job description and Linked Resume'],
    num_rows: 273
})

In [20]:
def preprocess_function_dev(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = [ item for item in sample["Job description and Linked Resume"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length_dev, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["Cover Letter"], max_length=max_target_length_dev, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_dataset_dev = dev_set.map(preprocess_function, batched=True, remove_columns=['Cover Letter', 'Job description and Linked Resume'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset_dev.features)}")

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [21]:
dev_set

Dataset({
    features: ['Cover Letter', 'Job description and Linked Resume'],
    num_rows: 65
})

In [22]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [23]:
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=d55b9ab3dce10d2862d7a802c8e2452009de756da7d83a0686052f5c5f96e47d
  Stored in directory: /root/.cache/pip/wheels/9b/3d/39/09558097d3119ca0a4d462df68f22c6f3c1b345ac63a09b86e
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [24]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [25]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [26]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [27]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
model_id="google/flan-t5-large"
# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-covergenie"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,gradient_accumulation_steps=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    # num_train_epochs=5,
    optim='adafactor',
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=25,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=50,
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    max_steps=50,
    eval_steps =25,
    metric_for_best_model ='eval_loss',
    greater_is_better =False,
    # hub_model_id=repository_id,
    # hub_token=HfFolder.get_token(),
)


In [28]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_dev,
    compute_metrics=compute_metrics,
)

In [29]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
25,1.8458,1.47207,4.9918,4.1267,4.9283,4.9373,19.0
50,1.6936,1.450809,5.0402,4.1807,4.9712,4.9857,19.0


TrainOutput(global_step=50, training_loss=1.7696937561035155, metrics={'train_runtime': 3898.8001, 'train_samples_per_second': 0.821, 'train_steps_per_second': 0.013, 'total_flos': 7197790310498304.0, 'train_loss': 1.7696937561035155, 'epoch': 11.43})

In [30]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pwd

/content


In [31]:
trainer.save_model("/content/drive/MyDrive/my_model") 

In [11]:
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/my_model")


In [12]:
trainer

NameError: ignored

In [13]:
loaded_model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [14]:
JD='The medical records clerk will be responsible for organizing and maintaining the medical records of the patients at the hospital. The clerk will work with the nursing staff to ensure that the records are accurate and up to date. The clerk will also work with the doctors and other medical staff to ensure that the records are complete and accurate. The clerk will also be responsible for keeping the records confidential and keeping them safe from unauthorized access.'

In [15]:
resume_text= 'Amy Chan Medical Records Clerk -3 years of experience working in medical records departments in hospitals -Able to work well with other staff members and doctors to ensure accuracy of records -Confidential and able to keep records safe from unauthorized access -Nursing degree from St. Helena University'

In [16]:
final_text="give me a cover letter based on the a job description and a resume. Job description:"+JD +" Resume:"+ resume_text

In [22]:
max_source_length=512
max_target_length=512
import nltk

In [18]:
final_text

'give me a cover letter based on the a job desciption and a resume. Job desciption:The medical records clerk will be responsible for organizing and maintaining the medical records of the patients at the hospital. The clerk will work with the nursing staff to ensure that the records are accurate and up to date. The clerk will also work with the doctors and other medical staff to ensure that the records are complete and accurate. The clerk will also be responsible for keeping the records confidential and keeping them safe from unauthorized access. Resume:Amy Chan Medical Records Clerk -3 years of experience working in medical records departments in hospitals -Able to work well with other staff members and doctors to ensure accuracy of records -Confidential and able to keep records safe from unauthorized access -Nursing degree from St. Helena University'

In [23]:
inputs = tokenizer(final_text, max_length=max_source_length, truncation=True, return_tensors="pt")
output = loaded_model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=max_target_length)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
print(predicted_title)

Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 370-5656 Amy Chan (212) 3

In [24]:
from transformers import GenerationConfig

In [25]:
generation_config = GenerationConfig.from_pretrained("google/flan-t5-large",temperature=2.0)

In [26]:
# generation_config = GenerationConfig.from_pretrained("./my_model")

In [27]:
inputs = tokenizer(final_text, max_length=max_source_length, truncation=True, return_tensors="pt")
output = loaded_model.generate(**inputs, num_beams=3, do_sample=True, min_length=1000,
                               max_length=10000,generation_config=generation_config,num_return_sequences=3)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

In [28]:
print(predicted_title)

Amy Chan (212) 370-7910 adamchan@email.com 28-Aug-19 Dear Hiring Manager, As an individual with a desire to work at a hospital where the health of the patients is top priority, I am eager to begin working at the hospital that will best serve your patients.


In [29]:
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
print(predicted_title)

Amy Chan (212) 370-7910 adamchan@email.com 28-Aug-19 Dear Hiring Manager, As an individual with a desire to work at a hospital where the health of the patients is top priority, I am eager to begin working at the hospital that will best serve your patients.


In [31]:
 nltk.sent_tokenize(tokenizer.batch_decode(output, skip_special_tokens=True)[0])

['Amy Chan (212) 370-7910 adamchan@email.com 28-Aug-19 Dear Hiring Manager, As an individual with a desire to work at a hospital where the health of the patients is top priority, I am eager to begin working at the hospital that will best serve your patients.',
 "I am excited to be employed at Saint Mary's Health System, and I look forward to fulfilling this need for a medical records clerk.",
 'I work in medical records departments in hospitals and have worked in other departments.',
 'I am capable of working with both doctors and other staff members to ensure that the medical records are up to date and that they are completely complete.',
 'I have an interest in helping to keep our patients as safe and secure as I can.',
 'I have a desire to continue my career at the hospital.',
 'I have spent five years in medical records departments in several hospitals, so I will bring some of my knowledge and skill that I gained from that time.',
 'I enjoy working with the nursing staff to ensure 

In [45]:
new_list=[]
k=0
for i in nltk.sent_tokenize(tokenizer.batch_decode(output, skip_special_tokens=True)[0]):
    new_list.append(i)
    print(i)
    k=k+1
    if ( 'Amy Chan' in i and  k>2):
        
        break

Amy Chan (212) 370-7910 adamchan@email.com 28-Aug-19 Dear Hiring Manager, As an individual with a desire to work at a hospital where the health of the patients is top priority, I am eager to begin working at the hospital that will best serve your patients.
I am excited to be employed at Saint Mary's Health System, and I look forward to fulfilling this need for a medical records clerk.
I work in medical records departments in hospitals and have worked in other departments.
I am capable of working with both doctors and other staff members to ensure that the medical records are up to date and that they are completely complete.
I have an interest in helping to keep our patients as safe and secure as I can.
I have a desire to continue my career at the hospital.
I have spent five years in medical records departments in several hospitals, so I will bring some of my knowledge and skill that I gained from that time.
I enjoy working with the nursing staff to ensure that the records of patients a

In [41]:
new_list

['Amy Chan (212) 370-7910 adamchan@email.com 28-Aug-19 Dear Hiring Manager, As an individual with a desire to work at a hospital where the health of the patients is top priority, I am eager to begin working at the hospital that will best serve your patients.']