In [1]:
!pip install sacrebleu
!pip install rouge



In [2]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BartTokenizer, TFBartForConditionalGeneration
from sacrebleu import corpus_bleu
from rouge import Rouge

In [3]:
# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [4]:
# Preprocess data (fill missing values)
train_df.fillna('', inplace=True)
test_df.fillna('', inplace=True)

In [5]:
# Preparing input_text and target_text
train_df['input_text'] = train_df[['Job Title', 'Preferred Qualifications', 'Hiring Company', 'Applicant Name', 'Past Working Experience', 'Current Working Experience', 'Skillsets', 'Qualifications']].agg(' '.join, axis=1)
train_df['target_text'] = train_df['Cover Letter']

In [6]:
# Replace different types of new line characters with a space
def clean_new_lines(text):
    return text.replace('\n\n', ' ').replace('\r\n', ' ') if pd.notnull(text) else text

train_df['target_text'] = train_df['Cover Letter'].apply(clean_new_lines)

# Trimming white spaces
text_columns = ['Job Title', 'Preferred Qualifications', 'Hiring Company', 'Applicant Name', 'Past Working Experience', 'Current Working Experience', 'Skillsets', 'Qualifications', 'input_text', 'target_text']
for col in text_columns:
    train_df[col] = train_df[col].apply(lambda x: x.strip() if isinstance(x, str) else x)


In [7]:
train_df.head()

Unnamed: 0,Job Title,Preferred Qualifications,Hiring Company,Applicant Name,Past Working Experience,Current Working Experience,Skillsets,Qualifications,Cover Letter,input_text,target_text
0,Senior Java Developer,5+ years of experience in Java Development,Google,John Doe,Java Developer at XYZ for 3 years,Senior Java Developer at ABC for 2 years,"Java, Spring Boot, Hibernate, SQL",BSc in Computer Science,I am writing to express my interest in the Sen...,Senior Java Developer 5+ years of experience i...,I am writing to express my interest in the Sen...
1,Data Scientist,5 years of experience in data modeling,XYZ Analytics Solutions,John Smith,Data Analyst at ABC Corporation,Data Scientist at XYZ Technologies,"Data modeling, data analysis, programming (Pyt...","Bachelor's degree in Computer Science, Master'...","Dear Hiring Manager,\n\nI am writing to expres...",Data Scientist 5 years of experience in data m...,"Dear Hiring Manager, I am writing to express m..."
2,Data Scientist,Experience with Python and proficiency in at l...,XYZ Analytics,John Smith,Data Analyst at ABC Corporation,Data Scientist at XYZ Solutions,"Python, R, SQL, Machine Learning, Statistical ...","Bachelor's degree in Computer Science, Master'...","Dear Hiring Manager,\n\nI am writing to expres...",Data Scientist Experience with Python and prof...,"Dear Hiring Manager, I am writing to express m..."
3,Senior Data Scientist,Minimum of 3 years validated experience\nOutst...,XYZ Tech Solutions,John Smith,Data Analyst at ABC Company,Senior Data Scientist at DEF Corporation,"Python, machine learning libraries, deep learn...","Bachelor's degree in Computer Science, Master'...","Dear Hiring Manager,\n\nI am writing to expres...",Senior Data Scientist Minimum of 3 years valid...,"Dear Hiring Manager, I am writing to express m..."
4,data scientist,35 years+ professional experience in data scie...,xyz tech solutions,john smith,data analyst at abc company,data scientist at xyz tech solutions,"Python, R, Machine Learning, NLP, Data Visuali...",35 years+ professional experience in data scie...,"Dear Hiring Manager,\n\nI am writing to expres...",data scientist 35 years+ professional experie...,"Dear Hiring Manager, I am writing to express m..."


In [8]:
train_df.shape

(813, 11)

In [9]:
# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(train_df['input_text'], train_df['target_text'], test_size=0.2)

In [10]:
# Load tokenizer and model
model_name = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = TFBartForConditionalGeneration.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [11]:
# Tokenize data
def tokenize_data(texts, labels, tokenizer, max_length=512):
    if not isinstance(texts, list) or not all(isinstance(text, str) for text in texts):
        raise ValueError("Texts must be a list of strings.")
    if not isinstance(labels, list) or not all(isinstance(label, str) for label in labels):
        raise ValueError("Labels must be a list of strings.")

    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    labels_encoded = tokenizer(labels, truncation=True, padding=True, max_length=max_length).input_ids

    labels_encoded = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels_encoded]

    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels_encoded
    }

try:
    train_data = tokenize_data(train_texts.tolist(), train_labels.tolist(), tokenizer)
    val_data = tokenize_data(val_texts.tolist(), val_labels.tolist(), tokenizer)
except ValueError as e:
    print(e)


In [12]:
# Convert data to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices(({
    "input_ids": tf.cast(train_data['input_ids'], tf.int32),
    "attention_mask": tf.cast(train_data['attention_mask'], tf.int32),
    "labels": tf.cast(train_data['labels'], tf.int32)
}))

val_dataset = tf.data.Dataset.from_tensor_slices(({
    "input_ids": tf.cast(val_data['input_ids'], tf.int32),
    "attention_mask": tf.cast(val_data['attention_mask'], tf.int32),
    "labels": tf.cast(val_data['labels'], tf.int32)
}))

In [13]:
print("Training set shapes:")
print(train_dataset.element_spec)
print("Validation set shapes:")
print(val_dataset.element_spec)

Training set shapes:
{'input_ids': TensorSpec(shape=(294,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(294,), dtype=tf.int32, name=None), 'labels': TensorSpec(shape=(352,), dtype=tf.int32, name=None)}
Validation set shapes:
{'input_ids': TensorSpec(shape=(259,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(259,), dtype=tf.int32, name=None), 'labels': TensorSpec(shape=(295,), dtype=tf.int32, name=None)}


In [27]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed transformers-4.39.3


In [36]:
#Train model on the data
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(optimizer=optimizer, metrics=['accuracy'])

batch_size = 2

model.fit(train_dataset.shuffle(1000).batch(batch_size), epochs=100, validation_data=val_dataset.batch(batch_size))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tf_keras.src.callbacks.History at 0x7ec7bce925f0>

In [37]:
# Evaluate the model on the validation dataset
val_loss = model.evaluate(val_dataset.batch(batch_size))
print(f"Validation loss: {val_loss}")

Validation loss: [1.8424181938171387, 0.4376208782196045]


In [38]:
# Test data
test_df['input_text'] = test_df[['Job Title', 'Preferred Qualifications', 'Hiring Company', 'Applicant Name', 'Past Working Experience', 'Current Working Experience', 'Skillsets', 'Qualifications']].agg(' '.join, axis=1)
test_df['target_text'] = test_df['Cover Letter']


In [39]:
test_texts = test_df['input_text'].tolist()
test_labels = test_df['target_text'].tolist()


In [40]:
# Tokenize test data
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)
test_labels_encoded = tokenizer(test_labels, truncation=True, padding=True, max_length=512).input_ids

In [41]:
test_dataset = tf.data.Dataset.from_tensor_slices(({
    "input_ids": test_encodings['input_ids'],
    "attention_mask": test_encodings['attention_mask'],
    "labels": test_labels_encoded
}))

In [43]:
pip install nltk rouge




In [44]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import numpy as np


In [45]:
def calculate_bleu_score(references, candidates):
    bleu_scores = [sentence_bleu([ref.split()], cand.split()) for ref, cand in zip(references, candidates)]
    return np.mean(bleu_scores)

def calculate_rouge_score(references, candidates):
    rouge = Rouge()
    scores = rouge.get_scores(candidates, references, avg=True)
    return scores


In [51]:
def evaluate_model(model, tokenizer, dataset, max_length=40):
    references = []
    candidates = []
    for batch in dataset:
        input_ids = batch['input_ids']  # Key for input IDs
        labels = batch['labels']  # Key for labels

        # Generate predictions
        outputs = model.generate(input_ids, max_length=max_length, num_beams=5)

        # Decode text
        decoded_preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in outputs]
        decoded_labels = [tokenizer.decode(l, skip_special_tokens=True, clean_up_tokenization_spaces=False) for l in labels.numpy()]

        references.extend(decoded_labels)
        candidates.extend(decoded_preds)

    bleu_score = calculate_bleu_score(references, candidates)
    rouge_score = calculate_rouge_score(references, candidates)
    return {"bleu": bleu_score, "rouge": rouge_score}


In [52]:
# Example of evaluating after training
eval_scores = evaluate_model(model, tokenizer, test_dataset.batch(32))
print(f"Evaluation scores: {eval_scores}")

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Evaluation scores: {'bleu': 0.13723502622214614, 'rouge': {'rouge-1': {'r': 0.36013365740753445, 'p': 0.9073787876793122, 'f': 0.4946354892943846}, 'rouge-2': {'r': 0.2685385171386128, 'p': 0.8207615129904241, 'f': 0.383814783249757}, 'rouge-l': {'r': 0.3563118491701165, 'p': 0.8980918093808405, 'f': 0.48951644880674217}}}


In [42]:
# Evaluate the model
evaluation_results = model.evaluate(test_dataset.batch(8))
print("Evaluation results:", evaluation_results)


Evaluation results: [2.6679768562316895, 0.3943927586078644]


In [25]:
def generate_cover_letter(model, tokenizer, max_length=512):

    job_title = input("Enter the Job Title: ")
    preferred_qualifications = input("Enter Preferred Qualifications: ")
    hiring_company = input("Enter the Hiring Company: ")
    applicant_name = input("Enter the Applicant's Name: ")
    past_experience = input("Enter Past Working Experience: ")
    current_experience = input("Enter Current Working Experience: ")
    skillsets = input("Enter Skillsets: ")
    qualifications = input("Enter Qualifications: ")

    input_data = f"Generate a cover letter for the following applicant. Job Title: {job_title}. The company is {hiring_company}, specializing in {preferred_qualifications}. The applicant, {applicant_name}, has past experience in {past_experience} and is currently working in {current_experience}. They possess skills such as {skillsets} and have qualifications including {qualifications}. Use this information to create a personalized cover letter."

    inputs = tokenizer(input_data, return_tensors="tf", truncation=True, padding="max_length", max_length=max_length)

    output_ids = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )


    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return generated_text

generated_cover_letter = generate_cover_letter(model, tokenizer)
print(generated_cover_letter)


Enter the Job Title: Software Engineer
Enter Preferred Qualifications: 2 years of Java software development 
Enter the Hiring Company: Facebook
Enter the Applicant's Name: Arul Bavan
Enter Past Working Experience: 1 year of programming experience
Enter Current Working Experience: 1 year of software development experience in java
Enter Skillsets: Java, Python
Enter Qualifications: BSc (Honors) in Information Technology from SLIIT
I am writing to express my interest in the Generate a cover letter for the following applicant. With my experience in Java development and my expertise in Python and R, I believe I would be a valuable asset to your team. In my current role as a Software Engineer at Facebook, where I have been responsible for leading a team of developers and working on a range of projects that have allowed me to hone my skills in data analysis and software development. I am able to effectively communicate complex technical concepts to non-technical stakeholders and have a proven

In [26]:
def generate_cover_letter(model, tokenizer, max_length=512):

    job_title = input("Enter the Job Title: ")
    preferred_qualifications = input("Enter Preferred Qualifications: ")
    hiring_company = input("Enter the Hiring Company: ")
    applicant_name = input("Enter the Applicant's Name: ")
    past_experience = input("Enter Past Working Experience: ")
    current_experience = input("Enter Current Working Experience: ")
    skillsets = input("Enter Skillsets: ")
    qualifications = input("Enter Qualifications: ")

    input_data = f"Generate a cover letter for the following applicant. Job Title: {job_title}. The company is {hiring_company}, specializing in {preferred_qualifications}. The applicant, {applicant_name}, has past experience in {past_experience} and is currently working in {current_experience}. They possess skills such as {skillsets} and have qualifications including {qualifications}. Use this information to create a personalized cover letter."

    inputs = tokenizer(input_data, return_tensors="tf", truncation=True, padding="max_length", max_length=max_length)

    output_ids = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return generated_text

generated_cover_letter = generate_cover_letter(model, tokenizer)
print(generated_cover_letter)

Enter the Job Title: Quality Assurance Engineer
Enter Preferred Qualifications: 4 years of QA Experience 
Enter the Hiring Company: RBC Bank
Enter the Applicant's Name: Surya Sivakumar
Enter Past Working Experience: 2 years of experience in manual testing at Virtusa
Enter Current Working Experience: 2 years of automation testing experience at WSO2
Enter Skillsets: Java, Selenium, Python, Unit Testing
Enter Qualifications: BSc Honors in Information Technology
I am writing to express my interest in the Generate a cover letter for the following applicant. My experience at RBC Bank has given me a deep understanding of QA Experience and its practical application. I am confident that my skills in Java, Selenium, Python, and Unit Testing will enable me to effectively analyze and interpret complex data sets. In my current role as a Quality Assurance Engineer at WSO2, I have successfully performed quality assurance processes for various types of data, including anomaly detection and NLP tasks. 

In [35]:
# Save the model in Colab
model_save_path = "/content/bart_model2"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Download the saved model to local machine
import os
from google.colab import files

!zip -r /content/bart_model2.zip {model_save_path}

files.download('/content/bart_model2.zip')


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


  adding: content/bart_model2/ (stored 0%)
  adding: content/bart_model2/generation_config.json (deflated 47%)
  adding: content/bart_model2/config.json (deflated 64%)
  adding: content/bart_model2/tf_model.h5 (deflated 8%)
  adding: content/bart_model2/vocab.json (deflated 68%)
  adding: content/bart_model2/special_tokens_map.json (deflated 85%)
  adding: content/bart_model2/tokenizer_config.json (deflated 76%)
  adding: content/bart_model2/merges.txt (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>