# CV Ranking Model

## Preprocessing Data

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import os
import fitz
import re
class TextExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.directory = None
        self.pdf_path = None

    def fit(self, X, y=None):
        # X is expected to be a dictionary with 'directory' or 'pdf_path'
        if 'directory' in X:
            self.directory = X['directory']
        elif 'pdf_path' in X:
            self.pdf_path = X['pdf_path']
        else:
            raise ValueError("Input X must contain either 'directory' or 'pdf_path'.")
        return self

    def transform(self, X=None):
        if self.pdf_path:
            # Return a dictionary with a single entry for the single PDF
            return {self.pdf_path: self.extract_text_from_pdf(self.pdf_path)}
        elif self.directory:
            # Return a dictionary of texts extracted from all PDFs in the directory
            return self.load_documents(self.directory)
        else:
            raise ValueError("Either directory or pdf_path must be provided.")
    
    def extract_text_from_pdf(self, pdf_path):
        pdf_document = fitz.open(pdf_path)
        extracted_text = ''
        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)
            text = page.get_text()
            extracted_text += text + "\n\n"
        return extracted_text

    def load_documents(self, directory):
        documents = {}
        for filename in os.listdir(directory):
            file_path = os.path.join(directory, filename)
            documents[filename] = self.extract_text_from_pdf(file_path)
        return documents

In [13]:
class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X=None, y=None):
        return self

    def transform(self, document_dict):
        if not isinstance(document_dict, dict):
            raise ValueError("Input must be a dictionary where keys are filenames and values are texts.")
        return {key: self.clean_text(text) for key, text in document_dict.items()}
    
    def clean_text(self, text):
        text = re.sub(r'\n+', ' ', text)
        text = re.sub(r'\s+([.,!?;:])', r'\1', text)
        text = re.sub(r'\s*[\u2022\u25AA\u25AB]\s*', ' ', text)
        text = re.sub(r'\s*-\s*', ' - ', text)
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.lower() 
        return text.strip()

In [14]:
# Initialize the pipeline with the custom transformers
pipeline = Pipeline([
    ('text_extractor', TextExtractor()),
    ('text_cleaner', TextCleaner())
])

# Example of fitting with a directory path
transformed_cvs = pipeline.fit_transform({'directory': "C:/Users/User/LLM/CVs/"})

# Example of fitting with a single PDF path
transformed_job = list(pipeline.fit_transform({'pdf_path': r"C:/Users/User/LLM/Job_Desc/Podcaster Job Ad.pdf"}).values())

In [4]:
transformed_job

['job description: are you a finance enthusiast with a knack for storytelling? do you have a passion for educating and inspiring others when it comes to personal finance, investing, and financial literacy? if so, we have an exciting opportunity for you! pan finance is seeking a talented finance podcaster to join our team. responsibilities: podcast content creation: produce engaging and informative podcast episodes covering various finance topics, from budgeting and investing to financial planning and market trends. scriptwriting: research and develop well - structured scripts for podcast episodes, ensuring content is clear, accurate, and tailored to our target audience. host and co - host: host podcast episodes, interview guests, and co - host discussions to provide valuable insights and perspectives on finance - related subjects. editing and post - production: oversee the editing and post - production process of podcast episodes to ensure high - quality content and a seamless listener

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

# Function to clean text
def clean_text(text):
    # Tokenize and remove punctuation and stopwords
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Example cleaned CVs and job description
cleaned_cvs = {name: clean_text(text) for name, text in transformed_cvs.items()}
cleaned_job = clean_text(transformed_job[0])

# Prepare texts for vectorization
texts = list(cleaned_cvs.values()) + [cleaned_job]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

# Calculate similarity scores
cv_tfidf_matrix = tfidf_matrix[:-1]
job_tfidf_vector = tfidf_matrix[-1]
similarity_scores = cosine_similarity(cv_tfidf_matrix, job_tfidf_vector.reshape(1, -1))

# Create a dictionary of CV names and their relevance scores
relevance_scores = {cv_name: similarity_scores[i][0] for i, cv_name in enumerate(cleaned_cvs.keys())}

# Sort CVs based on relevance scores
sorted_cvs = sorted(relevance_scores.items(), key=lambda item: item[1], reverse=True)

# Print sorted CVs with their relevance scores
for cv_name, score in sorted_cvs:
    print(f'{cv_name}: {score:.4f}')

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


CV Podcast Host_Producer Damien  Swaby  (1).pdf: 0.1992
Violetta_Nadbitova_CV-1.pdf: 0.1339
Kindra Keener 2024 CV.pdf: 0.0492
CV Martin Kriletich 2024.pdf: 0.0479
Benjamin_Salebaigi.pdf: 0.0243


## Data Preparation for Model fine tuning

In [5]:
import pandas as pd

# Example similarity scores
similarity_scores = {
    'CV Podcast Host_Producer Damien  Swaby  (1).pdf': 0.1992,
    "Violetta_Nadbitova_CV-1.pdf": 0.1339,
    "Kindra Keener 2024 CV.pdf": 0.0492,
    "CV Martin Kriletich 2024.pdf": 0.0479,
    "Benjamin_Salebaigi.pdf": 0.0243
}

# Define a threshold to classify CVs as relevant or not
threshold = 0.1

# Prepare data using processed CVs and job descriptions
data = []
for cv_name in transformed_cvs.keys():
    cv_text = transformed_cvs[cv_name]  # Get CV text from transformed_cvs
    job_description = transformed_job[0]  # Assuming transformed_job is a list with job description
    
    # Get the similarity score for the CV
    score = similarity_scores[cv_name]
    
    # Create label based on the threshold
    label = 1 if score > threshold else 0
    
    data.append({
        'cv_name': cv_name,
        "cv_text": cv_text,
        "job_description": job_description,
        "label": label
    })

# Create DataFrame
df = pd.DataFrame(data)

# Output the DataFrame
df

Unnamed: 0,cv_name,cv_text,job_description,label
0,Benjamin_Salebaigi.pdf,benjamin salebaigi 2011 - 70 temperance street...,job description: are you a finance enthusiast ...,0
1,CV Martin Kriletich 2024.pdf,martin kriletich global business manager work ...,job description: are you a finance enthusiast ...,0
2,CV Podcast Host_Producer Damien Swaby (1).pdf,"07984921407 lmmakerswaby@hotmail.com london, n...",job description: are you a finance enthusiast ...,1
3,Kindra Keener 2024 CV.pdf,contact education (917) 288 - 5445 phone kindr...,job description: are you a finance enthusiast ...,0
4,Violetta_Nadbitova_CV-1.pdf,violetta nadbitova 07725003082 | venadbitova@g...,job description: are you a finance enthusiast ...,1


In [6]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Convert DataFrame to Dataset
data = {
    "cv_text": df["cv_text"].tolist(),
    "job_description": df["job_description"].tolist(),
    "label": df["label"].tolist()
}

dataset = Dataset.from_dict(data)

In [7]:
from sklearn.model_selection import train_test_split
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_function(examples):
    return tokenizer(
        examples['cv_text'], examples['job_description'], 
        padding='max_length', truncation=True
    )

# Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True)
data_list = tokenized_dataset.to_pandas()
train_df, eval_df = train_test_split(data_list, test_size=0.4)

# Convert DataFrames back to Dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

## Fine Tuning Language model with the provided dataset

In [14]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=2,
    num_train_epochs=20,
    weight_decay=0.09,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset  # Include eval_dataset
)

# Train the model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.189684
2,No log,0.191802
3,No log,0.193943
4,No log,0.168458
5,No log,0.134089
6,No log,0.11214
7,No log,0.097239
8,No log,0.086043
9,No log,0.077286
10,No log,0.070341


TrainOutput(global_step=40, training_loss=0.11552071571350098, metrics={'train_runtime': 704.7159, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.057, 'total_flos': 15786663321600.0, 'train_loss': 0.11552071571350098, 'epoch': 20.0})

## Model Evaluation

In [66]:
transformed_cvs.keys()

dict_keys(['Benjamin_Salebaigi.pdf', 'CV Martin Kriletich 2024.pdf', 'CV Podcast Host_Producer Damien  Swaby  (1).pdf', 'Kindra Keener 2024 CV.pdf', 'Violetta_Nadbitova_CV-1.pdf'])

In [16]:
# Evaluate the model on the validation dataset
evaluation_metrics = trainer.evaluate()

# Print out the evaluation metrics
print("Evaluation Metrics:")
for key, value in evaluation_metrics.items():
    print(f"{key}: {value:.4f}")

Evaluation Metrics:
eval_loss: 0.0453
eval_runtime: 5.6043
eval_samples_per_second: 0.3570
eval_steps_per_second: 0.1780
epoch: 20.0000


## Model saving

In [20]:
# Saving the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

('./fine-tuned-model\\tokenizer_config.json',
 './fine-tuned-model\\special_tokens_map.json',
 './fine-tuned-model\\vocab.txt',
 './fine-tuned-model\\added_tokens.json')

## Loading the model and Tokenizer

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained('./fine-tuned-model')
tokenizer = AutoTokenizer.from_pretrained('./fine-tuned-model')

## Testing model with the trained Data

In [9]:
input_texts = [f"{cv} [SEP] {transformed_job[0]}" for cv in transformed_cvs]
tokenized_inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

# Get model predictions
with torch.no_grad():
    outputs = model(**tokenized_inputs)
    predictions = torch.softmax(outputs.logits, dim=1)[:, 1]  # Probability of being relevant

# Rank CVs based on predictions
ranked_cvs = sorted(zip(transformed_cvs.keys(), predictions.numpy()), key=lambda x: x[1], reverse=True)

# Output ranked CVs
for i, (cv, score) in enumerate(ranked_cvs):
    print(f"Rank {i+1}: {cv} with score {score:.4f}")

Rank 1: Violetta_Nadbitova_CV-1.pdf with score 0.8981
Rank 2: CV Podcast Host_Producer Damien  Swaby  (1).pdf with score 0.8972
Rank 3: CV Martin Kriletich 2024.pdf with score 0.8954
Rank 4: Benjamin_Salebaigi.pdf with score 0.8922
Rank 5: Kindra Keener 2024 CV.pdf with score 0.8919


## Testing Model with new CVs and Job description

In [5]:
new_cvs = pipeline.fit_transform({'directory': "C:/Users/User/LLM/Test_cv/"})

In [6]:
new_job = list(pipeline.fit_transform({'pdf_path': r"C:/Users/User/LLM/Job_Desc/Job_desc.pdf"}).values())

In [7]:
new_cvs

{'Johanson_Onyegbula_CV.pdf': 'onyegbula, johanson chibuike +2348139378788 | johansononyegbula20@gmail.com | www.linkedin.com/in/johanson - onyegbula - 6484bb76/ education university of lagos (unilag), nigeria 2014 2019 bachelor of science (bsc.) in surveying and geoinformatics first class (honours) 4.85/5.00 b.sc. thesis: quality assessment of 20m spot dem using gps ground control points for lagos state, nigeria. conference presentations university of lagos, nigeria october, 2019 alademomi, a.s, okolie, c.j, ojegbile, b.m., daramola, o.e., onyegbula, j.c. and adepo, r.o. (2019). modelling and analysis of environmental noise levels within the university of lagos main campus. presented at the faculty of engineering international conference, university of lagos, october 15 - 18, 2019. doi: 10.13140/rg.2.2.32447.46240 nwilo, p.c., okolie, c.j., onyegbula, j.c., abolaji, o.e., orji, m.j. and daramola, o.e. (2019). validation of the 20 - metre spot dem using ground control points from lagos

In [8]:
new_job

["job title: junior data engineer location: araromi, okolowo, ilorin, kwara, nigeria company overview: we are a dynamic and innovative company focused on leveraging data to drive business success. we are committed to fostering a collaborative environment where creative minds can thrive and make impactful contributions. join us as we shape the future of data - driven decision - making. job description: we are seeking a highly skilled and motivated junior data engineer to join our team. the ideal candidate will have a strong background in data engineering, data science, and machine learning, with experience in developing and deploying data - driven solutions. the candidate will work closely with cross - functional teams to ensure the smooth and efficient flow of data within the organization. key responsibilities: data integration and management: conduct research and implementation of data collection tools, such as fluentd, to streamline log data collection and integration processes. gene

In [21]:
import torch
input_texts = [f"{cv} [SEP] {new_job[0]}" for cv in new_cvs]
tokenized_inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

# Get model predictions
with torch.no_grad():
    outputs = model(**tokenized_inputs)
    predictions = torch.softmax(outputs.logits, dim=1)[:, 1]  # Probability of being relevant

# Rank CVs based on predictions
ranked_cvs = sorted(zip(new_cvs.keys(), predictions.numpy()), key=lambda x: x[1], reverse=True)

# Output ranked CVs
for i, (cv, score) in enumerate(ranked_cvs):
    print(f"Rank {i+1}: {cv} with score {score:.4f}")

Rank 1: Yussuf-Resume.pdf with score 0.7438
Rank 2: Taofik CV RPI.pdf with score 0.6710
Rank 3: Johanson_Onyegbula_CV.pdf with score 0.6612
Rank 4: MUTALIB Tunde Lawal; CV.pdf with score 0.6456


# Generating Summary for each CV

In [22]:
from transformers import pipeline

# Load the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text(text):
    summary = summarizer(text, max_length=200, min_length=50, do_sample=False)
    return summary[0]['summary_text']



In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

# Load summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text(text):
    try:
        # Check if the text is too long and truncate if necessary
        max_length = 1024  # Max token length for BART
        if len(text.split()) > max_length:
            text = ' '.join(text.split()[:max_length])  # Truncate to max_length tokens

        summary = summarizer(text, max_length=150, min_length=50, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return text  # Return original text if summarization fails

def extract_relevant_paragraphs(cv_text, job_description_text):
    # Tokenize and vectorize the texts
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([cv_text, job_description_text])
    
    # Compute similarity
    similarity_matrix = cosine_similarity(vectors)
    return similarity_matrix[0, 1]

def generate_summary(cv_text, job_description_text):
    # Summarize the CV and Job Description
    cv_summary = summarize_text(cv_text)
    job_summary = summarize_text(job_description_text)
    
    # Compare the summaries
    extract_relevant_paragraphs(cv_summary, job_summary)
    
    return cv_summary

for cv_name, cv_text in transformed_cvs.items():
    cv_summary = generate_summary(cv_text, transformed_job[0])
    print(f"Summary for {cv_name}:")
    print(f"CV Summary:\n{cv_summary}\n")




Summary for Benjamin_Salebaigi.pdf:
CV Summary:
Benjamin salebaigi 2011 - 70 temperance street, toronto, on, m5h 4e8 (604) 889 - 2622; benjaminsalebaigi@gmail.com; https://www.linkedin.com/in/benjamin-salebaigi/ education johnson graduate school of management, cornell university, ithaca, ny, usa master of international management (mim), applied economics stream (cems) dual degree ivey business school, western university, london,. on, canada january 2022 - may 2023 master of science in management (msc), international business stream. work experience kaseya, vancouver,. canada (

Summary for CV Martin Kriletich 2024.pdf:
CV Summary:
 martin kriletich is a global business management professional, with a strong background in business development. martin brings a wealth of experience and a passion for business expansion, coupled with a blend of creative thinking, analytical prowess, and a can - do attitude.

Summary for CV Podcast Host_Producer Damien  Swaby  (1).pdf:
CV Summary:
Damien swa

In [34]:
from transformers import pipeline
import re

# Load a pre-trained summarization model
summarizer = pipeline("summarization")

def summarize_text(text, max_length=500, min_length=200):
    """Generate a summary of the provided text with adjustable length parameters."""
    try:
        summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return text

def extract_sentences(summary_text, num_sentences=4):
    """Extract a specific number of sentences from the summary text."""
    sentences = re.split(r'(?<=[.!?])\s+', summary_text.strip())
    # Ensure we get at least num_sentences
    if len(sentences) < num_sentences:
        return sentences
    return sentences[:num_sentences]

def format_summary(summary_text):
    """Format the summary into two paragraphs with at least two sentences each."""
    sentences = extract_sentences(summary_text)
    
    # Ensure there are at least 4 sentences
    if len(sentences) < 4:
        return summary_text.strip()
    
    # Combine sentences into two paragraphs
    first_paragraph = ' '.join(sentences[:2])
    second_paragraph = ' '.join(sentences[2:])
    
    return f"{first_paragraph}\n\n{second_paragraph}"

def process_cvs(cvs_texts):
    """Generate and print two-paragraph summaries for each CV."""
    for cv_name, cv_text in cvs_texts.items():
        summary = summarize_text(cv_text)
        formatted_summary = format_summary(summary)
        print(f"Summary for {cv_name}:")
        print(f"CV Summary:\n{formatted_summary}\n")

process_cvs(transformed_cvs)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Summary for Benjamin_Salebaigi.pdf:
CV Summary:
Benjamin salebaigi 2011 - 70 temperance street, toronto, on, m5h 4e8 (604) 889 - 2622; benjaminsalebaigi@gmail.com; http://www.linkedin.com/in/benjamin-salebaigi/ education johnson graduate school of management, cornell university, ithaca, ny, usa master of international management (mim), applied economics stream (cems) dual degree ivey business school, western university, london, on,. on, canada (msc) master of science in management (msc), international business stream dual degree . work experience kaseya, vancouver (on - site) may 2023 - jan 2024 account manager quota carrying account managers datto pod (full - time) managed a diverse book of business, providing tailored it efficiency consulting, business reviews, and strategies to drive revenue growth and client retainment for managed service providers (msps)

Summary for CV Martin Kriletich 2024.pdf:
CV Summary:
martin martin kriletich is a global business management professional, wit

Token indices sequence length is longer than the specified maximum sequence length for this model (1056 > 1024). Running this sequence through the model will result in indexing errors


Summary for Kindra Keener 2024 CV.pdf:
CV Summary:
kindra keener has over 15 years of success selling professional services . Lead internal and external relationship building activities with basis vectors founders, global saas client engagement team of business development and marketing resources to drive growth and revenue development within the portfolio of $20+m sales pipeline .

manage client relationships to support and drive the execution of acquisition and growth strategy. oversee business development budgets, ensuring timely and accurate resources to achieve strategic revenue objectives support the post - close integration of new acquisitions and organic growth initiatives.

Error summarizing text: index out of range in self
Summary for Violetta_Nadbitova_CV-1.pdf:
CV Summary:
violetta nadbitova 07725003082 | venadbitova@gmail.com | https://www.linkedin.com/in/violettanadbitova/ | no visa sponsorship required | salford, united kingdom | immediate availability | geographically m

In [36]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import re

# Load model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

def summarize_text(text):
    # Handle long text by chunking
    max_chunk_size = 1024
    text_chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

    summary_parts = []
    for chunk in text_chunks:
        try:
            summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
            summary_parts.append(summary)
        except Exception as e:
            print(f"Error summarizing text: {e}")

    return " ".join(summary_parts)

def create_cv_summary(cv_text):
    try:
        # Summarize text
        summary = summarize_text(cv_text)

        # Split summary into sentences
        sentences = re.split(r'(?<=[.!?])\s+', summary)
        
        # Ensure at least 4 sentences are used
        if len(sentences) < 4:
            sentences += [''] * (4 - len(sentences))
        
        # Create paragraphs
        para1 = ' '.join(sentences[:2])
        para2 = ' '.join(sentences[2:4])

        return f"{para1}\n\n{para2}"

    except Exception as e:
        print(f"Error processing CV: {e}")
        return "Error processing CV."

def summarize_cv_dict(cv_dict):
    summaries = {}
    for name, text in cv_dict.items():
        summaries[name] = create_cv_summary(text)
    return summaries


summaries = summarize_cv_dict(transformed_cvs)
for name, summary in summaries.items():
    print(f"Summary for {name}:\n{summary}\n")

Your max_length is set to 150, but your input_length is only 40. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
Your max_length is set to 150, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 150, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 150, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Your

Summary for Benjamin_Salebaigi.pdf:
Benjamin salebaigi 2011 - 70 temperance street, toronto, on, m5h 4e8 (604) 889 - 2622; benjaminsalebaigi@gmail.com. education johnson graduate school of management, cornell university, ithaca, ny, usa master of international management (mim), applied economics stream (cems) dual degree ivey business school, western university, london.

kaseya, vancouver, canada (on - site) may 2023 - jan 2024 account manager quota carrying account managers datto pod (full - time) managed a diverse book of business. leveraged linkedin to multi - thread within partner s companies to have multiple champions within the org.

Summary for CV Martin Kriletich 2024.pdf:
 martin kriletich global business manager work experience 2023 - 2024. kin + carta | buenos aires partner development manager responsible for building and maintaining relationships with our mach partners.

martin martin kriletich is a global business management professional. He has a strong background in rugb