In [None]:
# Mount Google Drive under /content/drive (Colab only); the resume folder
# read later in this notebook lives below that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install PyPDF2

In [None]:
!pip install pdfplumber

In [None]:
import os
import re
import pdfplumber
import pandas as pd
import numpy as np

In [None]:
def extract_information(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts in page order, separated by single spaces, with
        leading and trailing whitespace stripped. A page that yields no text
        contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without extractable text
        # (e.g. a scanned image); fall back to '' so the join cannot raise
        # TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # The leading '' reproduces the original accumulation, which started from
    # an empty string and prefixed every page with a space.
    return " ".join(["", *page_texts]).strip()

In [None]:
def extract_details(resume_text):
    """Pull the Skills and Education sections out of raw resume text.

    Skills: the text after the first ``Skills`` + newline, up to (not
    including) the next newline that is followed by an upper-case letter, or
    to the end of the text. If that finds nothing, a lower-case ``skills`` +
    newline is tried and everything after it is taken.

    Education: the text after the first ``Education`` + newline, up to the
    next line made of a single capitalised word, or to the end of the text.

    Args:
        resume_text: Full text of one resume.

    Returns:
        dict: ``{'Skills': str | None, 'Education': str | None}``; a value is
        ``None`` when the corresponding section was not found.
    """
    def first_capture(pattern):
        # Group 1 of the first match, or None when the pattern never matches.
        hit = re.search(pattern, resume_text, re.DOTALL)
        return None if hit is None else hit.group(1)

    skills = first_capture(r'Skills\n([\s\S]*?)(?=\n[A-Z]|$)')
    if skills is None:
        # Fallback for headings that end in lower-case "skills".
        skills = first_capture(r'skills\n((?:.*)*)')

    education = first_capture(r'Education\n([\s\S]*?)(?=\n[A-Z][a-z]*\n|$)')

    return {'Skills': skills, 'Education': education}

In [None]:
data_folder = '/content/drive/MyDrive/dataset/Resume/data/data'
resume_data = []

# Walk <data_folder>/<category>/<resume>.pdf; the sub-folder name is used as
# the resume's category. Listings are sorted because os.listdir() returns
# entries in arbitrary order, which would make the row order (and the saved
# CSV) differ between runs.
for category_folder in sorted(os.listdir(data_folder)):
    category_path = os.path.join(data_folder, category_folder)
    if not os.path.isdir(category_path):
        continue

    for pdf_file in sorted(os.listdir(category_path)):
        file_stem, extension = os.path.splitext(pdf_file)
        # Case-insensitive check so files named '*.PDF' are not skipped.
        if extension.lower() != '.pdf':
            continue

        pdf_path = os.path.join(category_path, pdf_file)
        text = extract_information(pdf_path)
        details = extract_details(text)

        # Adding Category & ID. splitext() removes only the trailing
        # extension; str.replace('.pdf', '') would also delete a '.pdf'
        # appearing in the middle of a file name.
        details['ID'] = file_stem
        details['Category'] = category_folder

        resume_data.append(details)

print('PDF Extraction Done!')

In [None]:
# Turn the list of per-resume dicts into a DataFrame and persist it as CSV.
# The path is relative, so the file lands in the kernel's working directory.
resume_df = pd.DataFrame(resume_data)
resume_df.to_csv('./pdf_extracted_skills_education.csv', index=False)

In [None]:
# (rows, columns) of the extracted data.
resume_df.shape

In [None]:
# Missing values per column.
resume_df.isna().sum()

In [None]:
# Resumes for which neither Skills nor Education was extracted.
print(resume_df[(resume_df.Skills.isna() & resume_df.Education.isna())])

In [None]:
# True for resumes where both Skills and Education are missing
both_missing = resume_df['Skills'].isna() & resume_df['Education'].isna()
print(resume_df[~both_missing].shape)

# Keep only resumes with at least one extracted section, renumbered from 0
cv_df = resume_df[~both_missing].reset_index(drop=True)
cv_df.head()

In [None]:
# Missing values per column after dropping resumes with no extracted section.
cv_df.isna().sum()

In [None]:
# Resumes with no Skills section extracted.
cv_df[cv_df.Skills.isna()]

In [None]:
# Resumes with no Education section extracted.
cv_df[cv_df.Education.isna()]

In [None]:
# Number of resumes per category.
cv_df.Category.value_counts()

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of resumes per category, each bar labelled with its count
category_counts = cv_df.Category.value_counts()

plt.figure(figsize=(8,8))
category_counts.plot(kind='barh')

for position, count in enumerate(category_counts.values):
    plt.text(count, position, str(count))

plt.show();

In [None]:
!pip install contractions

In [None]:
import numpy as np
import pandas as pd

import re
import string # for text cleaning
import contractions  # for expanding short form words
from tqdm import tqdm
tqdm.pandas(desc="Progress Bar")

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Reload the extraction results written earlier in this notebook. This uses
# the same relative path that to_csv() wrote to, so the load no longer
# silently depends on the working directory being /content.
df = pd.read_csv('./pdf_extracted_skills_education.csv')
df.head()

In [None]:
# (rows, columns) of the reloaded data.
df.shape

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:

# Rows where both Skills and Education are missing.
print(df[(df.Skills.isna() & df.Education.isna())])

In [None]:
# NOTE(review): this cell and the next inspect cv_df (built before the CSV
# round-trip), not the reloaded df -- confirm that is intended.
cv_df[cv_df.Education.isna()]

In [None]:
cv_df.Category.value_counts()

In [None]:
# Resumes per category as a horizontal bar chart with count labels
plt.figure(figsize=(8,8))

per_category = cv_df.Category.value_counts()
per_category.plot(kind='barh')

for bar_position, n_resumes in enumerate(per_category.values):
    plt.text(n_resumes, bar_position, str(n_resumes))

plt.show();

In [None]:
def text_cleaning(text:str) -> str:
    """Normalise free text to lower-case alphabetic words.

    Steps, in order: lower-case and trim; expand contractions with
    ``contractions.fix``; delete URLs, e-mail addresses and phone-like digit
    groups; delete ASCII punctuation; replace every remaining non-letter
    character (digits and newlines included) with a space; trim again.
    Runs of spaces inside the text are not collapsed.

    Args:
        text: Raw text, or a null value (``None`` / ``NaN``).

    Returns:
        str: The cleaned text. Null input yields an empty string so the
        declared ``str`` return type always holds.
    """
    if pd.isnull(text):
        # A bare `return` here used to hand back None, contradicting the
        # annotation and leaving None values in cleaned columns.
        return ''

    # lower-case everything
    text = text.lower().strip()

    # expand all the short-form words
    text = contractions.fix(text)

    # remove any special chars
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
    text = re.sub(r'\S+@\S+', '', text) # Remove emails
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text) # Remove phone numbers
    # Punctuation is deleted rather than replaced by a space ("c++" -> "c")
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Anything that is not an ASCII letter (digits included) becomes a space
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    return text.strip()

In [None]:
# Work on a copy of cv_df with nulls blanked out, so concatenating
# Skills and Education never produces NaN
resume_df = cv_df.copy().fillna(value='')

# One cleaned text field per resume: "<Skills> <Education>"
combined_sections = resume_df['Skills'] + ' ' + resume_df['Education']
resume_df['CV'] = combined_sections.progress_apply(text_cleaning)

In [None]:
# Preview the first rows, including the new cleaned CV column.
resume_df.head()

In [None]:
def _word_count_summary(category):
    """describe() of the per-CV word counts for one category, as a dict."""
    word_counts = resume_df.loc[resume_df['Category'] == category, 'CV'].str.split().str.len()
    summary = word_counts.describe(percentiles=[0.05, 0.5, 0.8, 0.9, 0.95])
    return {'Category': category, **summary.to_dict()}

# One summary row per category, in order of first appearance
category_stats = [_word_count_summary(category) for category in resume_df['Category'].unique()]

stats_df = pd.DataFrame(category_stats)

# Display the resulting DataFrame
stats_df

In [None]:
# Bar chart of the mean CV word count per category. stats_df holds a single
# row per category, so a box plot collapses to one flat line per category;
# bars match the annotation loop below and the other per-category charts.
plt.figure(figsize=(12, 6))
sns.barplot(data=stats_df, x='Category', y='mean', palette='viridis')

# Annotate the bars with actual values
for index, row in stats_df.iterrows():
    plt.text(index, row['mean'], f'{round(row["mean"])}', ha='center', va='bottom', fontsize=10)

# The metric is a word count (str.split().str.len()), not a string length.
plt.title('Mean Word Count of CVs by Category')
plt.xticks(rotation=80)
plt.xlabel('Category')
plt.ylabel('Mean Word Count')
plt.show();

In [None]:
# Long format: one row per (Category, percentile) pair, ready for grouped bars
stats_df_long = stats_df.melt(id_vars=['Category'], value_vars=['5%', '80%', '90%'])

# Grouped bars comparing the selected word-count percentiles per category
plt.figure(figsize=(18, 6))
percentile_ax = sns.barplot(data=stats_df_long, x='Category', y='value', hue='variable', palette='viridis')

percentile_ax.set_title('Comparison of Percentiles of Word Length in CVs by Category')
plt.xticks(rotation=80)
percentile_ax.set_xlabel('Category')
percentile_ax.set_ylabel('Word Length Percentiles')
percentile_ax.legend(title='Percentile')
plt.show();

In [None]:
# One stacked panel per percentile, all sharing the category axis
percentiles = ['5%', '50%', '80%', '90%', '95%']
colors = ['blue', 'green', 'orange', 'red', 'purple']

fig, axes = plt.subplots(5, 1, figsize=(12, 18), sharex=True)

for panel, percentile, bar_color in zip(axes, percentiles, colors):
    # Bars for this percentile
    sns.barplot(data=stats_df, x='Category', y=percentile, color=bar_color, ax=panel)

    # Label every bar with its rounded value
    for index, row in stats_df.iterrows():
        value = row[percentile]
        panel.text(index, value, f'{round(value)}', ha='center', va='bottom', fontsize=10)

    panel.set_title(f'{percentile} Percentile')
    panel.set_ylabel('Word Length')

# Common x-axis label, rotated tick labels and an overall title
plt.xlabel('Category')
plt.xticks(rotation=80)
plt.suptitle('Comparison of Percentiles of Word Length in CVs by Category')

# Leave room between the panels
plt.tight_layout(pad=2.0)

# Render the figure
plt.show();

In [None]:
!pip install datasets

In [None]:
# `jd_data` is not defined anywhere in the visible notebook, so building the
# frame from it raised NameError on a fresh Restart & Run All. Load the
# job-description dataset directly from its published CSV instead.
jd_df = pd.read_csv(
    "https://huggingface.co/datasets/jacob-hugging-face/job-descriptions/resolve/main/training_data.csv"
)
jd_df.head()

In [None]:
!pip install contractions
import contractions
import string
from tqdm.notebook import tqdm
tqdm.pandas()  # This line enables the progress bar on apply


In [None]:
import pandas as pd

# Job-description dataset, read straight from the Hugging Face Hub URL.
url = "https://huggingface.co/datasets/jacob-hugging-face/job-descriptions/resolve/main/training_data.csv"
jd_df = pd.read_csv(url)

# Preview the first rows as plain text.
print(jd_df.head())


In [None]:
# Overwrite each job description with its cleaned form; progress_apply is the
# tqdm-enabled variant of Series.apply.
jd_df['job_description'] = jd_df['job_description'].progress_apply(text_cleaning)

In [None]:
# A lot of position_title are present
# (only the last expression of a cell is displayed, i.e. the count below)
jd_df.position_title.unique()

len(jd_df.position_title.unique())

In [None]:
# A lot of companies
# (only the last expression of a cell is displayed, i.e. the count below)
jd_df.company_name.unique()

len(jd_df.company_name.unique())

In [None]:
# `url` is re-assigned here but not used in this cell; the cell only displays jd_df.
url = "https://huggingface.co/datasets/jacob-hugging-face/job-descriptions/resolve/main/training_data.csv"
jd_df

In [None]:
# jd_df already comes from pd.read_csv, so this re-wraps an existing
# DataFrame before previewing it.
jd_df = pd.DataFrame(jd_df)
jd_df.head()

In [None]:
# Sample JD: print the first job description.

# jd_df['model_response'][0]
print(jd_df['job_description'][0])

In [None]:
# NOTE: this re-defines text_cleaning, which an earlier cell already defines.
def text_cleaning(text:str) -> str:
    """Normalise free text to lower-case alphabetic words.

    Steps, in order: lower-case and trim; expand contractions with
    ``contractions.fix``; delete URLs, e-mail addresses and phone-like digit
    groups; delete ASCII punctuation; replace every remaining non-letter
    character (digits and newlines included) with a space; trim again.
    Runs of spaces inside the text are not collapsed.

    Args:
        text: Raw text, or a null value (``None`` / ``NaN``).

    Returns:
        str: The cleaned text. Null input yields an empty string so the
        declared ``str`` return type always holds.
    """
    if pd.isnull(text):
        # A bare `return` here used to hand back None, contradicting the
        # annotation and letting None reach the tokenizer downstream.
        return ''

    # lower-case everything
    text = text.lower().strip()

    # expand all the short-form words
    text = contractions.fix(text)

    # remove any special chars
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
    text = re.sub(r'\S+@\S+', '', text) # Remove emails
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text) # Remove phone numbers
    # Punctuation is deleted rather than replaced by a space ("c++" -> "c")
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Anything that is not an ASCII letter (digits included) becomes a space
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    return text.strip()

In [None]:
# Keep only resumes with at least one extracted section (Skills or Education)
has_any_section = df['Skills'].notna() | df['Education'].notna()
cv_df = df[has_any_section].reset_index(drop=True)

# Blank out the remaining nulls so the concatenation below never yields NaN
cv_df = cv_df.fillna(value='')

# Stitch Skills & Education into one field, similar to the job-description
# text, and clean it in the same step
cv_df['CV'] = (cv_df['Skills'] + ' ' + cv_df['Education']).progress_apply(text_cleaning)

In [None]:
# (rows, columns) of the rebuilt cv_df.
cv_df.shape

In [None]:
# Sample job descriptions: take the first 15 rows *before* cleaning, so
# text_cleaning runs on 15 strings instead of on the whole column only to
# discard the rest.
job_descriptions = jd_df['job_description'][:15].apply(text_cleaning).to_list()

# Sample resumes (replace with your extracted resume data)
resumes = cv_df['CV'].to_list()

In [None]:
# Display the sampled, cleaned job descriptions.
job_descriptions

In [None]:
# Display cv_df (pandas truncates long frames in the notebook view).
cv_df

In [None]:
!pip install transformers

In [None]:
import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
%%time
from transformers import DistilBertTokenizer, DistilBertModel
import torch


# Initialize the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


# Tokenize and embed job descriptions
job_description_embeddings = []
for description in job_descriptions:
    tokens = tokenizer(description, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    job_description_embeddings.append(embeddings[0])  # Flatten the embeddings to 1D

# Tokenize and embed resumes
resume_embeddings = []
for resume in resumes:
    tokens = tokenizer(resume, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    resume_embeddings.append(embeddings[0])  # Flatten the embeddings to 1D

In [None]:
# Shape of one JD embedding and of one resume embedding.
job_description_embeddings[0].shape, resume_embeddings[0].shape

In [None]:
# Number of embedded JDs and resumes.
len(job_description_embeddings), len(resume_embeddings)

In [None]:
# Calculate cosine similarity between job descriptions and resumes
# (rows follow job_description_embeddings, columns follow resume_embeddings).
similarity_scores = cosine_similarity(job_description_embeddings, resume_embeddings)
similarity_scores

In [None]:
# Rank candidates for each job description based on similarity scores
num_top_candidates = 5
top_candidates = []

# Row i of similarity_scores holds JD i's score against every resume; keep
# the best `num_top_candidates` as (resume position, score) pairs. sorted()
# is stable, so ties keep their original resume order.
for jd_scores in similarity_scores:
    ranked = sorted(enumerate(jd_scores), key=lambda pair: pair[1], reverse=True)
    top_candidates.append(ranked[:num_top_candidates])

# Print the top candidates for each job description
for i, ranked_for_jd in enumerate(top_candidates):
    # .iloc looks rows up by position, which is what i and candidate_index
    # are; plain [i] is label-based and only works with a default RangeIndex.
    print(f"Top candidates for JD {i+1} - Position: {jd_df['position_title'].iloc[i]}")
    for candidate_index, score in ranked_for_jd:
        print(f"  Candidate {candidate_index + 1} - Similarity Score: {score:.4f} - {cv_df['Category'].iloc[candidate_index]}/{cv_df['ID'].iloc[candidate_index]}.pdf")
    print()

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel

# Persist the model and tokenizer to local directories.
# NOTE(review): no training or fine-tuning happens in the visible cells, so
# these are presumably the unchanged 'distilbert-base-uncased' weights -- confirm.
model.save_pretrained('./DistilBertModel')
tokenizer.save_pretrained('./DistilBertTokenizer')