In [2]:
import pandas as pd

Data Fetching

In [3]:
job_description_df = pd.read_csv("job_description.csv", encoding='ISO-8859-1')

# Display the first few rows of the file to understand its structure
job_description_df = job_description_df.iloc[:, :2]
job_description_df.head()

Unnamed: 0,Job Title,Job Description
0,Software Engineer,Description:\nWe are seeking a skilled Softwa...
1,Data Scientist,Job Description:\nWe are looking for a skilled...
2,Product Manager,Description:\nWe are seeking an innovative and...
3,Cloud Engineer,Description:\nWe are looking for a skilled Clo...
4,Cybersecurity Analyst,Description:\nWe are looking for a skilled Cyb...


Data Cleaning

In [4]:
# Clean the 'Job Description' column by removing unwanted text (like 'Description:')
job_description_df['Cleaned Job Description'] = job_description_df['Job Description'].str.replace(r'^( Description:|Description:|Job Description:)\s*', '', regex=True)

# Display the first few rows of the cleaned job descriptions
job_description_df[['Job Title', 'Cleaned Job Description']].head()


Unnamed: 0,Job Title,Cleaned Job Description
0,Software Engineer,We are seeking a skilled Software Engineer to ...
1,Data Scientist,We are looking for a skilled Data Scientist to...
2,Product Manager,We are seeking an innovative and strategic Pro...
3,Cloud Engineer,We are looking for a skilled Cloud Engineer to...
4,Cybersecurity Analyst,We are looking for a skilled Cybersecurity Ana...


In [5]:
job_description_df['Cleaned Job Description'] = job_description_df['Cleaned Job Description'].str.lower()

Define and Apply Summarization using BERT

In [6]:
# Define the BERT summarization function
from transformers import pipeline

# Load the BERT summarizer pipeline
summarizer = pipeline("summarization")

def summarize_with_bert(job_description):
    summary = summarizer(job_description, max_length=150, min_length=30, do_sample=False)
    return summary[0]['summary_text']

# Apply BERT summarization to the cleaned job descriptions
summarized_descriptions = job_description_df['Cleaned Job Description'].apply(summarize_with_bert)

# Add the summaries to the dataframe
job_description_df['Summary'] = summarized_descriptions

# Display the job titles along with their summaries
job_description_df[['Job Title', 'Summary']].head()


  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
Your max_length is set to 150, but your input_length is only 142. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)


Unnamed: 0,Job Title,Summary
0,Software Engineer,We are seeking a skilled software engineer to...
1,Data Scientist,We are looking for a skilled data scientist t...
2,Product Manager,We are seeking an innovative and strategic pr...
3,Cloud Engineer,We are looking for a skilled cloud engineer t...
4,Cybersecurity Analyst,We are looking for a skilled cybersecurity an...


In [7]:
import fitz  # PyMuPDF

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    full_text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        full_text += page.get_text()
    return full_text

# Example usage with one of the uploaded PDFs
pdf_path = 'C1061.pdf'  # Replace with the path to other PDFs for further extraction
extracted_text = extract_text_from_pdf(pdf_path)

# Print the first 500 characters of the extracted text for review
print(extracted_text)


Candidate Resume (ID: C1061)
Name: Alyssa Chavez
Email: alyssachavez88@gmail.com
Phone: +1-465-3587
Education
Diploma in Software Engineering (2013-2015)
Hands-on experience in full-stack web development and mobile app creation.
Work Experience
Data Scientist at ABC Inc. (2019-2023)
Built predictive models that enhanced decision-making processes, reducing operational costs by
25%.
Skills
Cybersecurity - Skilled in penetration testing, risk assessment, and securing enterprise networks
against cyber threats.
Certifications
AWS Certified Solutions Architect - Validated expertise in designing and deploying scalable AWS
solutions, optimizing performance and security.
Achievements
Published a research paper on AI ethics - Contributed to an AI ethics framework adopted by industry
leaders, shaping responsible AI development.
Tech Stack
Java, Spring Boot, MySQL, Kafka, Azure DevOps



In [8]:
import spacy

# Load SpaCy model for NER
nlp = spacy.load('en_core_web_sm')

# Example function to extract skills, qualifications, and experience from a CV
def extract_candidate_info(cv_text):
    doc = nlp(cv_text)

    # Extract named entities such as skills and qualifications
    skills = [ent.text for ent in doc.ents if ent.label_ == "ORG" or ent.label_ == "GPE"]  # Assume skills are organization or locations
    qualifications = [ent.text for ent in doc.ents if ent.label_ == "WORK_OF_ART"]  # You can modify for qualifications
    experience = [sent.text for sent in doc.sents if "experience" in sent.text.lower()]  # Simple check for experience-related sentences

    return {"Skills": skills, "Qualifications": qualifications, "Experience": experience}

# Extract candidate information
candidate_info = extract_candidate_info(extracted_text)
print(candidate_info)


{'Skills': ['Candidate Resume', 'Software Engineering', 'ABC Inc.', 'Skills\nCybersecurity - Skilled', 'AI', 'AI', 'AI'], 'Qualifications': [], 'Experience': ['Hands-on experience in full-stack web development and mobile app creation.\n', 'Work Experience\nData Scientist at ABC Inc. (2019-2023)\nBuilt predictive models that enhanced decision-making processes, reducing operational costs by\n25%.\n']}


In [9]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for a text
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the embeddings of the [CLS] token as the sentence representation
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    return embeddings

# Function to calculate match score using BERT embeddings
def calculate_match_score_bert(job_desc, candidate_info):
    # Get BERT embeddings for the job description
    job_desc_embedding = get_bert_embeddings(job_desc)

    # Combine candidate skills and qualifications into a single text
    candidate_skills_qualifications = " ".join(candidate_info["Skills"] + candidate_info["Qualifications"])

    # Get BERT embeddings for the candidate skills and qualifications
    candidate_embedding = get_bert_embeddings(candidate_skills_qualifications)

    # Calculate cosine similarity between job description and candidate's info
    similarity_score = cosine_similarity(job_desc_embedding, candidate_embedding)

    return similarity_score[0][0]  # Return the match score

# # Example usage:
# # Sample job description
# job_desc = "We are looking for a skilled Software Engineer with experience in Python, TensorFlow, and cloud computing."

# # Sample extracted candidate info (from CV text)
# candidate_info = {
#     "Skills": ["Python", "TensorFlow", "Machine Learning", "Cloud Computing"],
#     "Qualifications": ["Ph.D. in Artificial Intelligence", "Master's in Software Engineering"]
# }

# # Calculate match score between job description and candidate
# match_score = calculate_match_score_bert(job_desc, candidate_info)
# print(f"Match Score: {match_score:.4f}")

In [10]:
# Example usage:
# Extracted job description and candidate's CV text
job_scores = []
for i in range(len(job_description_df)):
  job_desc = job_description_df['Summary'][i]
  candidate_cv_text = extracted_text  # Replace with actual CV text

  # Extract candidate's information
  candidate_info = extract_candidate_info(candidate_cv_text)

  # Calculate the match score
  match_score = calculate_match_score_bert(job_desc, candidate_info)

  job_scores.append((job_description_df['Job Title'][i], match_score))

  # print(f"Match Score for {job_description_df['Job Title'][i]}: {match_score}")
print(f"Top three Job Titles best for this CV are:")
top_3_jobs = sorted(job_scores, key=lambda x: x[1], reverse=True)[:3]
for job, score in top_3_jobs:
    print(f"{job} : {score:.4f}")

Top three Job Titles best for this CV are:
Software Engineer : 0.7787
Full Stack Developer : 0.7701
Data Scientist : 0.7690


In [11]:
job_data_df = job_description_df.iloc[:, [0, 3]]

In [12]:
job_data = job_data_df.to_dict(orient='records')

In [13]:
import os

# Define the folder where your PDF files are located
pdf_folder = 'data/CVs1'  # Assuming all the uploaded PDF files are here

# Process all uploaded PDF resumes and calculate match scores for each job description
def process_resumes_and_match_jobs(pdf_folder, job_descriptions_list):
    # Initialize an empty dictionary to store match scores for each CV
    match_scores_dict = {}

    # Loop through all PDF files (CVs) in the folder
    for filename in os.listdir(pdf_folder):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, filename)
            
            # Extract text from the CV PDF
            extracted_text = extract_text_from_pdf(pdf_path)
            
            # Extract candidate information (skills, qualifications, experience) from the CV text
            candidate_info = extract_candidate_info(extracted_text)
            
            # Initialize a list to hold the similarity scores for this CV against each job description
            match_scores = []
            
            # Loop through all job descriptions and calculate similarity score for this CV
            for job_desc_dict in job_descriptions_list:
                job_desc = job_desc_dict['Summary']  # Job description text
                match_score = calculate_match_score_bert(job_desc, candidate_info)
                match_scores.append(match_score)
            
            # Store the similarity scores for this CV (use filename as the row)
            match_scores_dict[filename] = match_scores
    
    # Convert the dictionary into a DataFrame (rows: Job Titles, columns: CVs)
    job_titles = [job_desc_dict['Job Title'] for job_desc_dict in job_descriptions_list]
    match_scores_df = pd.DataFrame(match_scores_dict, index=job_titles)
    
    return match_scores_df
# Sample job description
# job_desc = "We are looking for a skilled Software Engineer with experience in Python, TensorFlow, and cloud computing."

# Process all the resumes in the folder and calculate match scores
match_scores = process_resumes_and_match_jobs(pdf_folder, job_data)

# # Display candidates and their match scores
# for result in match_scores:
#     print(f"Candidate: {result['Candidate']}, Match Score: {result['Match Score']:.4f}")

print(match_scores)


                               C1061.pdf  C1070.pdf  C1080.pdf  C1161.pdf  \
Software Engineer               0.778708   0.771432   0.761751   0.749274   
Data Scientist                  0.769007   0.778762   0.759380   0.756590   
Product Manager                 0.747488   0.743438   0.733571   0.714118   
Cloud Engineer                  0.720324   0.724967   0.725760   0.703108   
Cybersecurity Analyst           0.693514   0.696730   0.690857   0.671440   
Machine Learning Engineer       0.717283   0.721352   0.716502   0.696403   
DevOps Engineer                 0.710961   0.715087   0.723346   0.697465   
Full Stack Developer            0.770143   0.761208   0.754789   0.746209   
Big Data Engineer               0.745814   0.752461   0.742343   0.732773   
AI Researcher                   0.677243   0.690298   0.671046   0.662196   
Database Administrator          0.768704   0.780081   0.761226   0.760077   
Network Engineer                0.755478   0.753032   0.746147   0.731488   

In [14]:
match_scores.to_csv('match_scores.csv', index=True)


In [15]:
import os

# Define the folder where your PDF files are located
pdf_folder_kn = 'data/trial'  # Assuming all the uploaded PDF files are here

# Process all uploaded PDF resumes and calculate match scores for each job description
def process_resumes_and_match_jobs_kn(pdf_folder_kn, job_descriptions_list_kn):
    # Initialize an empty dictionary to store match scores for each CV
    match_scores_dict_kn = {}

    # Loop through all PDF files (CVs) in the folder
    for filename in os.listdir(pdf_folder_kn):
        if filename.endswith('.pdf'):
            pdf_path_kn = os.path.join(pdf_folder_kn, filename)
            
            # Extract text from the CV PDF
            extracted_text_kn = extract_text_from_pdf(pdf_path_kn)
            
            # Extract candidate information (skills, qualifications, experience) from the CV text
            candidate_info_kn = extract_candidate_info(extracted_text_kn)
            
            # Initialize a list to hold the similarity scores for this CV against each job description
            match_scores_kn = []
            
            # Loop through all job descriptions and calculate similarity score for this CV
            for job_desc_dict in job_descriptions_list_kn:
                job_desc_kn = job_desc_dict['Summary']  # Job description text
                match_score_kn = calculate_match_score_bert(job_desc_kn, candidate_info_kn)
                match_scores_kn.append(match_score_kn)
            
            # Store the similarity scores for this CV (use filename as the row)
            match_scores_dict_kn[filename] = match_scores_kn
    
    # Convert the dictionary into a DataFrame (rows: Job Titles, columns: CVs)
    job_titles_kn = [job_desc_dict['Job Title'] for job_desc_dict in job_descriptions_list_kn]
    match_scores_df_kn = pd.DataFrame(match_scores_dict_kn, index=job_titles_kn)
    
    return match_scores_df_kn
# Sample job description
# job_desc = "We are looking for a skilled Software Engineer with experience in Python, TensorFlow, and cloud computing."

# Process all the resumes in the folder and calculate match scores
match_scores_kn = process_resumes_and_match_jobs(pdf_folder_kn, job_data)

# # Display candidates and their match scores
# for result in match_scores:
#     print(f"Candidate: {result['Candidate']}, Match Score: {result['Match Score']:.4f}")

print(match_scores_kn)


                               C1061.pdf  C1070.pdf  C1080.pdf  C1161.pdf  \
Software Engineer               0.778708   0.771432   0.761751   0.749274   
Data Scientist                  0.769007   0.778762   0.759380   0.756590   
Product Manager                 0.747488   0.743438   0.733571   0.714118   
Cloud Engineer                  0.720324   0.724967   0.725760   0.703108   
Cybersecurity Analyst           0.693514   0.696730   0.690857   0.671440   
Machine Learning Engineer       0.717283   0.721352   0.716502   0.696403   
DevOps Engineer                 0.710961   0.715087   0.723346   0.697465   
Full Stack Developer            0.770143   0.761208   0.754789   0.746209   
Big Data Engineer               0.745814   0.752461   0.742343   0.732773   
AI Researcher                   0.677243   0.690298   0.671046   0.662196   
Database Administrator          0.768704   0.780081   0.761226   0.760077   
Network Engineer                0.755478   0.753032   0.746147   0.731488   

In [16]:
import pandas as pd

# Threshold value
threshold = 0.75

# Function to shortlist candidates based on the threshold
def shortlist_candidates_by_threshold(match_scores_df, threshold):
    shortlisted_candidates = {}

    # Loop through each job title (row)
    for job_title in match_scores_df.index:
        # Get the CVs for this job title where the match score is above the threshold
        shortlisted_candidates[job_title] = match_scores_df.columns[
            match_scores_df.loc[job_title] >= threshold
        ].tolist()

    return shortlisted_candidates

# Get the shortlisted candidates
shortlisted_threshold = shortlist_candidates_by_threshold(match_scores, threshold)

# Display the shortlisted candidates for each job title
# for job_title, candidates in shortlisted_threshold.items():
#     print(f"{job_title}: {', '.join(candidates)}")
shortlisted_threshold


{'Software Engineer': ['C1061.pdf',
  'C1070.pdf',
  'C1080.pdf',
  'C1164.pdf',
  'C1191.pdf',
  'C1212.pdf',
  'C1228.pdf',
  'C1236.pdf',
  'C1320.pdf',
  'C1446.pdf',
  'C1487.pdf',
  'C1499.pdf',
  'C1547.pdf',
  'C1627.pdf',
  'C1677.pdf',
  'C1701.pdf',
  'C1781.pdf',
  'C1789.pdf',
  'C1796.pdf',
  'C2139.pdf',
  'C2144.pdf',
  'C2235.pdf',
  'C2250.pdf',
  'C2287.pdf',
  'C2430.pdf',
  'C2546.pdf',
  'C2603.pdf',
  'C2607.pdf',
  'C2652.pdf',
  'C2720.pdf',
  'C2775.pdf',
  'C2808.pdf',
  'C2836.pdf',
  'C2838.pdf',
  'C3019.pdf',
  'C3127.pdf',
  'C3142.pdf',
  'C3169.pdf',
  'C3226.pdf',
  'C3315.pdf',
  'C3363.pdf',
  'C3416.pdf',
  'C3445.pdf',
  'C3464.pdf',
  'C3539.pdf',
  'C3570.pdf',
  'C3620.pdf',
  'C3627.pdf',
  'C3717.pdf',
  'C3761.pdf',
  'C3771.pdf',
  'C3830.pdf',
  'C3863.pdf',
  'C3899.pdf',
  'C3912.pdf',
  'C3922.pdf',
  'C4024.pdf',
  'C4194.pdf',
  'C4256.pdf',
  'C4277.pdf',
  'C4307.pdf',
  'C4331.pdf',
  'C4439.pdf',
  'C4444.pdf',
  'C4460.pdf',
  'C

In [21]:
shortlisted_threshold_df = pd.DataFrame(shortlisted_threshold.items(), columns=['Job Title', 'Shortlisted Candidates'])

In [22]:
shortlisted_threshold_df.to_csv('shortlisted.csv', index=True)

In [17]:
import pandas as pd

# Function to shortlist top N candidates for each job title
def shortlist_candidates_by_number(match_scores_df, num_candidates):
    shortlisted_candidates = {}

    # Loop through each job title (row)
    for job_title in match_scores_df.index:
        # Get the top N CVs with the highest similarity scores for this job title
        top_candidates = match_scores_df.loc[job_title].sort_values(ascending=False).head(num_candidates).index.tolist()
        shortlisted_candidates[job_title] = top_candidates

    return shortlisted_candidates

# Number of candidates to shortlist for each job title
num_candidates = 5  # Adjust this value based on how many candidates you want to shortlist

# Get the shortlisted candidates
shortlisted = shortlist_candidates_by_number(match_scores, num_candidates)

# Display the shortlisted candidates for each job title
# for job_title, candidates in shortlisted.items():
#     print(f"{job_title}: {', '.join(candidates)}")

shortlisted

{'Software Engineer': ['C9146.pdf',
  'C9282.pdf',
  'C5951.pdf',
  'C3363.pdf',
  'C9228.pdf'],
 'Data Scientist': ['C9146.pdf',
  'C6583.pdf',
  'C5951.pdf',
  'C3142.pdf',
  'C8063.pdf'],
 'Product Manager': ['C9282.pdf',
  'C9146.pdf',
  'C3363.pdf',
  'C5951.pdf',
  'C1164.pdf'],
 'Cloud Engineer': ['C9146.pdf',
  'C4627.pdf',
  'C9282.pdf',
  'C7094.pdf',
  'C6768.pdf'],
 'Cybersecurity Analyst': ['C9146.pdf',
  'C6768.pdf',
  'C9282.pdf',
  'C5951.pdf',
  'C7492.pdf'],
 'Machine Learning Engineer': ['C4627.pdf',
  'C9146.pdf',
  'C6768.pdf',
  'C9282.pdf',
  'C6583.pdf'],
 'DevOps Engineer': ['C6768.pdf',
  'C7492.pdf',
  'C9282.pdf',
  'C7094.pdf',
  'C5390.pdf'],
 'Full Stack Developer': ['C9282.pdf',
  'C9146.pdf',
  'C3363.pdf',
  'C7093.pdf',
  'C5638.pdf'],
 'Big Data Engineer': ['C9146.pdf',
  'C6583.pdf',
  'C9282.pdf',
  'C8063.pdf',
  'C2430.pdf'],
 'AI Researcher': ['C9146.pdf',
  'C6583.pdf',
  'C8063.pdf',
  'C5951.pdf',
  'C2808.pdf'],
 'Database Administrator': ['

In [18]:
import os
import re
import pandas as pd
import fitz  # PyMuPDF (for PDF extraction)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    full_text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        full_text += page.get_text()
    return full_text

# Function to extract email address from the text
def extract_email(text):
    # Regular expression to match email addresses
    email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    match = re.search(email_regex, text)
    if match:
        return match.group(0)  # Return the first matched email
    return None

# Function to process all the CVs in the folder and extract emails
def extract_emails_from_cvs(pdf_folder):
    data = []  # List to store CV filename and email

    # Loop through all files in the directory
    for filename in os.listdir(pdf_folder):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, filename)
            
            # Extract text from the CV PDF
            extracted_text = extract_text_from_pdf(pdf_path)
            
            # Extract email address from the extracted text
            email = extract_email(extracted_text)
            
            if email:  # If an email is found, add to the data
                data.append({"CV Filename": filename, "Email": email})

    # Create a DataFrame from the extracted data
    email_df = pd.DataFrame(data)
    return email_df

# Example usage
pdf_folder = 'data/CVs1'  # Replace with the path to your CVs folder
email_df = extract_emails_from_cvs(pdf_folder)

# Display the DataFrame with CV filenames and their corresponding emails
print(email_df)

email_dict = email_df.to_dict(orient='records')

# Display the email dictionary
email_dict


    CV Filename                      Email
0     C1061.pdf   alyssachavez88@gmail.com
1     C1070.pdf  scottsaunders13@gmail.com
2     C1080.pdf     pamelakerr20@gmail.com
3     C1161.pdf  richardmolina72@gmail.com
4     C1164.pdf  deborahfoster16@gmail.com
..          ...                        ...
195   C9779.pdf    jasondawson17@gmail.com
196   C9884.pdf    eugenehardy24@gmail.com
197   C9897.pdf    lancevaldez52@gmail.com
198   C9919.pdf     tonizamora21@gmail.com
199   C9945.pdf      erikapaul51@gmail.com

[200 rows x 2 columns]


[{'CV Filename': 'C1061.pdf', 'Email': 'alyssachavez88@gmail.com'},
 {'CV Filename': 'C1070.pdf', 'Email': 'scottsaunders13@gmail.com'},
 {'CV Filename': 'C1080.pdf', 'Email': 'pamelakerr20@gmail.com'},
 {'CV Filename': 'C1161.pdf', 'Email': 'richardmolina72@gmail.com'},
 {'CV Filename': 'C1164.pdf', 'Email': 'deborahfoster16@gmail.com'},
 {'CV Filename': 'C1191.pdf', 'Email': 'traceyjones36@gmail.com'},
 {'CV Filename': 'C1212.pdf', 'Email': 'robertflores62@gmail.com'},
 {'CV Filename': 'C1228.pdf', 'Email': 'jeffreygordon62@gmail.com'},
 {'CV Filename': 'C1236.pdf', 'Email': 'amandaschneider31@gmail.com'},
 {'CV Filename': 'C1320.pdf', 'Email': 'ryanflowers36@gmail.com'},
 {'CV Filename': 'C1446.pdf', 'Email': 'danielbailey75@gmail.com'},
 {'CV Filename': 'C1487.pdf', 'Email': 'garydavis62@gmail.com'},
 {'CV Filename': 'C1499.pdf', 'Email': 'brianhurley66@gmail.com'},
 {'CV Filename': 'C1547.pdf', 'Email': 'tracysanchez80@gmail.com'},
 {'CV Filename': 'C1627.pdf', 'Email': 'dennissch

In [23]:
email_df.to_csv('email_data.csv', index=True)


In [28]:
import pandas as pd
import json

# Load the CSV file into a DataFrame
csv_file = 'data/shortlisted.csv'  # Adjust this to the correct path if needed
df = pd.read_csv(csv_file)

# Convert the 'Shortlisted Candidates' column from string representation of list to actual list
df['Shortlisted Candidates'] = df['Shortlisted Candidates'].apply(lambda x: json.loads(x.replace("'", "\"")))

# Convert the DataFrame to a dictionary
shortlisted_dict = dict(zip(df['Job Title'], df['Shortlisted Candidates']))

# Convert the dictionary to a JSON string
json_data = json.dumps(shortlisted_dict, indent=2)

# Save the JSON data to a file
json_file = 'shortlisted_candidates.json'
with open(json_file, 'w') as f:
    f.write(json_data)

print(f"CSV data has been successfully converted to JSON and saved as {json_file}")


CSV data has been successfully converted to JSON and saved as shortlisted_candidates.json


In [24]:
import pandas as pd

# Step 1: Read the CSV file into a DataFrame
csv_file = 'data/shortlisted.csv'  # Replace with your CSV file path
dfd = pd.read_csv(csv_file)

# Step 2: Convert the DataFrame to a JSON file
json_file = 'output.json'  # Replace with the desired JSON file path
dfd.to_json(json_file, orient='records', lines=True)

print(f"CSV file converted to JSON and saved as {json_file}")


CSV file converted to JSON and saved as output.json


In [19]:
import smtplib
import ssl
from email.message import EmailMessage

# Sender and receiver email addresses
email_sender = 'karthiksamala33@gmail.com'  # Replace with your email address
email_password = 'jgcmdplcdzqgosnx'  # Replace with your email password (or App Password if 2FA enabled)
email_receiver = 'ksamala1802@gmail.com'  # Replace with the recipient's email address

# Add SSL (layer of security)
context = ssl.create_default_context()

def send_email_to_shortlisted_candidates(shortlisted_candidates, job_title, email_df, sender_email, sender_password):
  try:
    with smtplib.SMTP_SSL('smtp.gmail.com', 465, context=context) as smtp:
      smtp.login(email_sender, email_password)

    # Email content template
      subject = f"Congratulations! You are shortlisted for {job_title}"
      body = f"Dear Candidate,\n\nWe are pleased to inform you that you have been shortlisted for the role of {job_title}. We will contact you soon for the next steps.\n\nBest Regards,\nThe Hiring Team"
      
      body = f"""

            Dear [Candidate's Name],

            Congratulations! You've been shortlisted for the {job_title} role at Accenture. The interview will be scheduled in the coming days, and we'll share further details soon.

            Please feel free to prepare, and let us know if you have any questions.

            Best regards,
            Team Octopus
            HR Team
            Accenture

            """


      for candidate in shortlisted_candidates:
        # Find the email address for this candidate (CV filename) from email_df
        candidate_email = email_df[email_df['CV Filename'] == candidate]['Email'].values[0]

        # Create the email message
        em = EmailMessage()
        em['From'] = email_sender
        em['To'] = candidate_email
        em['Subject'] = subject
        em.set_content(body)

        # Send the email
        smtp.sendmail(sender_email, candidate_email, em.as_string())
      
      print(f"Emails sent successfully to shortlisted candidates for {job_title}.")
  except Exception as e:
    print(f"Error sending email: {e}")

for job_title, candidates in shortlisted.items():
    if candidates:  # If there are any shortlisted candidates for this job title
        send_email_to_shortlisted_candidates(candidates, job_title, email_df, email_sender, email_password)




Emails sent successfully to shortlisted candidates for Software Engineer.
Emails sent successfully to shortlisted candidates for Data Scientist.
Emails sent successfully to shortlisted candidates for Product Manager.
Emails sent successfully to shortlisted candidates for Cloud Engineer.
Emails sent successfully to shortlisted candidates for Cybersecurity Analyst.
Emails sent successfully to shortlisted candidates for Machine Learning Engineer.
Emails sent successfully to shortlisted candidates for DevOps Engineer.
Emails sent successfully to shortlisted candidates for Full Stack Developer.
Emails sent successfully to shortlisted candidates for Big Data Engineer.
Emails sent successfully to shortlisted candidates for AI Researcher.
Emails sent successfully to shortlisted candidates for Database Administrator.
Emails sent successfully to shortlisted candidates for Network Engineer.
Emails sent successfully to shortlisted candidates for Software Architect.
Emails sent successfully to shor

KeyboardInterrupt: 