<a href="https://colab.research.google.com/github/Harsh6959Pathak/streamlit-example/blob/master/RESUME_TESTER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import re
import ast
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

nltk.download('stopwords')

# Load Dataset
df = pd.read_csv("/content/resume_dataset.csv.csv")

# Preprocessing Functions
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

# Extract relevant features
df['skills'] = df['skills'].apply(lambda x: ' '.join(ast.literal_eval(x)) if isinstance(x, str) else '')
df['career_objective'] = df['career_objective'].apply(clean_text)
df['degree_names'] = df['degree_names'].apply(clean_text)
df['major_field_of_studies'] = df['major_field_of_studies'].apply(clean_text)
df['skills'] = df['skills'].apply(clean_text)
df['positions'] = df['positions'].apply(clean_text)
df['responsibilities'] = df['responsibilities'].apply(clean_text)

# Combine features for resume text
df['resume_text'] = df[['career_objective', 'degree_names', 'major_field_of_studies', 'skills', 'positions', 'responsibilities']].agg(' '.join, axis=1)

# Load Job Description (Mock JD Example)
jd = """
We are looking for a Software Engineer with expertise in Python, Machine Learning, and Data Analysis.
Candidate should have experience in data preprocessing, model training, and software development.
Qualifications: B.Tech in Computer Science or related field.
"""

# Preprocess JD
jd_cleaned = clean_text(jd)

# TF-IDF Vectorization
tfidf = TfidfVectorizer()
resume_tfidf = tfidf.fit_transform(df['resume_text'])
jd_tfidf = tfidf.transform([jd_cleaned])

# Compute Cosine Similarity
cosine_scores = cosine_similarity(jd_tfidf, resume_tfidf)[0]
df['similarity_score'] = cosine_scores

# BERT-based Similarity (Optional for better matching)
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
resume_embeddings = bert_model.encode(df['resume_text'].tolist(), convert_to_tensor=True)
jd_embedding = bert_model.encode([jd_cleaned], convert_to_tensor=True)
bert_scores = cosine_similarity(jd_embedding.cpu().numpy(), resume_embeddings.cpu().numpy())[0]
df['bert_score'] = bert_scores

# Final Score (Weighted Combination)
df['final_score'] = 0.5 * df['similarity_score'] + 0.5 * df['bert_score']

# Select Top 3 Candidates
top_candidates = df.nlargest(3, 'final_score')[['resume_text', 'final_score']]
print(top_candidates)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

                                            resume_text  final_score
1099   btech computers data science data analysis da...     0.512236
8700  experienced software professional looking data...     0.511704
8285   btech computers data science data analysis da...     0.507674


In [1]:
# Install required libraries (if not already installed)
!pip install pandas nltk scikit-learn sentence-transformers openai torch

# Import Libraries
import pandas as pd
import re
import ast
import numpy as np
import nltk
import torch
import openai
from google.colab import files
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Download NLTK stopwords
nltk.download('stopwords')

# File Upload (Google Colab)
uploaded = files.upload()

# Load dataset
dataset_filename = list(uploaded.keys())[0]  # Get uploaded filename
df = pd.read_csv(dataset_filename)
print(f"✅ Successfully loaded: {dataset_filename}")
display(df.head())  # Show first few rows of dataset

# Preprocessing Function
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

# Ensure 'skills' column is properly formatted
if 'skills' in df.columns:
    df['skills'] = df['skills'].apply(lambda x: ' '.join(ast.literal_eval(x)) if isinstance(x, str) else '')

# Apply cleaning to relevant columns
columns_to_clean = ['career_objective', 'degree_names', 'major_field_of_studies', 'skills', 'positions', 'responsibilities']
for col in columns_to_clean:
    if col in df.columns:
        df[col] = df[col].apply(clean_text)

# Combine features for resume text
df['resume_text'] = df[columns_to_clean].agg(' '.join, axis=1)

# Load Job Description (Mock JD Example)
jd = """[Job Description Text]"""

# Preprocess JD
jd_cleaned = clean_text(jd)

# TF-IDF Vectorization
tfidf = TfidfVectorizer()
resume_tfidf = tfidf.fit_transform(df['resume_text'])
jd_tfidf = tfidf.transform([jd_cleaned])

# Compute Cosine Similarity
cosine_scores = cosine_similarity(jd_tfidf, resume_tfidf)[0]
df['similarity_score'] = cosine_scores

# Upgrade Model for Better Accuracy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)
resume_embeddings = bert_model.encode(df['resume_text'].tolist(), convert_to_tensor=True)
jd_embedding = bert_model.encode([jd_cleaned], convert_to_tensor=True)
bert_scores = cosine_similarity(jd_embedding.cpu().numpy(), resume_embeddings.cpu().numpy())[0]
df['bert_score'] = bert_scores

# Final Score (Weighted Combination)
df['final_score'] = 0.5 * df['similarity_score'] + 0.5 * df['bert_score']

# Select Top 3 Candidates
top_candidates = df.nlargest(3, 'final_score')

# Skill Gap Analysis
required_skills = ["Python", "Machine Learning", "Cloud Computing"]
df['missing_skills'] = df['skills'].apply(lambda x: [skill for skill in required_skills if skill not in x])

# OpenAI GPT Integration for Ranking (Requires OpenAI API Key)
def rank_resume(resume_text, jd_text):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an expert HR recruiter."},
                {"role": "user", "content": "Rank this resume based on its suitability for the job:\n{resume_text}\n Job Description: {jd_text}"}
            ]
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        return f"Error: {e}"

df['gpt_rank'] = df['resume_text'].apply(lambda x: rank_resume(x, jd_cleaned))

# Display Results in Google Colab
print("\n📌 **Top Candidates:**")
display(top_candidates[['resume_text', 'final_score', 'missing_skills']])

print("✅ Process Completed Successfully!")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving resume_dataset.csv.csv to resume_dataset.csv (1).csv
✅ Successfully loaded: resume_dataset.csv (1).csv


Unnamed: 0,address,career_objective,skills,educational_institution_name,degree_names,passing_years,educational_results,result_types,major_field_of_studies,professional_company_names,...,online_links,issue_dates,expiry_dates,﻿job_position_name,educationaL_requirements,experiencere_requirement,age_requirement,responsibilities.1,skills_required,matched_score
0,,Big data analytics working and database wareho...,"['Big Data', 'Hadoop', 'Hive', 'Python', 'Mapr...",['The Amity School of Engineering & Technology...,['B.Tech'],['2019'],['N/A'],[None],['Electronics'],['Coca-COla'],...,,,,Senior Software Engineer,B.Sc in Computer Science & Engineering from a ...,At least 1 year,,Technical Support\nTroubleshooting\nCollaborat...,,0.85
1,,Fresher looking to join as a data analyst and ...,"['Data Analysis', 'Data Analytics', 'Business ...","['Delhi University - Hansraj College', 'Delhi ...","['B.Sc (Maths)', 'M.Sc (Science) (Statistics)']","['2015', '2018']","['N/A', 'N/A']","['N/A', 'N/A']","['Mathematics', 'Statistics']",['BIB Consultancy'],...,,,,Machine Learning (ML) Engineer,M.Sc in Computer Science & Engineering or in a...,At least 5 year(s),,Machine Learning Leadership\nCross-Functional ...,,0.75
2,,,"['Software Development', 'Machine Learning', '...","['Birla Institute of Technology (BIT), Ranchi']",['B.Tech'],['2018'],['N/A'],['N/A'],['Electronics/Telecommunication'],['Axis Bank Limited'],...,,,,"Executive/ Senior Executive- Trade Marketing, ...",Master of Business Administration (MBA),At least 3 years,,"Trade Marketing Executive\nBrand Visibility, S...",Brand Promotion\nCampaign Management\nField Su...,0.416667
3,,To obtain a position in a fast-paced business ...,"['accounts payables', 'accounts receivables', ...","['Martinez Adult Education, Business Training ...",['Computer Applications Specialist Certificate...,['2008'],[None],[None],['Computer Applications'],"['Company Name ï¼ City , State', 'Company Name...",...,,,,Business Development Executive,Bachelor/Honors,1 to 3 years,Age 22 to 30 years,Apparel Sourcing\nQuality Garment Sourcing\nRe...,Fast typing skill\nIELTSInternet browsing & on...,0.76
4,,Professional accountant with an outstanding wo...,"['Analytical reasoning', 'Compliance testing k...",['Kent State University'],['Bachelor of Business Administration'],[None],['3.84'],[None],['Accounting'],"['Company Name', 'Company Name', 'Company Name...",...,[None],[None],"['February 15, 2021']",Senior iOS Engineer,Bachelor of Science (BSc) in Computer Science,At least 4 years,,iOS Lifecycle\nRequirement Analysis\nNative Fr...,iOS\niOS App Developer\niOS Application Develo...,0.65


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

KeyboardInterrupt: 