In [57]:
# Cell 1: Imports and Setup
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Uncomment the following line if you need to download the NLTK data
# nltk.download('all')


In [58]:
# Cell 2: Data Loading and Preprocessing
# `data` is the working frame that the following cells reshape; `data_copy`
# is an independent copy taken before any column is dropped (later cells
# still read its 'text' and 'name' columns).
data = pd.read_csv('Resume Ranking Data set.csv')
data_copy = data.copy(deep=True)


In [59]:
# Drop unnecessary columns
# The working frame is rebound to the reduced result instead of being
# mutated in place; `data_copy` still holds every original column.
columns_to_drop = [
    'accomplishments_segment',
    'education_segment',
    'emails',
    'misc_segment',
    'name',
    'objectives_segment',
    'phone',
    'projects_segment',
    'skills_segment',
    'text',
    'university_0',
    'university_1',
    'university_2',
    'university_3',
    'university_4',
    'university_5',
    'url',
    'work_segment',
]
data = data.drop(columns=columns_to_drop)

In [60]:
# Fill missing values without using inplace=True
# Missing degrees take the most frequent degree string; the other columns
# take a fixed placeholder.
most_common_degree = data['degree'].mode()[0]
data = data.assign(
    degree=data['degree'].fillna(most_common_degree),
    links=data['links'].fillna('Missing'),
    work_experience=data['work_experience'].fillna(0),
    job_titles=data['job_titles'].fillna('No Job'),
)

In [61]:
# Extract unique degrees
# Every row's comma-separated degree string is glued into one long string,
# split back into individual codes, and normalised (whitespace and dots
# removed, upper-cased) before de-duplication.
degree_blob = ','.join(data['degree'].dropna().tolist())
all_degrees = [
    re.sub(r"[\s.]", "", token).upper()
    for token in degree_blob.split(',')
]
unique_degrees = set(all_degrees)

In [62]:
# Initialize new columns
# Each derived column starts with its "nothing found" placeholder; the
# classification functions below replace it when a match is found.
data = data.assign(
    bachelor_degrees='No Degree',
    master_degrees='No Degree',
    doctorate_degrees='No Degree',
    profiles='No Profile',
)

In [63]:
# Cell 3: Degree and Profile Classification
def classify_degrees(row):
    """Sort the degree codes of one row into bachelor/master/doctorate columns.

    ``row['degree']`` is a comma-separated string. Whitespace and dots are
    stripped and the text is upper-cased before it is split, so ``"B.Tech"``
    is seen as ``"BTECH"``. Each recognised code is written to the matching
    column; further codes of the same level are appended with ``' , '``.
    Codes that are not in the lookup table are ignored.

    Parameters
    ----------
    row : mapping with a ``copy()`` method (e.g. ``pandas.Series`` or ``dict``)
        Must provide the keys ``'degree'``, ``'bachelor_degrees'``,
        ``'master_degrees'`` and ``'doctorate_degrees'``; the three level
        columns are expected to start as ``'No Degree'``.

    Returns
    -------
    A copy of ``row`` with the level columns filled in. The object passed in
    is left untouched.
    """
    level_columns = {
        'bachelor_degrees': ('BE', 'BS', 'BSC', 'BTECH'),
        'master_degrees': ('ME', 'MS', 'MSC', 'MTECH'),
        'doctorate_degrees': ('PHD',),
    }
    # Invert to "degree code -> target column" so each code is one lookup.
    column_of = {
        code: column
        for column, codes in level_columns.items()
        for code in codes
    }

    # Work on a copy: pandas documents mutating the object that apply()
    # hands to a user function as unsupported.
    row = row.copy()
    for degree in re.sub(r"[\s.]", "", row['degree']).upper().split(','):
        column = column_of.get(degree)
        if column is None:
            continue  # unrecognised code: leave every column as it is
        current = row[column]
        row[column] = degree if current == 'No Degree' else current + ' , ' + degree
    return row

# Run the degree classifier once per row (axis='columns' is the same as axis=1).
data = data.apply(classify_degrees, axis='columns')

In [64]:
def classify_profiles(row):
    """Record which profile sites appear among the links of one row.

    ``row['links']`` is a comma-separated string. Whitespace is removed and
    the text is lower-cased before it is split, so the match does not depend
    on the capitalisation of the URL. When at least one known site is found,
    ``'profiles'`` is set to the site labels joined with ``' , '``, always in
    the order Github, Linkedin, and each label at most once. When nothing is
    found, ``'profiles'`` keeps the value it had on entry.

    The previous version overwrote an already-found Linkedin label whenever
    a github link came later in the list, so the result depended on link
    order. It also repeated 'Linkedin' for every additional linkedin link.

    Parameters
    ----------
    row : mapping with a ``copy()`` method (e.g. ``pandas.Series`` or ``dict``)
        Must provide the keys ``'links'`` (str) and ``'profiles'``.

    Returns
    -------
    A copy of ``row`` with ``'profiles'`` updated. The object passed in is
    left untouched.
    """
    known_sites = (('github', 'Github'), ('linkedin', 'Linkedin'))

    links = re.sub(r"[\s]", "", row['links']).lower().split(',')
    found = [
        label
        for needle, label in known_sites
        if any(needle in link for link in links)
    ]

    # Work on a copy: pandas documents mutating the object that apply()
    # hands to a user function as unsupported.
    row = row.copy()
    if found:
        row['profiles'] = ' , '.join(found)
    return row

# Run the profile classifier once per row (axis='columns' is the same as axis=1).
data = data.apply(classify_profiles, axis='columns')

In [65]:
# Drop unnecessary columns after processing
# 'degree' and 'links' have been turned into the derived columns above.
data = data.drop(columns=['degree', 'links'])

# Cell 4: Lemmatization and TF-IDF Cosine Similarity
# Shared by lemmatized_words(): the WordNet lemmatizer and the default
# CountVectorizer analyzer used to tokenise text.
lemmatizer = WordNetLemmatizer()
analyzer = CountVectorizer().build_analyzer()


In [66]:
def lemmatized_words(doc):
    """Tokenise ``doc`` and return its lemmatised tokens without stop words.

    The text is lower-cased and split with the module-level ``analyzer``.
    English stop words (NLTK's list) are dropped, and every remaining token
    is lemmatised with the module-level ``lemmatizer``, using the WordNet
    part of speech returned by ``get_wordnet_pos``.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatised tokens in document order.
    """
    # Build the stop-word set once per call; the previous version rebuilt it
    # (re-reading the NLTK word list) for every single token.
    english_stopwords = set(stopwords.words('english'))
    tokens = analyzer(doc.lower())
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in tokens
        if token not in english_stopwords
    ]

In [67]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective (J), verb (V) or adverb (R). Every
    other tag, including nouns, maps to ``wordnet.NOUN``.
    """
    _, treebank_tag = nltk.pos_tag([word])[0]
    initial = treebank_tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any tag not listed above both resolve to noun.
    return wordnet.NOUN

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tf_idf_cosine_similarity(job_desc, resumes):
    """Score every resume against a job description with TF-IDF cosine similarity.

    The TF-IDF vocabulary is fitted on the job description only, so resume
    terms that do not occur in the job description do not contribute.
    Tokenisation and lemmatisation are delegated to ``lemmatized_words``.

    Parameters
    ----------
    job_desc : str
        Job description text. The previous version ignored this argument and
        read the global ``job_description`` instead.
    resumes : sequence of str
        Resume texts to score.

    Returns
    -------
    list of float
        One similarity per resume, in the order of ``resumes``.
    """
    if len(resumes) == 0:
        return []

    tf_idf_vectorizer = TfidfVectorizer(analyzer=lemmatized_words, lowercase=True)
    job_desc_vector = tf_idf_vectorizer.fit_transform([job_desc])
    resume_vectors = tf_idf_vectorizer.transform(resumes)

    # One call on the sparse matrices gives a (1, n_resumes) array. This
    # avoids densifying to np.matrix and comparing one row at a time.
    return cosine_similarity(job_desc_vector, resume_vectors)[0].tolist()

In [None]:
# Load job description
with open('Job Description.txt', 'r', encoding='utf-8') as f:
    job_description = ' '.join(line.strip() for line in f)

# Prepare resume text data
# Missing resume text becomes an empty document so the analyzer always
# receives a string.
resume_texts = data_copy['text'].fillna('').astype(str).values

# Define and fit TF-IDF vectorizer
# `LemmaTfidfVectorizer` is not defined in any visible cell (NameError on a
# fresh kernel). A TfidfVectorizer using lemmatized_words as its analyzer is
# used instead, which is the same setup as compute_tf_idf_cosine_similarity.
# The vocabulary is fitted on the job description only.
vectorizer = TfidfVectorizer(analyzer=lemmatized_words, lowercase=True)
job_desc_vector = vectorizer.fit_transform([job_description])
resume_vectors = vectorizer.transform(resume_texts)

# Calculate cosine similarities
# One call on the sparse matrices returns shape (1, n_resumes).
cosine_similarities = cosine_similarity(job_desc_vector, resume_vectors)[0].tolist()

# Zip and sort resume ratings
# The index runs over data_copy, the same frame the names are taken from.
zipped_resume_ratings = zip(cosine_similarities, data_copy['name'], range(len(data_copy)))
sorted_resume_ratings = sorted(zipped_resume_ratings, key=lambda x: x[0], reverse=True)

# Display sorted resume ratings
sorted_resume_ratings_df = pd.DataFrame(sorted_resume_ratings, columns=['Cosine Similarity', 'Name', 'Index'])
sorted_resume_ratings_df.head()


In [56]:
# Cell 6: Result Display
# Compute resume scores
# Similarities are expressed as percentages rounded to two decimals and
# placed next to the candidate names from the untouched copy.
resume_scores = [round(similarity * 100, 2) for similarity in cosine_similarities]
score_frame = pd.DataFrame({'resume_score(%)': resume_scores})
result_df = pd.concat([data_copy['name'], score_frame], axis=1)

print(result_df)

                      name  resume_score(%)
0             Prasad Nagar            46.88
1        Software Engineer            53.57
2        Ashwin Khandelwal            49.79
3     Computer Engineering            60.64
4                Python            44.27
...                    ...              ...
1568          Gaurav Arora            35.27
1569         Ayokunle Paul            33.46
1570       DevOps Engineer            54.32
1571        Udaya SaiKiran            42.44
1572             D. Shinde            39.39

[1573 rows x 2 columns]
