# FindHer Matching Algorithm Research/Implementation

## Main Idea:
* Extract data for user - Ranked choices, Past experience descriptions, past job titles
* Extract data for job posting - company's ranked choices, job description, job title
* TF-IDF similarity between past experience descriptions and the job description, and/or between past job titles and the job title
* Ranked-choice comparison for each question


## Extract Data for a User

In [None]:
# This can be done with our backend API calls
# from backend import applicant_answer, applicant_question
import json
import requests

USER_NUM = 1
BASE_API_URL = 'http://localhost:8000'

# Fetch every answer record for the applicant identified by USER_NUM.
# The URL has to be an f-string: a plain string would send the literal
# "{BASE_API_URL}/..." placeholder text instead of the real address.
# NOTE(review): assumes the endpoint takes the applicant id as its last path
# segment and that USER_NUM is that id — confirm against the backend routes.
response = requests.get(f"{BASE_API_URL}/all-applicant-answers/{USER_NUM}")
# Fail loudly on an HTTP error instead of trying to parse an error body as JSON.
response.raise_for_status()

# this should load user answers as a list
user_answers = json.loads(response.text)

# Flatten the per-record "answers" lists into one flat list.
answers_list = []
for app_answer in user_answers:
    answers_list.extend(app_answer["answers"])

# we can do something similar for job descriptions

: 

## Extract Data for Job Descriptions

In [20]:
import nltk
import ssl

# Disabled workaround: the commented-out block below would replace the ssl
# module's default HTTPS context factory with the unverified one.
# NOTE(review): presumably kept for machines where nltk.download() fails on
# certificate checks; enabling it skips certificate verification, so confirm
# it is really needed before uncommenting.
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()
# Download only the 'stopwords' package (later cells import nltk.corpus.stopwords).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Try Extracting Important Stuff from Job Description

In [13]:
from rake_nltk import Rake

job_description = """Qualifications:
Must be currently pursuing a B.A. or B.S. in Computer Science or similar engineering program with strong academic performance.
Self-starter, analytical, tenacious problem solver.
Strong verbal and written communication skills.
Rigorous attention to detail and focus on quality of deliverables.
Proven team experience and comfort in a team-oriented environment.
Passion for working with technology and excitement for creating high quality consumer technology products.
NET C#, WebApi and SQL Server.
Responsibilities:
The Software Engineering Intern's primary job responsibilities will include:
Analyzing functional requirements and product specifications.
Researching, adopting, and integrating new technologies.
Innovating and improving existing products.
Writing coherent, organized code.
Taking ownership of projects to ensure their successful completion with respect to deadlines.
Collaborating with Quality Assurance in developing a test plan to ensure successful deployment of updates.
Benefits:
Gain well rounded experience: Alarm.com offers a diverse and dynamic environment where you will get the chance to work directly with full-time employees and learn about multiple areas of the business.
Focus on fun: Alarm.com places high value on our team culture."""
# Uses stopwords for English from NLTK, and all punctuation characters by
# default
r = Rake()

# Extraction given the text.
r.extract_keywords_from_text(job_description)

# Extraction given the list of strings where each string is a sentence.
# r.extract_keywords_from_sentences(<list of sentences>)

# To get keyword phrases ranked highest to lowest.
# r.get_ranked_phrases()

# To get keyword phrases ranked highest to lowest with scores.
# This is the cell's last expression, so its (score, phrase) tuples are what
# the notebook displays as the cell output.
r.get_ranked_phrases_with_scores()


[(28.0, 'creating high quality consumer technology products'),
 (16.0, 'net c #, webapi'),
 (16.0, 'com places high value'),
 (15.5, 'software engineering intern ’'),
 (15.5, 'gain well rounded experience'),
 (10.5, 'improving existing products'),
 (9.5, 'similar engineering program'),
 (9.0, 'written communication skills'),
 (9.0, 'tenacious problem solver'),
 (9.0, 'integrating new technologies'),
 (9.0, 'analyzing functional requirements'),
 (8.5, 'strong academic performance'),
 (8.5, 'proven team experience'),
 (8.0, 'primary job responsibilities'),
 (7.5, 'ensure successful deployment'),
 (5.0, 'quality assurance'),
 (5.0, 'com offers'),
 (4.5, 'successful completion'),
 (4.5, 'strong verbal'),
 (4.0, 'writing coherent'),
 (4.0, 'work directly'),
 (4.0, 'time employees'),
 (4.0, 'test plan'),
 (4.0, 'team culture'),
 (4.0, 'taking ownership'),
 (4.0, 'sql server'),
 (4.0, 'rigorous attention'),
 (4.0, 'product specifications'),
 (4.0, 'oriented environment'),
 (4.0, 'organized co

In [25]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string


# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# Delete every ASCII punctuation character first, then split into tokens.
punctuation_table = str.maketrans('', '', string.punctuation)
words = word_tokenize(job_description.translate(punctuation_table))

# Drop stop words (compared case-insensitively) and reduce each remaining
# token to its Porter stem in a single pass.
stemmer = PorterStemmer()
filtered_list = [
    stemmer.stem(token)
    for token in words
    if token.casefold() not in stop_words
]


['qualif',
 'must',
 'current',
 'pursu',
 'ba',
 'bs',
 'comput',
 'scienc',
 'similar',
 'engin',
 'program',
 'strong',
 'academ',
 'perform',
 'selfstart',
 'analyt',
 'tenaci',
 'problem',
 'solver',
 'strong',
 'verbal',
 'written',
 'commun',
 'skill',
 'rigor',
 'attent',
 'detail',
 'focu',
 'qualiti',
 'deliver',
 'proven',
 'team',
 'experi',
 'comfort',
 'teamori',
 'environ',
 'passion',
 'work',
 'technolog',
 'excit',
 'creat',
 'high',
 'qualiti',
 'consum',
 'technolog',
 'product',
 'net',
 'c',
 'webapi',
 'sql',
 'server',
 'respons',
 'softwar',
 'engin',
 'intern',
 '’',
 'primari',
 'job',
 'respons',
 'includ',
 'analyz',
 'function',
 'requir',
 'product',
 'specif',
 'research',
 'adopt',
 'integr',
 'new',
 'technolog',
 'innov',
 'improv',
 'exist',
 'product',
 'write',
 'coher',
 'organ',
 'code',
 'take',
 'ownership',
 'project',
 'ensur',
 'success',
 'complet',
 'respect',
 'deadlin',
 'collabor',
 'qualiti',
 'assur',
 'develop',
 'test',
 'plan',
 'e

### Calculate a Similarity Score

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the applicant and job description texts
applicant_answers = [
    "I am a software engineer with experience in Python and machine learning.",
    "I am a software engineer with experience in web development and C#.",
    "I have worked as a project manager for 5 years and have experience in Agile methodologies.",
    "I am a marketing specialist with expertise in social media and digital marketing."
]


# Earlier multi-job fixture, kept for reference but not used below.
# job_descriptions = [
#     "Software Engineer - Python and Machine Learning",
#     "Project Manager - Agile Methodologies",
#     "Marketing Specialist - Social Media and Digital Marketing",
#     "Data Analyst - SQL and Data Visualization"
# ]
# One-element list wrapping the job_description string from the RAKE cell.
job_description2 = [job_description]

# Vectorize the texts using TF-IDF
# The vectorizer is fitted on the applicant answers only; the job description
# is merely transformed with that fitted vectorizer.
# NOTE(review): this presumably means job-description terms that never occur
# in an applicant answer do not contribute to the score — confirm that this
# is intended rather than fitting on both sets of texts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_applicant_answers = tfidf_vectorizer.fit_transform(applicant_answers)
tfidf_job_descriptions = tfidf_vectorizer.transform(job_description2)

# Compute the cosine similarity between each applicant answer and job description
similarity_scores = cosine_similarity(tfidf_applicant_answers, tfidf_job_descriptions)

# Get the top job matches for each applicant
# argsort along axis 1 orders the job indices by ascending score, [:, ::-1]
# reverses that to descending, and [:, :3] keeps at most the first three.
top_job_matches = similarity_scores.argsort(axis=1)[:,::-1][:,:3]

# Print the results
for i, applicant in enumerate(applicant_answers):
    print(f"\nApplicant {i+1}: {applicant}")
    for j, job_index in enumerate(top_job_matches[i]):
        # NOTE(review): job_description2[job_index] is a string, so the
        # trailing [0] picks its first character — that is why the recorded
        # output prints "Q" as the job. Probably meant to print a job title
        # or the description itself; confirm and fix.
        job = job_description2[job_index][0]
        similarity = similarity_scores[i, job_index]
        print(f"\tTop job match {j+1}: {job} (similarity score: {similarity:.2f})")



Applicant 1: I am a software engineer with experience in Python and machine learning.
	Top job match 1: Q (similarity score: 0.42)

Applicant 2: I am a software engineer with experience in web development and C#.
	Top job match 1: Q (similarity score: 0.46)

Applicant 3: I have worked as a project manager for 5 years and have experience in Agile methodologies.
	Top job match 1: Q (similarity score: 0.26)

Applicant 4: I am a marketing specialist with expertise in social media and digital marketing.
	Top job match 1: Q (similarity score: 0.26)


In [27]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Example skill list for a job posting.
job_required_skills = ["database", "oracle", "python"]

# Example skill lists, one inner list per résumé; the two entries are scored
# separately against job_required_skills further down.
resume_skills =  [
    ["python", "sql", "machine learning"],
    ["java", "scala", "javascript"]
]

def to_string(list):
    """Return the strings in ``list`` joined into one space-separated string.

    The parameter name shadows the builtin ``list``; it is kept unchanged so
    existing callers are unaffected.
    """
    separator = " "
    return separator.join(list)

def get_score(skills, job_description):
    """Return the cosine similarity of two texts as a percentage.

    Both arguments are plain strings. They are turned into word-count
    vectors with a freshly fitted ``CountVectorizer``, and the cosine
    similarity between the two vectors is scaled to 0-100 and rounded to
    two decimal places.
    """
    documents = [skills, job_description]
    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(documents)

    # With a single matrix argument, every row is compared with every row;
    # entry [0][1] is the similarity between `skills` and `job_description`.
    pairwise = cosine_similarity(term_counts)
    match_percentage = pairwise[0][1] * 100
    return round(match_percentage, 2)

# Score the required skills against each résumé's skill list.
print(get_score(to_string(job_required_skills), to_string(resume_skills[0])))
print(get_score(to_string(job_required_skills), to_string(resume_skills[1])))
# Score the stemmed required skills against the stemmed job-description tokens
# (stemmer and filtered_list come from the tokenization cell earlier on).
print(get_score(to_string([stemmer.stem(word) for word in job_required_skills]), to_string(filtered_list)))


28.87
0.0
0.0


## Ranked Choices Code

In [28]:
import numpy as np

# Define the shared answer choices and their rankings for applicants and job postings
# NOTE(review): each list is presumably ordered from most to least preferred — confirm.
# answer_choices is only referenced by commented-out code in this cell.
answer_choices = ["Startup", "Mid-sized", "Big company"]
applicant_rankings1 = ["Startup", "Mid-sized", "Big company"]
applicant_rankings2 = ["Startup", "Big company", "Mid-sized"]
applicant_rankings3 = ["Mid-sized", "Startup", "Big company"]
# Note: applicant_rankings5 is defined before applicant_rankings4.
applicant_rankings5 = ["Big company", "Startup", "Mid-sized"]
applicant_rankings4 = ["Big company", "Mid-sized", "Startup"]

# Ranking for the job posting that the applicants are compared against.
job_rankings = ["Startup", "Mid-sized", "Big company"]
# Alternative job ranking, kept for experimentation.
# job_rankings = ["Mid-sized", "Big company", "Startup"]


# Define a function to calculate Spearman's rank correlation coefficient
def spearman_rank_correlation(rankings1, rankings2):
    """Return a position-weighted Spearman-style agreement score.

    This is not the textbook Spearman coefficient: each squared rank
    difference is divided by its 1-based position in the input lists, so a
    disagreement at an early position costs more than the same disagreement
    at a later one. Identical rankings score exactly 1.0.

    Args:
        rankings1: Sequence of numeric ranks.
        rankings2: Sequence of numeric ranks, the same length as
            ``rankings1``; element ``i`` is compared with ``rankings1[i]``.

    Returns:
        ``1 - 6 * sum(d_i**2 / (i + 1)) / (n * (n**2 - 1))`` as a float,
        where ``d_i`` is the difference at 0-based position ``i`` and ``n``
        is the number of ranked items.

    Raises:
        ValueError: If the two sequences differ in length, or contain fewer
            than two items (the denominator would be zero).
    """
    n = len(rankings1)
    if n != len(rankings2):
        raise ValueError(
            f"rankings must have the same length, got {n} and {len(rankings2)}"
        )
    if n < 2:
        # n * (n**2 - 1) is 0 for n in (0, 1); the score is undefined there.
        raise ValueError("at least two ranked items are required")

    weighted_sq_diffs = sum(
        ((rank1 - rank2) ** 2) / position
        for position, (rank1, rank2) in enumerate(zip(rankings1, rankings2), start=1)
    )
    return 1 - (6 * weighted_sq_diffs) / (n * (n**2 - 1))

# Score every applicant ranking against the job posting's ranking.
# The job's own order is the reference, so its choices simply get ranks
# 0, 1, 2, ... in the order they are listed.
shared_rankings2 = list(range(len(job_rankings)))

all_applicant_rankings = (
    applicant_rankings1,
    applicant_rankings2,
    applicant_rankings3,
    applicant_rankings4,
    applicant_rankings5,
)

for applicant_rankings in all_applicant_rankings:
    # For each job choice, taken in the job's order, look up the position
    # the applicant gave that same choice.
    shared_rankings1 = list(map(applicant_rankings.index, job_rankings))

    correlation = spearman_rank_correlation(shared_rankings1, shared_rankings2)
    print(f"Spearman's rank correlation coefficient: {correlation:.2f}")

Spearman's rank correlation coefficient: 1.00
Spearman's rank correlation coefficient: 0.79
Spearman's rank correlation coefficient: 0.62
Spearman's rank correlation coefficient: -0.33
Spearman's rank correlation coefficient: 0.29
