In [None]:
import nltk
from nltk import sent_tokenize # this helps to split text into Sentences
from nltk import word_tokenize # this helps to split text into individual Words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Download the NLTK data packages used by this notebook: the 'punkt' tokenizer
# models plus the 'stopwords' and 'wordnet' corpora (presumably for the
# stopwords / WordNetLemmatizer imports above).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when tokenizing - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import re

#def remove_spaces(text):
  #text = re.sub(r'\s+', ' ', text)
  #return text.strip()

def clean_text(text):
    """Strip contact details, export field labels, dates and placeholders.

    Used on the raw resume and job-description text before similarity
    scoring. Only the patterns listed below are removed; case, punctuation,
    interior whitespace and line breaks are otherwise left as they are.

    Args:
        text: Raw document text (str).

    Returns:
        The text with the patterns removed and leading/trailing whitespace
        stripped.
    """
    # Email addresses: any run of non-space characters around an '@'.
    text = re.sub(r'\S+@\S+', '', text)

    # Phone numbers: 3-3-4 digits with an optional '-', '.' or whitespace
    # between the groups.
    text = re.sub(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}', '', text)

    # Field labels and markup from the export format. Alternatives are tried
    # in the order listed. Labels followed by '.*' drop the rest of the line;
    # those followed by '(?:\n|$)' also consume the line break.
    boilerplate = '|'.join((
        r'Candidate Id:.*',
        r'Job Posting Id:.*',
        r'Candidate Name:.*(?:\n|$)',
        # Consume the optional ';' too, so '&amp;' does not leave a stray
        # semicolon behind.
        r'&amp;?',
        r'Summary:',
        r'Tags:',
        r'Skills:.*',
        r'Work History:',
        r'#',
        r'Title:.*(?:\n|$)',
        r'Company:.*(?:\n|$)',
        r'Description:',
        r'From Date:',
        r'To Date:',
        # NOTE(review): ':*' means "zero or more colons", so these three
        # labels are only removed when nothing else follows them on the
        # line. Possibly '.*' was intended - confirm before changing.
        r'Education History:*(?:\n|$)',
        r'School Name:*(?:\n|$)',
        r'Degree:*(?:\n|$)',
        r'\*',
    ))
    text = re.sub(boilerplate, '', text)

    # Dates written as d/m/y or d-m-y with 1-2 digit day/month.
    text = re.sub(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', '', text)

    # Placeholder 'None' values. Word boundaries keep words that merely
    # contain the letters (e.g. "Nonetheless") intact.
    text = re.sub(r'\bNone\b', '', text)

    # ISO-style dates: four digits - two digits - two digits.
    text = re.sub(r'\d{4}-\d{2}-\d{2}', '', text)

    return text.strip()

# Read the raw resume export.
with open('FC_Mike.txt', 'r') as resume_handle:
    resume = resume_handle.read()

# Read the raw job-description export.
with open('FC.txt', 'r') as jd_handle:
    jd = jd_handle.read()

# Run both documents through the same cleaning rules, then preview the
# cleaned resume.
cleaned_resume, cleaned_JD = (clean_text(raw) for raw in (resume, jd))
print(cleaned_resume)

In [None]:
import re

def clean_skills(skill_section):
    """Extract the first "Skills:" line and put each skill on its own line.

    Args:
        skill_section: Document text (str) that may contain a line starting
            with or containing "Skills:".

    Returns:
        The rest of that line with the "Skills:" label removed, a newline
        inserted after every '.', ';' or ',' and every ',' turned into '.'.
        Returns "" when the text has no "Skills:" line.
    """
    # '.' never matches a newline, so this captures the Skills line whether
    # or not it is newline-terminated. Requiring a trailing '\n' here would
    # silently drop a Skills line that is the last line of the document.
    match = re.search(r'Skills:.*', skill_section)
    if match is None:
        return ""

    skills = match.group(0).strip().replace('Skills:', '')

    # Break after each delimiter so every skill sits on its own line...
    skills = re.sub(r'([.;,])', r'\1\n', skills)
    # ...and end comma-separated entries with a period instead.
    return skills.replace(',', '.')

# Pull the "Skills:" line out of the raw (uncleaned) resume text and preview
# the one-skill-per-line result.
cleaned_skills = clean_skills(resume)

print(cleaned_skills)


In [46]:
def get_similar_phrases(job_desc_file, resume_file, n=5):
    """Rank the sentences of one text by TF-IDF similarity to another text.

    Despite the parameter names, both arguments are text strings rather than
    files, and the function does not care which document is which: sentences
    are taken from the first argument and each one is scored against the
    second argument as a whole.

    Args:
        job_desc_file: Text that is split into sentences; these sentences are
            what gets ranked and returned.
        resume_file: Reference text every sentence is compared with.
        n: Maximum number of results to return.

    Returns:
        A list of at most ``n`` ``(sentence, score)`` tuples with
        ``score > 0``, sorted by descending score. Returns an empty list when
        the first text is empty/blank or yields no usable vocabulary.
    """
    # Nothing to rank: return before touching the tokenizer/vectorizer, which
    # raises ValueError when fitted on zero documents.
    if not job_desc_file or not job_desc_file.strip():
        return []

    sentences = nltk.sent_tokenize(job_desc_file)
    if not sentences:
        return []

    # Fit the vocabulary on the sentences only, then project the reference
    # text into that same space.
    vectorizer = TfidfVectorizer()
    try:
        tfidf_sentences = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised for an empty vocabulary, e.g. when every sentence consists
        # of single characters or punctuation only.
        return []
    tfidf_reference = vectorizer.transform([resume_file])

    # One sparse matrix product scores every sentence at once. '@' is always
    # matrix multiplication, whereas '*' is element-wise for sparse arrays.
    # TfidfVectorizer L2-normalises rows by default, so each dot product is a
    # cosine similarity.
    scores = (tfidf_sentences @ tfidf_reference.T).toarray().ravel()

    # Keep only sentences sharing at least one weighted term with the
    # reference text.
    scored = [
        (sentence, score)
        for sentence, score in zip(sentences, scores)
        if score > 0
    ]

    # Highest similarity first; the sort is stable, so ties keep document order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return scored[:n]


if __name__ == '__main__':
    # Load the raw resume and job description from disk.
    with open('FC_Mike.txt', 'r') as resume_handle:
        resume = resume_handle.read()
    with open('FC.txt', 'r') as jd_handle:
        jd = jd_handle.read()

    # Build the three texts to compare: the resume's skills line, the cleaned
    # resume body and the cleaned job description.
    skills_file = clean_skills(resume)
    resume_file = clean_text(resume)
    job_desc_file = clean_text(jd)

    # Rank skill entries and resume sentences against the job description.
    similar_skills = get_similar_phrases(skills_file, job_desc_file, n=5)
    similar_phrases = get_similar_phrases(resume_file, job_desc_file, n=5)

    # Merge both rankings and order the combined list by score, best first.
    similar_output = similar_skills + similar_phrases
    similar_output.sort(key=lambda pair: pair[1], reverse=True)

    print('Top similar phrases in the resume:')
    for phrase, score in similar_output:
        print(f'* {phrase} (score: {score:.2f})')


Top similar phrases in the resume:
* Financial Reporting. (score: 0.59)
* Accounting. (score: 0.55)
* Financial Statements. (score: 0.47)
* Responsible for oversight of all accounting functions including accounts receivable, accounts payable, insurance and risk management, and audit. (score: 0.28)
* Responsible for accurate reporting of financial statements. (score: 0.25)
* Achieved savings of $185,000 by working with various members of management. (score: 0.22)
* Managed the implementation of a purchase card system that reduced the number of invoices processed by AP by over 10%. (score: 0.21)
* JOB Cost. (score: 0.21)
* Project team leader for Company wide cost savings model including oversight of software implementation, financial reporting, treasury functions and hourly wage analysis. (score: 0.21)
* Construction Management. (score: 0.20)
