In [None]:
import nltk
from nltk import sent_tokenize # this helps to split text into Sentences
from nltk import word_tokenize # this helps to split text into individual Words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK data packages this script relies on; these run every time the
# module is imported or the cell is executed.
# 'punkt' is presumably the sentence-tokenizer model behind nltk.sent_tokenize,
# which get_similar_phrases calls.
nltk.download('punkt')
# NOTE(review): 'stopwords' and 'wordnet' match the stopwords and
# WordNetLemmatizer imports above, but neither is used in the code visible in
# this file — confirm whether these downloads are still needed.
nltk.download('stopwords')
nltk.download('wordnet')
import re
import os


def get_similar_phrases(job_desc_file, resume_file, n=5):
    """Rank the sentences of one text by TF-IDF similarity to another text.

    ``job_desc_file`` is split into sentences and a TF-IDF vocabulary is
    fitted on those sentences; ``resume_file`` is then projected into that
    vocabulary as a single document and every sentence is scored by the dot
    product of its vector with the document vector.

    Note that the parameter names describe only one way of calling this
    function: the script section of this file passes the resume (or skills)
    text as the first argument and the job description as the second, so the
    returned sentences come from whichever text is passed first.

    Args:
        job_desc_file: Text whose sentences are ranked and returned.
        resume_file: Reference text the sentences are compared against.
        n: Maximum number of sentences to return.

    Returns:
        A list of ``(sentence, score)`` tuples with ``score > 0``, sorted by
        descending score and truncated to ``n`` entries. The list is empty
        when the first text has no sentences or no token the vectorizer
        keeps.
    """
    sentences = nltk.sent_tokenize(job_desc_file)
    if not sentences:
        # Nothing to rank; fitting a vectorizer on an empty corpus would raise.
        return []

    vectorizer = TfidfVectorizer()
    try:
        sentence_vectors = vectorizer.fit_transform(sentences)
    except ValueError:
        # fit_transform raises ValueError when no sentence contains a token
        # it keeps (empty vocabulary); there is nothing to match in that case.
        return []
    reference_vector = vectorizer.transform([resume_file])

    # One matrix product scores every sentence at once. `@` is always a
    # matrix product, whereas `*` only means that for legacy sparse matrices.
    scores = (sentence_vectors @ reference_vector.T).toarray().ravel()

    scored = [
        (sentence, score)
        for sentence, score in zip(sentences, scores)
        if score > 0
    ]

    # Highest score first; the sort is stable, so ties keep sentence order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:n]


def clean_text_mod(text):
    """Normalize a resume or job-description text into '.'-separated phrases.

    The substitutions run in a fixed order and each one sees the output of
    the previous one, so the order matters:

    1. e-mail addresses and 10-digit phone numbers are removed;
    2. record labels (``Candidate Id:``, ``Title:``, ``Summary:`` ...), lines
       starting with ``Skills`` or ``Job``, ``#`` and ``*`` become newlines;
    3. dates (``d/m/y``-style and ``yyyy-mm-dd``) and the word ``None`` are
       removed;
    4. ``.`` and ``:`` that are not followed by a space or tab are padded;
    5. bullets, pipes and dash separators become phrase breaks;
    6. every newline becomes ``.`` and runs of dots collapse to one dot.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove phone numbers
    text = re.sub(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}', '', text)

    # Remove specific strings and insert newline character
    text = re.sub(r'(Skills.*(?:\n|$)|Job.*(\n)|Other Address:.*(?:\n|$)|Candidate Rank:.*(?:\n|$)|<br><br>.*(?:\n|$)|<br>\.*(?:\n|$)|Candidate Id:.*(?:\n|$)|Job Posting Id:.*(?:\n|$)|Candidate Name:.*(?:\n|$)|Summary:|Tags:|Work History:|#|Title:.*(?:\n|$)|Company:.*(?:\n|$)|Description:|From Date:|To Date:|Education History:*(?:\n|$)|School Name:*(?:\n|$)|School Name:.*(?:\n|$)|Degree:.*(?:\n|$)|\*)', r'\n', text)

    # Remove dates
    text = re.sub(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', '', text)

    # Pad every '.' that is not already followed by a space or tab
    text = re.sub(r'\.(?![^\S\r\n])', ' . ', text)

    # Add a space after every ':' that is not already followed by a space or tab
    text = re.sub(r'\:(?![^\S\r\n])', ': ', text)

    # Decode the HTML-escaped ampersand
    text = re.sub(r'&amp;|&amp', '&', text)

    # Treat '*', '|' and ';.' as phrase separators
    text = re.sub(r'\*|\||;\.', ' . ', text)

    # Remove 'None'
    text = re.sub(r'None', '', text)

    # Remove strings with pattern "four digits - two digits - two digits"
    text = re.sub(r'\d{4}-\d{2}-\d{2}', '', text)

    # Turn bullets into phrase breaks. This runs unconditionally: the previous
    # guard `if '•' or '●' or '*' in text` was always true, and the
    # substitution is a no-op when no bullet is present.
    # The replacement is a plain '.': the previous '\.' was kept verbatim by
    # re.sub and injected a literal backslash into the output.
    text = re.sub(r'•|●|\*', '.\n', text)
    text = re.sub(r'\.{2,}', '.', text)

    # Dash separators start a new phrase
    text = re.sub(r'( - )| -|\^-', '\n', text)

    # Drop the whitespace character directly in front of a '.'
    text = re.sub(r'\s\.', '.', text)

    # Newlines become phrase-ending dots; collapse the dot runs this creates
    text = re.sub(r'\n', '.', text)
    text = re.sub(r'\.{2,}', '.', text)

    return text.strip()


def clean_skills(skill_section):
    """Extract the first ``Skills:`` line of a text, one skill per line.

    The first line containing ``Skills:`` is taken from the label to the end
    of that line. The line no longer has to be terminated by a newline, so a
    skills line that is the last line of the text is picked up as well.

    Args:
        skill_section: Full text to search for a ``Skills:`` line.

    Returns:
        The skills text with every ``Skills:`` label removed, a newline
        inserted after each ``.``, ``;`` and ``,``, and every ``,`` then
        replaced by ``.``. Returns an empty string when no ``Skills:`` line
        exists.
    """
    # '.' does not match a newline, so this stops at the end of the line
    # whether or not a newline follows.
    match = re.search(r'Skills:.*', skill_section)
    if not match:
        return ""

    skills = match.group(0).strip()
    skills = re.sub(r'Skills:', '', skills)

    # Break after each separator first, then turn commas into sentence-ending
    # dots so that each skill reads as its own sentence.
    skills = re.sub(r'(\.|\;|\,)', r'\1\n', skills)
    skills = re.sub(r',', '.', skills)

    return skills


if __name__ == '__main__':
    # Batch driver: for every resume in the input folder, write a file listing
    # the resume sentences that best match the job description.

    # Get path to input and output folders
    input_folder = '/content/66fe463b-bdcf-4ce3-bcf4-f586f40a91da/candidates'
    output_folder = '/content/66fe463b-bdcf-4ce3-bcf4-f586f40a91da/output'
    jd_file = '/content/66fe463b-bdcf-4ce3-bcf4-f586f40a91da/66fe463b-bdcf-4ce3-bcf4-f586f40a91da.txt'

    # List the resumes first so a missing input folder is still the first
    # error reported.
    resume_file_names = os.listdir(input_folder)

    # The job description is the same for every resume: read and clean it once
    # instead of once per resume.
    with open(jd_file, 'r') as f:
        jd = f.read()
    job_desc_file = clean_text_mod(jd)

    # Make sure the output folder exists before the first write.
    os.makedirs(output_folder, exist_ok=True)

    # Loop through each resume file in the input folder
    for resume_file_name in resume_file_names:
        resume_path = os.path.join(input_folder, resume_file_name)

        # Skip sub-directories and other non-file entries; open() would fail.
        if not os.path.isfile(resume_path):
            continue

        # Read the resume file
        with open(resume_path, 'r') as f:
            resume = f.read()

        # Clean the resume: the skills line and the full text separately
        skills_file = clean_skills(resume)
        resume_file = clean_text_mod(resume)

        similar_skills = []

        # Choose what to match based on which parts of the resume are
        # non-empty. The resume-side text is passed first because
        # get_similar_phrases returns sentences of its first argument.
        if skills_file != "" and resume_file != "":
            similar_skills = get_similar_phrases(skills_file, job_desc_file, n=5)
            similar_phrases = get_similar_phrases(resume_file, job_desc_file, n=5)
            similar_output = similar_skills + similar_phrases

        elif skills_file == "" and resume_file != "":
            print(f"There are not sufficient skills included in {resume_file_name}, so the output generated the most matching phrases from the resume")
            similar_phrases = get_similar_phrases(resume_file, job_desc_file, n=8)
            similar_output = similar_phrases

        elif resume_file == "" and skills_file != "":
            print(f"There are not sufficient phrases included in {resume_file_name}, so the output generated the most matching skills from the resume")
            similar_skills = get_similar_phrases(skills_file, job_desc_file, n=8)
            similar_output = similar_skills

        else:
            print(f"The provided candidate record for {resume_file_name} is not sufficiently length to provide a sufficient output.")
            continue

        # Merge skills and phrases into one ranking, best match first
        similar_output = sorted(similar_output, key=lambda x: x[1], reverse=True)

        # Write the output to a file, one "*phrase(score:x.xx)" entry per line
        output_file_name = f"{os.path.splitext(resume_file_name)[0]}_output.txt"
        with open(os.path.join(output_folder, output_file_name), 'w') as f:
            f.write('\n'.join([f'*{phrase.strip()}(score:{score:.2f})' for phrase, score in similar_output]))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


FileNotFoundError: ignored