In [79]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import string
import argparse

In [80]:
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words and token not in string.punctuation]
    return ' '.join(tokens)

def preprocess_bigrams(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words and token not in string.punctuation]
    bigrams = [f"{tokens[i]} {tokens[i + 1]}" for i in range(len(tokens) - 1)]
    return ' '.join(bigrams)

def tfidf_search(keyword, descriptions):
    vectorizer = TfidfVectorizer(preprocessor=preprocess_bigrams)
    tfidf_matrix = vectorizer.fit_transform(descriptions)
    keyword_vector = vectorizer.transform([preprocess_bigrams(keyword)])

    scores = np.dot(tfidf_matrix, keyword_vector.T).toarray().flatten()
    sorted_indices = np.argsort(scores)[::-1]

    results = []
    for idx in sorted_indices[:5]:
        if scores[idx] > 0:
            results.append((scores[idx], descriptions.iloc[idx].strip()))

    return results

def bm25_search(keyword, descriptions):
    tokenized_descriptions = [word_tokenize(desc.lower()) for desc in descriptions]
    bm25 = BM25Okapi(tokenized_descriptions)
    scores = bm25.get_scores(word_tokenize(keyword.lower()))

    sorted_indices = np.argsort(scores)[::-1]

    results = []
    for idx in sorted_indices[:5]:
        if scores[idx] > 0:
            results.append((scores[idx], descriptions.iloc[idx].strip()))

    return results

def get_and_clean_data(file_path):
    data = pd.read_csv(file_path)
    description = data['job_description']
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' ' * len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

   

def search_and_display_results(keyword, data_file_path):
    cleaned_description = get_and_clean_data(data_file_path)
    
    # TF-IDF Search
    tfidf_results = tfidf_search(keyword, cleaned_description)
    print("\nTop 5 matching jobs (TF-IDF):", keyword)
    for score, result in tfidf_results:
        print(f"Score: {score:.4f}, {result}")

    # BM25 Search
    bm25_results = bm25_search(keyword, cleaned_description)
    print("\nTop 5 matching jobs (BM25):", keyword)
    for score, result in bm25_results:
        print(f"Score: {score:.4f}, {result}")

`BM25` is a ranking model used by search engines to provide accurate and relevant search results. 

It scores documents based on their `term frequencies and document lengths`, while also considering the importance of a `term in the entire corpus and the impact of document length on relevance scoring`.

`BM25` includes a term saturation function to mitigate the impact of excessively high term frequencies. This makes it particularly effective in certain contexts, such as when dealing with large amounts of data or complex queries.

In [81]:
keyword = input("Enter the keyword to search: ")
data_file_path = "../Resource/software_developer_united_states_1971_20191023_1.csv"
search_and_display_results(keyword, data_file_path)


Top 5 matching jobs (TF-IDF): python developer
Score: 0.3125, software development experience in any of the programming languages java or net or c or python etc  python scripting experience   languages  proficiency with python scripting language  experience with and use of linuxunix scripting languages shell script korn bash etc  development environments   system integration  working knowledge and development experience in a linux environment redhat or centos
Score: 0.2531, reference    1900306  title   python developer software developer    location   burbank ca    experience level     start date   01032019      description      basic qualifications  qualifications  minimum 4 years of experience in python  proficient in django python and nodejs  postgresql  consuming restful apis  celery or other task queues  gitlab test runner or related testing automation   preferred qualifications  preferred qualifications  prior experience as technical project manager is a plus  data sciencemachi