# Install BM25

# Load necessary packages

In [4]:
import pandas as pd
from rank_bm25 import BM25Okapi
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import nltk


In [5]:
! pip freeze > requirements.txt

# Load the data into a DataFrame. Note that each observation in the text column is an individual CEO letter

In [6]:
# JSON Lines input: with lines=True each line of the file is parsed as one record.
df = pd.read_json(path_or_buf="ceo-letters.jsonl", lines=True)

# Downloading NLTK packages for data preparation

In [7]:
# Tokenizer models, the stop-word list and WordNet data used by the cells below.
# Newer NLTK releases (3.9 and later) load the Punkt tokenizer from 'punkt_tab'
# instead of the pickled 'punkt' resource, so both are requested; a resource that
# is already present is simply reported as up-to-date.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/wesleybarnes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wesleybarnes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/wesleybarnes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Test Cases

In [8]:
# Test queries shared by both models: one full question plus shorter name/keyword searches.
queries = [
    "What is UnitedHealth Group expected revenue?",
    "Wells Fargo CEO",
    "pandemic",
    "Tim Cook",
    "strategic planning initiative",
]

## Model 1

In [9]:
# Data Prep
stop_words = set(stopwords.words('english'))

texts = df['text'].tolist()

# Model 1 preprocessing: lowercase, tokenize, drop English stop words (no lemmatization).
corpus = [[word for word in word_tokenize(text.lower()) if word not in stop_words] for text in texts]

# BM25 Function
bm25 = BM25Okapi(corpus)

# The Model 2 cell rebinds the notebook-level names `bm25`, `texts` and `stop_words`.
# Keep Model-1-specific handles so that bm25_query1 always scores against the index
# built above, even if this model's results are re-run after the Model 2 cell.
model1_stop_words, model1_texts, model1_bm25 = stop_words, texts, bm25

def bm25_query1(query, top_n=5):
    """Rank the letters against `query` with Model 1's BM25 index.

    The query receives the same preprocessing as the Model 1 corpus
    (lowercasing, word tokenization, stop-word removal).

    Parameters:
        query: search string.
        top_n: maximum number of results to return.

    Returns:
        A list of (text, score) tuples for the `top_n` highest-scoring
        entries of the text column, ordered from highest to lowest score.
    """
    tokenized_query = [word for word in word_tokenize(query.lower()) if word not in model1_stop_words]
    scores = model1_bm25.get_scores(tokenized_query)
    # Indexes of all documents, best score first, cut down to the requested count.
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [(model1_texts[i], scores[i]) for i in top_indexes]


### Model 1 Results

In [10]:
# Model 1: show the top-ranked texts and their BM25 scores for every test query.
for query in queries:
    print(f"\n Query: {query} \n")
    for text, score in bm25_query1(query):
        print(f"Score: {score:.2f}, Text: {text}")


 Query: What is UnitedHealth Group expected revenue? 

Score: 16.76, Text: As a shareholder of UnitedHealth Group, you play an important role in our company by considering and taking action on the matters set forth in the attached proxy statement. We appreciate the time and attention you invest in making thoughtful decisions.
Score: 10.34, Text: We have deepened our pharmacy penetration into the Health Care Benefits segment through increased cross-selling of medical and pharmacy plans. This is expected to result in approximately $350 million in incremental revenue in 2021.
Score: 9.41, Text: Why?
Score: 9.35, Text: While the passage is not intended as a metaphor, it’s nevertheless a fantastic one, and very relevant to Amazon. I would argue that it’s relevant to all companies and all institutions and to each of our individual lives too. In what ways does the world pull at you in an attempt to make you normal? How much work does it take to maintain your distinctiveness? To keep alive th

## Model 2

In [11]:
# Preprocessing resources for Model 2: NLTK's English stop-word list and a WordNet lemmatizer
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    """Turn `text` into a list of lemmatized tokens.

    The text is lowercased and word-tokenized; tokens that are not purely
    alphabetic or that appear in `stop_words` are discarded, and each
    remaining token is passed through `lemmatizer.lemmatize`.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip numbers, punctuation, mixed tokens and stop words.
        if not token.isalpha() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Build the Model 2 index: one lemmatized token list per entry of the text column.
texts = df['text'].tolist()
corpus = list(map(tokenize_and_lemmatize, texts))

bm25 = BM25Okapi(corpus)

def truncate_to_full_sentences(text, max_char_length):
    """Shorten `text` to at most `max_char_length` characters, preferring sentence boundaries.

    Sentences (as split by `sent_tokenize`) are kept in order, joined by single
    spaces, for as long as the result still fits within the limit.

    If even the first sentence is longer than the limit, that sentence is cut at
    the last space that fits (or hard-cut when it contains no usable space), so a
    non-empty text does not collapse to an empty snippet.

    Parameters:
        text: the passage to shorten.
        max_char_length: maximum number of characters in the returned string.

    Returns:
        The shortened string; "" when `text` has no sentences or the limit is
        not positive.
    """
    if max_char_length <= 0:
        return ""

    sentences = sent_tokenize(text)
    kept = []
    used = 0  # length of " ".join(kept) so far
    for sentence in sentences:
        # Every sentence after the first costs one extra character for the joining space.
        needed = len(sentence) + (1 if kept else 0)
        if used + needed > max_char_length:
            break
        kept.append(sentence)
        used += needed

    if kept:
        return " ".join(kept).strip()
    if not sentences:
        return ""

    # The first sentence alone is over the limit: cut it at a word boundary instead
    # of returning nothing. Looking one character past the limit lets a space sitting
    # exactly at the limit count as a valid cut point.
    first = sentences[0]
    window = first[:max_char_length + 1]
    cut = window.rfind(" ")
    if cut > 0:
        return window[:cut].rstrip()
    return first[:max_char_length]

def bm25_query2(query, top_n=5, max_char_length=200):
    """Rank the texts against `query` with the lemmatized BM25 index.

    The query is preprocessed with `tokenize_and_lemmatize`, every text is
    scored, and the `top_n` best matches are returned from highest to lowest
    score.

    Returns:
        A list of (snippet, score) tuples, where each snippet is the matching
        text passed through `truncate_to_full_sentences` with `max_char_length`.
    """
    scores = bm25.get_scores(tokenize_and_lemmatize(query))
    # Document positions ordered by descending score.
    ranking = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    hits = []
    for pos in ranking[:top_n]:
        snippet = truncate_to_full_sentences(texts[pos], max_char_length)
        hits.append((snippet, scores[pos]))
    return hits

### Results -- 200 Character Limit

In [16]:
# Model 2 with the default 200-character snippet limit: top hits per test query.
for query in queries:
    print(f"\n Query: {query} \n")
    for text, score in bm25_query2(query):
        print(f"Score: {score:.2f}, Text: {text}")


 Query: What is UnitedHealth Group expected revenue? 

Score: 15.60, Text: As a shareholder of UnitedHealth Group, you play an important role in our company by considering and taking action on the matters set forth in the attached proxy statement.
Score: 10.20, Text: We have deepened our pharmacy penetration into the Health Care Benefits segment through increased cross-selling of medical and pharmacy plans.
Score: 9.26, Text: We expect 2022 to be another record year for Polaris, with strong revenues and net income growth.
Score: 6.47, Text: Grew both revenue and adjusted revenue by 9% to $174 billion.
Score: 6.36, Text: President and CEO, Alaska Air Group

 Query: Wells Fargo CEO 

Score: 12.08, Text: We have a new management team running Wells Fargo. Since I joined Wells Fargo in 2019, 11 of 16 Operating Committee members are new to the company.
Score: 11.67, Text: But we won’t win by running the company as we have for the past few decades. If we don’t change how we approach our cust

### Results -- 400 Character Limit

In [13]:
# Model 2 again, this time allowing snippets of up to 400 characters.
for query in queries:
    print(f"\n Query: {query} \n")
    for text, score in bm25_query2(query, max_char_length=400):
        print(f"Score: {score:.2f}, Text: {text}")


 Query: What is UnitedHealth Group expected revenue? 

Score: 15.60, Text: As a shareholder of UnitedHealth Group, you play an important role in our company by considering and taking action on the matters set forth in the attached proxy statement. We appreciate the time and attention you invest in making thoughtful decisions.
Score: 10.20, Text: We have deepened our pharmacy penetration into the Health Care Benefits segment through increased cross-selling of medical and pharmacy plans. This is expected to result in approximately $350 million in incremental revenue in 2021.
Score: 9.26, Text: We expect 2022 to be another record year for Polaris, with strong revenues and net income growth. Challenges within the supply chain are expected to persist and the Polaris team stands ready to navigate through these challenges just as we did last year.
Score: 6.47, Text: Grew both revenue and adjusted revenue by 9% to $174 billion.
Score: 6.36, Text: President and CEO, Alaska Air Group

 Query: W

Sources used to create this notebook: 

[Source 1](https://www.analyticsvidhya.com/blog/2021/05/build-your-own-nlp-based-search-engine-using-bm25/)

[Source 2](https://pypi.org/project/rank-bm25/)