## Imports 

In [None]:
# importing all relevant libraries, packages, and functionalities

import pandas as pd
import numpy as np
import re
from textacy import preprocessing
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist

In [None]:
# installing NLTK data for tokenization, POS tagging, and lemmatization
# recent NLTK releases (3.9 and later) look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older 'punkt' and 'averaged_perceptron_tagger' packages, so both generations
# are fetched to keep word_tokenize and pos_tag working regardless of the installed version

for nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(nltk_resource)

In [None]:
# reading the company data set from the Excel workbook into a data frame

df = pd.read_excel(io='Artificial Intelligence Companies.xlsx')

## Data Preparation 

In [None]:
# keeping only companies that have a Full Description and renumbering the rows
# (reset_index() keeps the previous row labels in a new 'index' column)

df_clean = df.dropna(subset=['Full Description']).reset_index()

In [None]:
# filtering the categories column of the data frame for specific, relevant categories

categories = ['Health', 'Care', 'Diagnostics', 'Biotechnology', 'Hospital', 'Medical', 'Pharmaceutical', 'Genetics']

# flagging rows whose 'Categories' text contains at least one of the keywords above;
# a missing cell is read as NaN (a float), for which the substring test would raise TypeError,
# so non-string cells are treated as not matching
df_clean['Category incl.'] = df_clean['Categories'].apply(
    lambda row: isinstance(row, str) and any(category in row for category in categories)
)

# keeping the flagged rows only; because an 'index' column already exists, reset_index() stores
# the previous row labels in 'level_0', and both helper columns are dropped afterwards
df_clean = df_clean.loc[df_clean['Category incl.']].reset_index().drop(columns=['index', 'level_0'])

In [None]:
# casting every description to str so that later text processing only sees strings

df_clean['Full Description'] = df_clean['Full Description'].map(str)

In [None]:
# collecting the descriptions of the data frame in a plain Python list called description_list

description_list = list(df_clean['Full Description'])

In [None]:
# defining the benchmark string that the company descriptions are compared against
# (adjacent literals are joined into one string; each piece ends with the separating space)

benchmark = (
    'Our AI-powered solutions address major challenges that are facing the healthcare field. '
    'Right now, the demand for diagnostic services is outpacing the supply of experts in the workforce. '
    'Developing solutions for managing this ever-increasing workload is a crucial task for the healthcare sector. '
    'Moreover, as the workload is growing, diagnostics and treatment are also becoming more complex. '
    'Diagnostic experts and physicians need a new set of tools that can handle large volumes of medical data quickly and accurately, '
    'allowing you to make more objective treatment decisions based on quantitative data and tailored to the needs of the individual patient. '
    'To provide this new toolset, we will need to draw on the power of artificial intelligence (AI).'
)

In [None]:
# placing the benchmark at the end of description_list so it is preprocessed together with the descriptions

description_list.extend([str(benchmark)])

In [None]:
# defining a function called preprocessor, which takes a list of strings as input and returns a list of processed strings

def preprocessor(lst):
    """Return a cleaned copy of every string in *lst*.

    Each string is lower-cased, passed through textacy's ``remove_punctuation``
    and ``normalize_whitespace``, stripped of digit runs, and finally passed
    through gensim's ``remove_stopwords``.

    Parameters
    ----------
    lst : iterable of str
        The raw strings. Any iterable is accepted, not only sequences that
        support ``len()`` and indexing.

    Returns
    -------
    list of str
        The processed strings, in input order.
    """
    cleaned_strings = []
    for text in lst:
        # same order of operations as before: lower -> punctuation -> whitespace -> digits -> stopwords
        text = preprocessing.remove_punctuation(text.lower())
        text = preprocessing.normalize_whitespace(text)
        text = re.sub(r'\d+', '', text)
        cleaned_strings.append(remove_stopwords(text))
    return cleaned_strings

In [None]:
# running every description (and the appended benchmark) through preprocessor:
# lower case, punctuation removed, whitespace normalized, numbers removed, stopwords removed

processed_strings_no_stopwords = preprocessor(lst=description_list)

In [None]:
# wrapping the preprocessed strings in a one-column data frame ('strings') to prepare for lemmatization

processed_strings_df = pd.DataFrame({'strings': processed_strings_no_stopwords})

In [None]:
# setting up the lemmatizer objects for the POS-aware lemmatization helpers defined below

# instance built from the directly imported class; no other code in this notebook section refers to this name
wordnet_lemmatizer = WordNetLemmatizer()
# instance used by lemmatize_sentence
lemmatizer = nltk.stem.WordNetLemmatizer()

# translates an nltk POS tag into the corresponding wordnet POS constant
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Return the wordnet POS constant matching *nltk_tag*, or None.

    Tags starting with 'J' map to ``wordnet.ADJ``, 'V' to ``wordnet.VERB``,
    'N' to ``wordnet.NOUN`` and 'R' to ``wordnet.ADV``; every other tag
    yields None.
    """
    # tags outside the four handled families have no wordnet counterpart
    if not nltk_tag.startswith(('J', 'V', 'N', 'R')):
        return None

    wordnet_pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return wordnet_pos_by_initial[nltk_tag[0]]

# lemmatizes a sentence string word by word and returns the re-joined sentence string
def lemmatize_sentence(sentence):
    """Return *sentence* with each token lemmatized according to its POS tag.

    The sentence is tokenized and POS-tagged with nltk. Tokens whose tag has
    a wordnet counterpart are lemmatized with that POS; all other tokens are
    kept unchanged. The resulting tokens are joined with single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    lemmas = []
    for token, pos_tag in tagged_tokens:
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        if wordnet_pos is not None:
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            # no usable POS information: keep the token as it is
            lemmas.append(token)
    return " ".join(lemmas)

In [None]:
# lemmatizing every preprocessed string and collecting the results in a list again

processed_strings_df['lemmatized strings'] = processed_strings_df['strings'].map(lemmatize_sentence)
preprocessed_descriptions_final = list(processed_strings_df['lemmatized strings'])

## BERT Implementation

In [None]:
# loading the pretrained BERT sentence transformer and keeping it in model - executing will take a few minutes

model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')

In [None]:
# defining corpus as all preprocessed descriptions except the benchmark string, which sits in the last position

benchmark_position = len(preprocessed_descriptions_final) - 1
corpus = preprocessed_descriptions_final[:benchmark_position]

In [None]:
# encoding the corpus with the model and saving the resulting embeddings to sentence_embeddings

sentence_embeddings = model.encode(sentences=corpus)

In [None]:
# setting the number of top matches to 20, wrapping the preprocessed benchmark string (last list entry)
# in a one-element list called queries, and encoding queries with the model

number_top_matches = 20
queries = [preprocessed_descriptions_final[-1]]
query_embeddings = model.encode(sentences=queries)

In [None]:
# printing the most similar corpus entries for each query based on BERT's sentence transformer model

for query, query_embedding in zip(queries, query_embeddings):
    # cosine distance between the query embedding and every corpus embedding
    distances = cdist([query_embedding], sentence_embeddings, "cosine")[0]

    # pairing each distance with its corpus position and ordering from smallest to largest distance
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    # the headline is derived from number_top_matches so it cannot drift from the slice below
    print("\nTop %d most similar sentences in corpus:" % number_top_matches)

    for idx, distance in results[:number_top_matches]:
        # the reported score is 1 - cosine distance
        print(corpus[idx].strip(), "(Cosine Score: %.4f)" % (1 - distance))
        print("\n\n======================\n\n")