## Imports

In [18]:
# importing all relevant libraries, packages, and functionalities

import pandas as pd
import re
from textacy import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
# installing NLTK data for tokenization, POS tagging, and lemmatization

# 'punkt' and 'averaged_perceptron_tagger' are the resource names older NLTK
# releases look up; newer releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead. Fetching both generations keeps
# nltk.word_tokenize / nltk.pos_tag working on a fresh kernel either way.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(nltk_resource)

In [2]:
# loading the raw company data set from the Excel workbook
# (the path is relative, so it is resolved against the current working directory)

df = pd.read_excel(io='Artificial Intelligence Companies.xlsx')

## Data Preparation

In [4]:
# keeping only companies that have a Full Description, then renumbering the rows
# note: reset_index() without drop=True keeps the old row labels as an 'index'
# column; the category-filter cell removes that column again

df_clean = df.dropna(subset=['Full Description']).reset_index()

In [9]:
# filtering the categories column of the data frame for specific, relevant categories

categories = ['Health', 'Care', 'Diagnostics', 'Biotechnology', 'Hospital', 'Medical', 'Pharmaceutical', 'Genetics']

# a row matches when any keyword occurs as a (case-sensitive) substring of its
# Categories text; non-string cells (e.g. NaN for a missing value) count as
# "no match" instead of raising TypeError inside the `in` test
df_clean['Category incl.'] = df_clean['Categories'].apply(
    lambda row: isinstance(row, str) and any(category in row for category in categories)
)

# reset_index(drop=True) renumbers the rows without adding a helper column, and
# errors='ignore' removes leftover 'index' / 'level_0' columns only when they
# exist, so this cell can be re-run without a KeyError
df_clean = (
    df_clean.loc[df_clean['Category incl.']]
    .reset_index(drop=True)
    .drop(columns=['index', 'level_0'], errors='ignore')
)

In [7]:
# making sure every description is a Python str before any text processing

df_clean['Full Description'] = df_clean['Full Description'].map(str)

In [10]:
# pulling the descriptions out of the data frame as a plain list (same row order as df_clean)

description_list = df_clean['Full Description'].to_list()

In [11]:
# defining the benchmark text that every company description is compared against
# (written as adjacent string literals, one per sentence; Python joins them into one string)

benchmark = (
    'Our AI-powered solutions address major challenges that are facing the healthcare field. '
    'Right now, the demand for diagnostic services is outpacing the supply of experts in the workforce. '
    'Developing solutions for managing this ever-increasing workload is a crucial task for the healthcare sector. '
    'Moreover, as the workload is growing, diagnostics and treatment are also becoming more complex. '
    'Diagnostic experts and physicians need a new set of tools that can handle large volumes of medical data quickly and accurately, allowing you to make more objective treatment decisions based on quantitative data and tailored to the needs of the individual patient. '
    'To provide this new toolset, we will need to draw on the power of artificial intelligence (AI).'
)

In [12]:
# adding the benchmark as the LAST entry of description_list
# (the similarity cells further down read the benchmark from the last row)

description_list += [str(benchmark)]

In [21]:
# defining a function called preprocessor, which takes an iterable of strings as input and returns a list of processed strings

def preprocessor(lst):
    """Clean raw text strings ahead of lemmatization and TF-IDF.

    Each string goes through these steps, in order:
      1. lower-casing,
      2. textacy ``remove_punctuation``,
      3. textacy ``normalize_whitespace``,
      4. deletion of every run of digits,
      5. gensim ``remove_stopwords``.

    Parameters
    ----------
    lst : iterable of str
        The texts to clean. Any iterable works (list, tuple, generator,
        pandas Series); elements are consumed in iteration order.

    Returns
    -------
    list of str
        One cleaned string per input element, in the same order.
    """
    cleaned = []
    for text in lst:
        text = preprocessing.remove_punctuation(text.lower())
        text = preprocessing.normalize_whitespace(text)
        # digits are removed after whitespace normalization, exactly as before
        text = re.sub(r'\d+', '', text)
        cleaned.append(remove_stopwords(text))
    return cleaned

In [None]:
# cleaning every description (and the benchmark): lower case, punctuation and digits removed,
# whitespace normalized, stopwords removed

processed_strings_no_stopwords = preprocessor(lst=description_list)

In [25]:
# wrapping the cleaned strings in a one-column data frame ('strings') so that
# lemmatization can be applied column-wise

processed_strings_df = pd.DataFrame(
    data=processed_strings_no_stopwords,
    columns=['strings'],
)

In [26]:
# setting up lemmatization with appropriate POS tagging

# WordNet lemmatizer instance used by lemmatize_sentence
lemmatizer = WordNetLemmatizer()
# second, independent instance; not referenced by the functions in this cell
wordnet_lemmatizer = WordNetLemmatizer()

# translates an NLTK POS tag into the matching WordNet POS constant
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Return the WordNet POS constant for an NLTK tag, or None if there is none.

    Only the first letter of the tag is looked at:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Any other tag (including an empty string) yields None.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None

# lemmatizes one sentence string, word by word, using each word's POS tag
def lemmatize_sentence(sentence):
    """Return `sentence` with every taggable word replaced by its lemma.

    The sentence is tokenized with nltk.word_tokenize and tagged with
    nltk.pos_tag. Tokens whose tag maps to a WordNet POS (via
    nltk_tag_to_wordnet_tag) are lemmatized with that POS; all other tokens are
    kept unchanged. The resulting tokens are joined with single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, nltk_tag in tagged_tokens:
        wordnet_pos = nltk_tag_to_wordnet_tag(nltk_tag)
        if wordnet_pos is not None:
            token = lemmatizer.lemmatize(token, wordnet_pos)
        lemmas.append(token)
    return " ".join(lemmas)

In [33]:
# lemmatizing every cleaned string and collecting the results as a list again
# (row order is unchanged, so the benchmark is still the last element)

processed_strings_df['lemmatized strings'] = processed_strings_df['strings'].apply(lemmatize_sentence)
preprocessed_descriptions_final = processed_strings_df['lemmatized strings'].to_list()

## TF-IDF Implementation

In [37]:
# building the TF-IDF matrix: one row per preprocessed description, benchmark last
# (the vectorizer is created with its default settings)

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(raw_documents=preprocessed_descriptions_final)

In [39]:
# calculating cosine similarity and converting the result to a np.array

# TfidfVectorizer L2-normalizes each row by default, so the product of the
# matrix with its transpose holds the pairwise cosine similarities.
# `@` is always a matrix product; `*` is a matrix product only for the legacy
# scipy sparse *matrix* classes and element-wise for sparse *arrays*.
pairwise_similarity = tfidf @ tfidf.T
# dense (n_documents x n_documents) array; only its last row is used afterwards
similarity_matrix = pairwise_similarity.toarray()

In [42]:
# isolating the similarities with the benchmark string and converting these values into a data frame

# the benchmark was appended last, so the last row of the matrix holds its
# similarity to every document. The final entry of that row is the benchmark
# compared with itself and belongs to no company, so it is sliced off.
# Reusing df_clean's index ties value i to the i-th company row explicitly;
# a length mismatch now raises instead of silently misaligning.
similarity_df = pd.DataFrame(
    similarity_matrix[-1, :-1],
    index=df_clean.index,
    columns=['Similarity'],
)

In [43]:
# attaching the benchmark similarity to each company row (values are matched on the row index)

df_clean = df_clean.assign(Similarity=similarity_df['Similarity'])

In [44]:
# ordering companies from most to least similar to the benchmark

df_clean = df_clean.sort_values(by='Similarity', ascending=False)

In [45]:
# exporting the top 20 results to Excel

# df_clean is already sorted by descending similarity, so head(20) is the top 20
top_matches = df_clean.head(20)
# index=False (the documented bool form) leaves the row labels out of the sheet;
# the previous index=None had the same effect only because None is falsy
top_matches.to_excel("TFIDF.xlsx", index=False)