# Topic Modeling

In [None]:
!pip install wikipedia -U -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone


In [None]:
import numpy as np
import pandas as pd
import random
import plotly.express as px
import wikipedia
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
wikipedia.search('Data science in healthcare',results = 10)

['Artificial intelligence in healthcare',
 'Health informatics',
 'Healthcare in the United States',
 'HealthCare.gov',
 'Information science',
 'Outline of health sciences',
 'HCA Healthcare',
 'Big data',
 'Healthcare in India',
 'Fast Healthcare Interoperability Resources']

In [None]:
wikipedia.page('Artificial intelligence in healthcare').content

'Artificial intelligence in healthcare is an overarching term used to describe the use of machine-learning algorithms and software, or artificial intelligence (AI), to mimic human cognition in the analysis, presentation, and comprehension of complex medical and health care data. Specifically, AI is the ability of computer algorithms to approximate conclusions based solely on input data.\nThe primary aim of health-related AI applications is to analyze relationships between clinical techniques and patient outcomes. AI programs are applied to practices such as diagnostics, treatment protocol development, drug development, personalized medicine, and patient monitoring and care. What differentiates AI technology from traditional technologies in healthcare is the ability to gather data, process it, and produce a well-defined output to the end-user. AI does this through machine learning algorithms and deep learning. These processes can recognize patterns in behavior and create their own logic

In [None]:
article_list = wikipedia.search('Data science in healthcare',results = 10)

In [None]:
# Download the plain-text body of every search hit.
# The titles come straight from wikipedia.search(), so auto_suggest is turned
# off: with the default auto_suggest=True the library queries the suggestion
# API first and may resolve an exact title to a different page.
article_text = [
    wikipedia.page(title, auto_suggest=False).content
    for title in article_list
]

In [None]:
df = pd.DataFrame(article_text,columns = ['article'])

In [None]:
df['article'][0]

'Artificial intelligence in healthcare is an overarching term used to describe the use of machine-learning algorithms and software, or artificial intelligence (AI), to mimic human cognition in the analysis, presentation, and comprehension of complex medical and health care data. Specifically, AI is the ability of computer algorithms to approximate conclusions based solely on input data.\nThe primary aim of health-related AI applications is to analyze relationships between clinical techniques and patient outcomes. AI programs are applied to practices such as diagnostics, treatment protocol development, drug development, personalized medicine, and patient monitoring and care. What differentiates AI technology from traditional technologies in healthcare is the ability to gather data, process it, and produce a well-defined output to the end-user. AI does this through machine learning algorithms and deep learning. These processes can recognize patterns in behavior and create their own logic

In [None]:
count_vec = CountVectorizer()

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
# Fetch the NLTK resources used by the steps below.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

# English stop words, built once and reused by the stop-word removal steps.
stop_words = set(stopwords.words("english"))

# Only a short preview is echoed after each step; printing whole articles
# floods the notebook output.
PREVIEW_CHARS = 500

# Lowercasing
text = df.loc[0, 'article']
text = text.lower()
print(text[:PREVIEW_CHARS])

# Removing punctuation
text = text.translate(str.maketrans("", "", string.punctuation))
print(text[:PREVIEW_CHARS])

# Removing stop words
filtered_text = " ".join([word for word in text.split() if word not in stop_words])
print(filtered_text[:PREVIEW_CHARS])

# Stemming and its types:
#   Porter Stemmer (used here)
#   Lancaster Stemmer
#   Snowball Stemmer
stemmer = PorterStemmer()
stemmed_text = " ".join([stemmer.stem(word) for word in filtered_text.split()])
print(stemmed_text[:PREVIEW_CHARS])

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_text = " ".join([lemmatizer.lemmatize(word) for word in filtered_text.split()])
print(lemmatized_text[:PREVIEW_CHARS])

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
class LowercaseTransformer():
    """Pipeline step that lower-cases its input by calling ``X.lower()``."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; the instance itself is returned."""
        return self

    def transform(self, X, **transform_params):
        """Return the lower-cased version of ``X`` (anything exposing ``.lower()``)."""
        lowered = X.lower()
        return lowered

class remove_punctuation():
    """Pipeline step that deletes every ``string.punctuation`` character from the text."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; the instance itself is returned."""
        return self

    def transform(self, X, **transform_params):
        """Return ``X`` with all ``string.punctuation`` characters removed."""
        # Translation table that maps each punctuation character to "delete".
        deletion_table = X.maketrans("", "", string.punctuation)
        return X.translate(deletion_table)
        
class remove_stopwords():
    """Pipeline step that drops every whitespace-separated word found in ``stop_words``."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; the instance itself is returned."""
        return self

    def transform(self, X, **transform_params):
        """Return the words of ``X`` that are not in the module-level ``stop_words``, joined by spaces."""
        kept_words = [token for token in X.split() if token not in stop_words]
        return " ".join(kept_words)

class Lemmatizer():
    """Pipeline step that lemmatizes each whitespace-separated word with ``pos='v'``."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; the instance itself is returned."""
        return self

    def transform(self, X, **transform_params):
        """Return ``X`` with every word passed through the module-level ``lemmatizer``."""
        lemmas = [lemmatizer.lemmatize(token, pos='v') for token in X.split()]
        return " ".join(lemmas)

class Stemming():
    """Pipeline step that stems each whitespace-separated word."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; the instance itself is returned."""
        return self

    def transform(self, X, **transform_params):
        """Return ``X`` with every word passed through the module-level ``stemmer``."""
        stems = [stemmer.stem(token) for token in X.split()]
        return " ".join(stems)

In [None]:
# Text-cleaning steps, applied in the order listed:
# lower-case -> strip punctuation -> drop stop words -> lemmatize.
pipeline = Pipeline(steps=[
    ('lower', LowercaseTransformer()),
    ('punct', remove_punctuation()),
    ('remsw', remove_stopwords()),
    ('Lem', Lemmatizer()),
])

In [None]:
preprocessed_texts = pipeline.fit_transform(df['article'][0])

In [None]:
preprocessed_texts

'artificial intelligence healthcare overarch term use describe use machinelearning algorithms software artificial intelligence ai mimic human cognition analysis presentation comprehension complex medical health care data specifically ai ability computer algorithms approximate conclusions base solely input data primary aim healthrelated ai applications analyze relationships clinical techniques patient outcomes ai program apply practice diagnostics treatment protocol development drug development personalize medicine patient monitor care differentiate ai technology traditional technologies healthcare ability gather data process produce welldefined output enduser ai machine learn algorithms deep learn process recognize pattern behavior create logic gain useful insights predictions machine learn model must train use extensive amount input data ai algorithms behave differently humans two ways 1 algorithms literal goal set algorithm learn exclusively input data understand program 2 deep learn

In [None]:
# Run the cleaning pipeline on each article separately; to_frame() turns the
# resulting Series back into a one-column DataFrame named 'article'.
df2 = df['article'].apply(lambda raw_article: pipeline.fit_transform(raw_article)).to_frame()

In [None]:
from nltk.tokenize import word_tokenize

# Thin wrapper around NLTK's word tokenizer
def tokenize(X):
    """Split the text ``X`` into tokens with ``word_tokenize`` and return them."""
    tokens = word_tokenize(X)
    return tokens

In [None]:
tokenize('This is me')

['This', 'is', 'me']

In [None]:
from sklearn.datasets import make_multilabel_classification
X, _ = make_multilabel_classification(random_state=0)

In [None]:
X[0][0]

3.0

In [None]:
df2['article']

0    artificial intelligence healthcare overarch te...
1    health informatics field science engineer aim ...
2    healthcare unite state far outspent nation mea...
3    healthcaregov health insurance exchange websit...
4    information science also know information stud...
5    follow outline provide overview topical guide ...
6    hca healthcare american forprofit operator hea...
7    big data primarily refer data set large comple...
8    india multipayer universal health care model p...
9    fast healthcare interoperability resources fhi...
Name: article, dtype: object

In [None]:
# Vectorizer for the cleaned articles; stop_words='english' additionally asks
# scikit-learn to filter its own English stop-word list.
count_vec = CountVectorizer(stop_words='english')
# Cast the column to a NumPy unicode array, then learn the vocabulary and
# build the document-term matrix in one call.
DTM = count_vec.fit_transform(df2['article'].values.astype('U'))
# Bare last expression so the notebook displays the sparse-matrix summary.
DTM

<10x6198 sparse matrix of type '<class 'numpy.int64'>'
	with 12128 stored elements in Compressed Sparse Row format>

In [None]:
count_vec.get_feature_names_out()[1000],len(count_vec.get_feature_names_out())

('big', 6198)

In [None]:
# Five-topic LDA model using the online learning method; random_state makes
# the fit repeatable.
LDA = LatentDirichletAllocation(
    n_components=5,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=1,
)
LDA.fit(DTM)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=5, n_components=5, random_state=1)

In [None]:
# Vocabulary indices of the 20 highest-weighted terms of every topic
# (one row per topic, best term first).
# The reversing slice has to be applied to the columns: applied directly to the
# 2-D argsort result it acts on axis 0 and merely reverses the order of the topics.
LDA.components_.argsort(axis=1)[:, : -20 - 1 : -1]

array([[4849, 6114, 5078, ..., 1000, 3066, 1755],
       [5353, 3973, 5580, ..., 1755, 2749, 2747],
       [5806, 4204, 1606, ..., 3646, 1755, 2747],
       [5183, 4898, 2364, ..., 3646, 1755, 2747],
       [3619, 1762,  954, ..., 1180, 2749, 2747]])

In [None]:
# n_components = number of topics; LDA.components_[0] is the weight vector of the first topic
# argsort = returns the term indices ordered by ascending weight
#           (the weights are unnormalised topic-term pseudo-counts, not probabilities)
# [::-1] = reverses that order so the largest weights come first
# [:10] = keeps the indices of the 10 highest-weighted terms

LDA.components_[0].argsort()[::-1][:10]

array([2747, 2749, 1180, 3125, 5372, 3646, 5147, 1636, 5438, 2992])

In [None]:
# Names of the 10 highest-weighted terms of topic 0.
# The vocabulary array is fetched once instead of being rebuilt for every index.
feature_names = count_vec.get_feature_names_out()
top_feature_names = [feature_names[i] for i in LDA.components_[0].argsort()[::-1][:10]]
top_feature_names

['health',
 'healthcare',
 'care',
 'insurance',
 'state',
 'medical',
 'service',
 'cost',
 'study',
 'include']

In [None]:
#@title Top 10 Feature Names
# Valid topic indices are 0 .. n_components - 1, i.e. 0-4 for the 5-topic model.
component = 0 #@param {type:"slider", min:0, max:4, step:1}

# Pick the 10 highest-weighted terms of the *selected* topic and pair each
# term with its own weight, so names and bar lengths always belong together.
topic_weights = LDA.components_[component]
top_term_idx = topic_weights.argsort()[::-1][:10]

# A dedicated frame is used so the articles DataFrame `df` is not overwritten
# and earlier cells stay re-runnable.
top_terms_df = pd.DataFrame({
    'Feature Name': count_vec.get_feature_names_out()[top_term_idx],
    'Frequency': topic_weights[top_term_idx],
})

# Use Plotly Express to create a horizontal bar plot
fig = px.bar(
    top_terms_df,
    x='Frequency',
    y='Feature Name',
    orientation='h',
    title=f'Top 10 terms for topic {component}',
)
fig.show()