<a href="https://colab.research.google.com/github/MJSRep/NLP/blob/main/Topicmodelling18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# INSTALL
!pip install bertopic --upgrade bertopic

In [4]:
# IMPORT
import pandas as pd # pandas data frame
import re # regular expressions
import os # operating system
import json # json!

In [5]:
# MOUNT
from google.colab import drive # change for a server location in PROD
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
# CHANGE DIR
os.chdir('/content/gdrive/MyDrive/Models')


In [None]:
# CHECKS
#print (os.getcwd())
#print(os.listdir())

In [7]:
# PREPROCESSING - this step is not strictly required with BERTopic unless we want to use a custom stop words library
import nltk
from nltk.corpus import stopwords

# Download the stopwords list if necessary
nltk.download('stopwords')

# Define the stop words to remove
stop_words = set(stopwords.words('english'))

# Load the CSV file into a DataFrame
data = pd.read_csv('SA_20181206.csv')

# Remove stopwords from the text column
data['Scientific_Abstract'] = data['Scientific_Abstract'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Save the modified DataFrame to a new CSV file
data.to_csv('SA_20181206_without_stopwords.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from random import seed
# TRAIN/FIT

# Import libraries
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Load data
abstracts = pd.read_csv ("SA_20181206_without_stopwords.csv")

# Extract abstracts
abstracts = abstracts["Scientific_Abstract"].tolist()

# Convert data to strings
abstracts = [str(doc) for doc in abstracts]

# Vectorizer
# need more I/O for ngram_range 1,3
vectorizer_model = CountVectorizer(ngram_range=(1,2), stop_words="english")

# Embedding
# embedding_model = sentence_model ("all-MiniLM-L6-v2") # use default
# The default embedding model for SentenceTransformer in BERTopic is all-MiniLM-L6-v2 for English documents and paraphrase-multilingual-MiniLM-L12-v2 for multilingual documents.
# These models are both pre-trained transformer-based models that are specifically designed for sentence embedding tasks.
# They are able to capture semantic relationships between sentences, which makes them well-suited for topic modeling.

# Set the random seed to a fixed value to get reproducible results
umap_model = UMAP(random_state=42)

# Dimensionality Reduction example
# umap = UMAP(n_neighbors=15,
#             n_components=5,
#             min_dist=0.0,
#             metric='cosine',
#             low_memory=False,
#             random_state=1337)
# model = BERTopic(language="multilingual", umap_model=umap)
# topics, probs = model.fit_transform(content)


# Create a BERTopic instance
topic_model = BERTopic(
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    language="english")

# Fit the BERTopic model - fit_transform is a single step to fit and predict
topics, probs = topic_model.fit_transform(abstracts)


In [9]:
# SAVE
from bertopic import BERTopic
topic_model.save("Screening_Model_v16")



In [None]:
# RAW TOPICS (TOP 30)
topic_model.get_topic_info().head(30)

In [None]:
# GET TOP 10 WORDS FOR TOPIC 0
topic_model.get_topic(0)[:10]

In [None]:
# BAR CHARTS OF TOP 20 TOPICS (10 WORDS)
topic_model.visualize_barchart (width=280, height=330, top_n_topics=30, n_words=10)

In [None]:
#TOPIC LABELS
topic_labels = topic_model.generate_topic_labels(nr_words=3,
                                                 topic_prefix=True,
                                                 word_length=10,
                                                 separator=", ")

topic_model.set_topic_labels(topic_labels)

topic_model.set_topic_labels({0: "0 - Women & Pregnancy", 1: "1 - Asthma & COPD"})

# Re-run Barchart visualisation
topic_model.visualize_barchart (width=280, height=330, top_n_topics=30, n_words=10, custom_labels=True)

In [None]:
#HEATMAP (20 CLUSTERS)
topic_model.visualize_heatmap(n_clusters=20, custom_labels=True)

In [None]:
# DOCUMENTS AND TOPICS
topic_model.visualize_documents(abstracts, topics=list(range(30)),custom_labels=True, height=600)

In [None]:
# HIERARCHICAL CLUSTERING
topic_model.visualize_hierarchy(custom_labels=True)

In [None]:
# INTERTOPIC DISTANCE MAP
topic_model.visualize_topics(custom_labels=True)

In [None]:
# PROJECTS, TOPICS, PROBABILITY (EXCLUDE -1 UNKNOWN OUTLIERS)

# Combine the topic IDs, topic probabilities, and the desired column from your source data
abstract_data = pd.read_csv ("SA_20181206_without_stopwords.csv")

# Extract the ProjectID column
ProjectID = abstract_data["ProjectID"]

topic_df = pd.DataFrame({
"ProjectID": abstract_data["ProjectID"],
"Topic_ID": topics,
"Topic_Prob": probs,
})
filtered_df = topic_df[topic_df['Topic_ID'] != -1]
print (filtered_df.sample(30))

In [None]:
# PROJECTS, TOPICS, PROBABILITY (FOR TOPIC 0 - Women & Pregnancy)

# Combine the topic IDs, topic probabilities, and the desired column from your source data
abstract_data = pd.read_csv ("SA_20181206_without_stopwords.csv")

# Extract the ProjectID column
ProjectID = abstract_data["ProjectID"]

topic_df = pd.DataFrame({
"ProjectID": abstract_data["ProjectID"],
"Topic_ID": topics,
"Topic_Prob": probs,
"Document": abstracts
})
filtered_df = topic_df[topic_df['Topic_ID'] == 0]
print (filtered_df.sample(30))

In [None]:
# DOCS PER TOPIC
T = topic_model.get_document_info(abstracts)
docs_per_topics = T.groupby(["Topic"]).apply(lambda x: x.index).to_dict()
print(docs_per_topics)

In [21]:
#LOAD
#=====
from bertopic import BERTopic
topic_model = BERTopic.load("Screening_Model_v16")

In [33]:
# PREDICT

# Import libraries
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Load data
new_abstract = pd.read_csv ("SA_20110401_Example_2.csv")

# Extract abstracts
new_abstract = new_abstract["Scientific_Abstract"].tolist()

# Convert data to strings
new_abstract = [str(doc) for doc in new_abstract]

# Predict topics for the new document
predicted_topics, predicted_probs = topic_model.transform([new_abstract])

# Print the topics for the new document
print(predicted_topics, predicted_probs, new_abstract)


[0] [0.82899109] ["BackgroundGood maternal & paternal health before and at conception can shape a child's future life course. This raises the importance of pre pregnancy care for screening, prevention & management of risk factors that affect pregnancy outcomes & the future health of families. There is little information about the provision of pre pregnancy care in England. Better understanding of the bio-psychosocial, cultural and economic factors affecting access to pre pregnancy care is needed if services are to be improved and more pregnancies planned. Only about 50% of pregnancies are planned. Holistic study of the complexity of health care before and between pregnancies is needed to identify interventions that are effective & acceptable to women & men, and the key contextual factors that enable health gain.AimsThe overall aim of the study is to provide high quality evidence regarding the implementation & public health impact of pre pregnancy health & care for women & men in Englan

In [31]:
# Get the possible topics and their probabilities
Topic_Predictions = predicted_topics, probabilities = topic_model.transform(new_abstract)
print (Topic_Predictions)

([0, 40], array([0.68483457, 0.58930002]))


In [32]:
# Get the most likely topic and its probability
predicted_topic = predicted_topics[probabilities.argmax()]
probability = probabilities[probabilities.argmax()]

# Print the most likely prediction and its probability
print("Most likely topic:", predicted_topic)
print("Probability:", probability)


Most likely topic: 0
Probability: 0.6848345749879021


In [None]:
#OPTIONAL PARAMETER EXAMPLES

#VECTORIZE
#=========
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
# umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=2, metric='euclidean', cluster_selection_method='eom')

# #EMBEDDINGS
# sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
# #TO DO: might want to try all-MiniLM-L6-v2 or paraphrase-multilingual-mpnet-base-v2
# embeddings = sentence_model.encode(documents)

# #TRAIN
# topic_model = BERTopic(
#     vectorizer_model=vectorizer_model,
#     embedding_model=sentence_model,
#     # umap_model=umap_model,
#     # hdbscan_model=hdbscan_model,
#     language='english', calculate_probabilities=True,
#     verbose=True
# )