In [None]:
%%capture
!pip install bertopic
#!pip install cohere
!pip install altair


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import requests
import re
from tabulate import tabulate
from matplotlib.pyplot import figure
import seaborn.objects as so
import scipy.stats
from sklearn.cluster import KMeans

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
#import cohere
from bertopic.representation import Cohere
from bertopic.backend import CohereBackend
import umap
import altair as alt
from sklearn.feature_extraction.text import CountVectorizer
from scipy.cluster import hierarchy

import bigframes.pandas as bpd
import tensorflow_hub
from bertopic.representation import KeyBERTInspired
from transformers.pipelines import pipeline
from sentence_transformers import SentenceTransformer

from sklearn.feature_extraction.text import CountVectorizer
import colorcet as cc

In [None]:
from google.cloud import storage
import pandas as pd
import io

# Set up the Google Cloud Storage client.
# No project or credentials are passed, so the library's defaults apply.
client = storage.Client()

# Specify your bucket and file path.
# NOTE: both arguments are empty placeholders and must be filled in before this
# cell can run. `bucket` is reused by later cells to read the message subsets
# and to upload the topic summaries.
bucket = client.bucket("")  # Replace with your bucket name
blob = bucket.blob("")  # Replace with your file path

# Read the file directly into a pandas DataFrame without saving locally.
# Later cells read the "anon_id" and "icd10" columns of this frame.
content = blob.download_as_text()
com_id_df = pd.read_csv(io.StringIO(content))

# Display the first few rows of the DataFrame
com_id_df.head()

In [None]:
!pip install nltk
!pip install wordcloud matplotlib

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

def download_nltk_resources():
    """
    Ensure that necessary NLTK resources are available.
    Download them only if they are not already downloaded.

    Both the legacy and the current resource names are requested: NLTK releases
    from 3.8.2/3.9 onward load the tokenizer from ``punkt_tab`` and the English
    POS tagger from ``averaged_perceptron_tagger_eng``, while older releases
    use ``punkt`` and ``averaged_perceptron_tagger``. Because NLTK is installed
    without a version pin, either naming scheme may be the one in effect.
    """
    # Maps the lookup path used by nltk.data.find to the downloader package id.
    resources = {
        'tokenizers/punkt': 'punkt',
        'tokenizers/punkt_tab': 'punkt_tab',
        'corpora/wordnet': 'wordnet',
        'taggers/averaged_perceptron_tagger': 'averaged_perceptron_tagger',
        'taggers/averaged_perceptron_tagger_eng': 'averaged_perceptron_tagger_eng',
    }

    for path, package in resources.items():
        try:
            nltk.data.find(path)
        except LookupError:
            # Resource is not installed locally yet, so fetch it.
            nltk.download(package)

# Call the function to check and download resources
download_nltk_resources()

In [None]:
def remove_special_patterns(text):
    """
    Replace a fixed sequence of markup-like fragments with a single space.

    Each pattern matches a marker (an opening brace, a backslash, the literal
    ``Arial;}}``, a semicolon, or a comma followed by a backslash) together
    with every following non-space character up to and including the next
    space. The patterns are applied one after another, so their order matters.
    The list can be adjusted if other unwanted patterns are observed.
    """
    ordered_patterns = (
        r'\{[^ ]* ',
        r'\\[^ ]* ',
        r'Arial;}}[^ ]* ',
        r';[^ ]* ',
        r',\\[^ ]* ',
    )
    for pattern in ordered_patterns:
        text = re.sub(pattern, ' ', text)
    return text

def clean_and_remove_rtf(text):
    """
    Strip RTF control words, brace groups and layout leftovers from ``text``.

    The substitutions run in a fixed order. Whitespace is collapsed and
    trimmed last, so gaps opened by any earlier removal (including the font
    name 'Arial') do not survive as double, leading or trailing spaces.

    NOTE(review): several patterns also hit ordinary words. ``\\bcl...``
    removes words such as "clinic", the un-anchored ``row...`` pattern eats the
    tail of words such as "growth", and every token of three characters or
    fewer is dropped. Confirm this is acceptable for the downstream model.

    Returns the cleaned string, which may be empty.
    """
    # Remove RTF control words, formatting codes, and unnecessary curly braces content
    text = re.sub(r'\\[a-zA-Z]+\d* ?', '', text)  # Removes control words with optional numbers
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # Cleans any remaining control words
    text = re.sub(r'{[^{}]*}', '', text)  # Aggressively remove content within curly braces
    text = re.sub(r'\btsWidth\d*\b', '', text)  # Specific removal of 'tsWidth' followed by any numbers
    text = re.sub(r'\bcl[a-zA-Z0-9]+\b', '', text)  # Removes words starting with 'cl' that are part of cell definitions
    text = re.sub(r'row[a-zA-Z0-9]+\b', '', text)  # Remove patterns starting with 'row' typical in table definitions
    text = re.sub(r'\brd[a-zA-Z0-9]+', '', text)  # Remove 'rd' prefixed RTF controls like 'rdrnone'
    text = re.sub(r'\b[a-zA-Z0-9]{1,3}\b', '', text)  # Remove isolated short words
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    text = re.sub(r'\bArial\b', '', text, flags=re.IGNORECASE)  # Remove the name 'Arial' completely
    # Normalize whitespace only after every removal has been applied.
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text


def get_wordnet_pos(tag):
    """
    Translate an NLTK POS tag into the constant WordNetLemmatizer expects.

    Only the first character of ``tag`` is inspected (case-insensitively):
    J -> adjective, V -> verb, R -> adverb. Every other initial, including N,
    maps to noun.
    """
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN


def lemmatize_text(text):
    """
    Tokenize ``text`` and lemmatize every token according to its part of speech.

    The tokens are POS-tagged with ``nltk.pos_tag`` (which relies on the
    averaged perceptron tagger resource fetched by download_nltk_resources)
    and each tag is mapped to a WordNet POS via get_wordnet_pos. Passing the
    word itself to get_wordnet_pos would pick the POS from the word's first
    letter, which is unrelated to its grammatical role.

    Returns the lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    # Tag the whole token sequence at once so the tagger sees sentence context.
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged_tokens
    ]
    return ' '.join(lemmatized_tokens)


def process_msg_txt(msg):
    """
    Standardize a single message text.

    Non-string input (e.g. missing values) yields an empty string. Strings are
    passed through remove_special_patterns and clean_and_remove_rtf; if
    anything is left, the result is lemmatized, otherwise "" is returned.
    """
    if not isinstance(msg, str):
        return ""

    cleaned = clean_and_remove_rtf(remove_special_patterns(msg))
    if not cleaned:
        return ""
    return lemmatize_text(cleaned)

In [None]:
def filter_msg_txt(row):
    """
    Assign messages that meet criteria for a relevant message a value of 1,
    otherwise assign value of 0.

    Criteria:
      - processed msg is > 50 characters
      - msg is not the standard "We were unable to reach you by phone."
        message

    ``row`` must provide "msg_txt_processed" and support ``.get`` (a pandas
    row or a dict). The boilerplate check is applied to the processed text and
    also to the raw "msg_txt" / "rtf_txt" value when present. The processed
    text alone cannot match the sentence, because clean_and_remove_rtf drops
    tokens of three characters or fewer ("We", "to", "you", "by") before this
    filter runs.

    NOTE(review): raw RTF that begins with control codes will still not match
    the prefix check. Confirm whether that case occurs in the data.
    """
    boilerplate_prefix = "We were unable to reach you by phone."

    processed = row["msg_txt_processed"]
    if len(processed) <= 50:
        return 0

    candidates = [processed]
    for raw_column in ("msg_txt", "rtf_txt"):
        raw_text = row.get(raw_column)
        # Missing values arrive as NaN/None, so only inspect real strings.
        if isinstance(raw_text, str):
            candidates.append(raw_text.lstrip())

    if any(text.startswith(boilerplate_prefix) for text in candidates):
        return 0
    return 1

def process_filter_msg_df(msg_df):
    """
    In: msg_df in the form of RQ_BC_from_pat_post_to_onc0000000000{0:0=2d}.csv
    Out: msg_df that has been processed and filtered

    The raw text is read from "msg_txt" if present, otherwise from "rtf_txt";
    a ValueError is raised when neither column exists. The columns
    "msg_txt_processed" and "msg_txt_flag" are added to ``msg_df`` itself.
    The returned frame holds only rows flagged 1, with duplicate processed
    texts removed (first occurrence kept).
    """
    # Pick the first raw-text column that is available, in priority order.
    text_column = next(
        (name for name in ("msg_txt", "rtf_txt") if name in msg_df.columns),
        None,
    )
    if text_column is None:
        raise ValueError("Neither 'msg_txt' nor 'rtf_txt' columns found in DataFrame.")

    # Standardize the text, then flag the rows worth keeping.
    msg_df["msg_txt_processed"] = msg_df[text_column].apply(process_msg_txt)
    msg_df["msg_txt_flag"] = msg_df.apply(filter_msg_txt, axis=1)

    relevant_rows = msg_df["msg_txt_flag"] == 1
    return msg_df.loc[relevant_rows].drop_duplicates(
        subset=["msg_txt_processed"], keep="first"
    )

def filter_by_id(msg_df, id_arr):
    """
    In: msg_df and array of IDs to filter by (i.e., ID arr of DM patients)
    Out: msg_df with only relevant patients

    Rows are kept when their "anon_id" value appears in ``id_arr``; the
    original index labels are preserved.
    """
    is_relevant = msg_df["anon_id"].isin(id_arr)
    return msg_df.loc[is_relevant]


In [None]:
def get_diagnosis(row):
    """
    Helper function to change ICD-10 codes into diagnoses.

    Reads ``row["icd10"]``: codes starting with E08 or E09 map to "DM", codes
    starting with F32 map to "depression". Any other code is printed and an
    Exception is raised.
    """
    code = row["icd10"]
    if code.startswith(("E08", "E09")):
        return "DM"
    if code.startswith("F32"):
        return "depression"

    # Show the offending code before failing so it can be traced in the data.
    print(code)
    raise Exception("Invalid ICD-10 code")


def get_diagnostic_grouping(diagnoses):
    """
    Helper function to change diagnoses into diagnostic groupings.

    Returns "co-morbid" when both "DM" and "depression" are present, the single
    diagnosis when only one of them is present, and raises an Exception when
    neither is.
    """
    has_dm = "DM" in diagnoses
    has_depression = "depression" in diagnoses

    if has_dm and has_depression:
        return "co-morbid"
    if has_dm:
        return "DM"
    if has_depression:
        return "depression"
    raise Exception("Unexpected diagnosis grouping")


In [None]:
# Convert ICD codes into diagnoses (one diagnosis per row of com_id_df)
com_id_df["diagnosis"] = com_id_df.apply(get_diagnosis, axis=1)

# Group by patient ID and aggregate diagnoses into lists
diagnosis_groups = com_id_df.groupby("anon_id")["diagnosis"].apply(list).reset_index()

# Convert aggregated diagnoses into diagnostic groupings
diagnosis_groups["Diagnostic Group"] = diagnosis_groups["diagnosis"].apply(get_diagnostic_grouping)

# Filter to keep only those with "co-morbid" diagnosis
co_morbid_df = diagnosis_groups[diagnosis_groups["Diagnostic Group"] == "co-morbid"]

# Merge the diagnostic grouping back to the original DataFrame.
# Any "Diagnostic Group" column left by an earlier run of this cell is dropped
# first. Otherwise a re-run would produce "Diagnostic Group_x" / "_y" columns
# and later cells that select "Diagnostic Group" would fail.
com_id_df = com_id_df.drop(columns=["Diagnostic Group"], errors="ignore").merge(
    diagnosis_groups[["anon_id", "Diagnostic Group"]], on="anon_id", how="left"
)

# Get valid IDs for "co-morbid" group only
final_valid_ids = co_morbid_df["anon_id"].values

# If you want to see the result
print(co_morbid_df.shape)
print(final_valid_ids)



In [None]:
# Load, clean and accumulate the 7 message subsets (files numbered 00-06).
for num in range(7):

    # Object name of this subset inside the bucket
    file_name = "Your_file_name_0000000000{0:0=2d}.csv".format(num)

    # Fetch the object from Google Cloud Storage as text
    blob = bucket.blob(file_name)
    content = blob.download_as_text()

    # Parse the CSV text, then process/filter it and keep co-morbid patients only
    temp_msg_all_df = process_filter_msg_df(pd.read_csv(io.StringIO(content)))
    temp_msg_com_df = filter_by_id(temp_msg_all_df, final_valid_ids)

    # The first subset starts the accumulator; later subsets are appended to it
    frames = [temp_msg_com_df] if num == 0 else [msg_com_all_df, temp_msg_com_df]

    # Combine and drop duplicates that have arisen across data subsets
    msg_com_all_df = pd.concat(frames).drop_duplicates(subset=["msg_txt_processed"], keep="first")

    # Running size of the accumulated frame after each subset
    print(msg_com_all_df.shape)


In [None]:
# Sanity check after the loading loop: (rows, columns) of the accumulated
# message frame, followed by its column names.
print(msg_com_all_df.shape)
print(msg_com_all_df.columns.values)

In [None]:
# Add ICD-10 diagnosis and diagnostic group to the dataframe of messages.
# NOTE(review): com_id_df holds one row per ICD-10 code, and every co-morbid
# patient has at least a DM row and a depression row. This merge therefore
# repeats each message once per matching com_id_df row, and the value counts
# below count message x diagnosis pairs rather than distinct messages.
# NOTE(review): the cell overwrites its own input, so running it twice
# multiplies the rows again and yields "diagnosis_x"/"diagnosis_y" columns.
# Re-run the loading cell first if this cell must be repeated.
msg_com_all_df = msg_com_all_df.merge(com_id_df[["anon_id", "diagnosis", "Diagnostic Group"]], left_on="anon_id", right_on="anon_id")

# Check value counts for diagnosis
print(msg_com_all_df["diagnosis"].value_counts())


In [None]:
# Re-apply the shared process -> flag -> filter -> de-duplicate pipeline to the
# merged frame. process_filter_msg_df performs the same steps this cell used
# to spell out by hand: it refreshes "msg_txt_processed" and "msg_txt_flag" on
# msg_com_all_df, keeps rows flagged 1 and drops duplicate processed messages.
# It also resolves the raw text column ("msg_txt" or "rtf_txt") itself, so the
# cell no longer fails with a KeyError when the data only has "rtf_txt".
msg_processed_df = process_filter_msg_df(msg_com_all_df)

In [None]:
# Shape of the merged message frame.
# NOTE(review): the filtered, de-duplicated frame built in the previous cell is
# msg_processed_df. Confirm which of the two was meant to be inspected here.
msg_com_all_df.shape

### **1st Topic Model**

In [None]:
# Collect the processed message texts as a plain Python list, the document
# format passed to the embedding model and to BERTopic below
short_docs = msg_processed_df["msg_txt_processed"].tolist()

In [None]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
# The documents are encoded once here; the resulting `embeddings` array is
# reused by the BERTopic fit in the first topic model and by the BIRCH
# clustering in the second one.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(short_docs, show_progress_bar=True)

In [None]:
from sklearn.cluster import Birch
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

# Define your stop words and zero-shot topic list
zeroshot_common_topic_list = ["depression"]
# NOTE: passing a list replaces (rather than extends) any built-in stop-word
# set, so only these five tokens are excluded from the topic vocabulary.
stop_words_list = ['dr', 'doctor', 'june', 'july', 'myhealth']

vectorizer_model = CountVectorizer(stop_words=stop_words_list)

# UMAP is stochastic. Fixing random_state makes the topics reproducible across
# "Restart & Run All". The other arguments mirror the UMAP configuration
# BERTopic builds when no umap_model is supplied.
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=42,
)

# Define your BERTopic model
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=15,
    zeroshot_topic_list=zeroshot_common_topic_list,
    zeroshot_min_similarity=.82,
    representation_model=KeyBERTInspired()
)

# Fit your BERTopic model and transform documents to get topics and embeddings
# (the pre-computed embeddings are passed in so documents are not re-encoded)
topics, _ = topic_model.fit_transform(short_docs, embeddings)

In [None]:
# Get topic summary for the first topic model; it is displayed and exported
# to CSV in the following cells
topic_info_df = topic_model.get_topic_info()

In [None]:
# View topic summary (first 40 rows at most)
topic_info_df.head(40)

In [None]:
# Wrap the topic summary in a DataFrame.
# NOTE(review): topic_info_df is already used as a DataFrame above (.head),
# so this wrapper looks redundant; confirm before removing.
df_topic_info_df = pd.DataFrame(topic_info_df)

# Serialize the table to CSV text in memory. Nothing is written to disk here;
# the resulting string is uploaded to the bucket in the next cell.
csv_data = df_topic_info_df.to_csv(index=False)

In [None]:
# Specify the file path and name in the bucket
# NOTE: empty placeholder; set the destination object name before running.
file_name = ""

# Create a blob (a file in GCS) in the bucket opened at the top of the notebook
blob = bucket.blob(file_name)

# Upload the CSV string to GCS
blob.upload_from_string(csv_data, content_type='text/csv')


### **2nd Topic Model**

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import Birch
import numpy as np

# Load your embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Now, use BIRCH for clustering on the embeddings computed for the first model
birch_model = Birch(threshold=0.5, n_clusters=None)  # Adjust threshold and n_clusters as needed
birch_model.fit(embeddings)

# Predict clusters using BIRCH
birch_clusters = birch_model.predict(embeddings)


# Create a mapping from BIRCH clusters to document indices.
# NOTE(review): not read again in the visible part of the notebook; kept in
# case later cells rely on it.
cluster_to_docs = {}
for doc_idx, cluster in enumerate(birch_clusters):
    cluster_to_docs.setdefault(cluster, []).append(doc_idx)

# Combine texts by BIRCH cluster: one space-joined pseudo-document per cluster
df = pd.DataFrame({
    'Document': short_docs,
    'BIRCH_Cluster': birch_clusters
})
cluster_texts = df.groupby('BIRCH_Cluster')['Document'].apply(lambda docs: ' '.join(docs)).reset_index()

# UMAP is stochastic. Fixing random_state makes the topics reproducible across
# "Restart & Run All". The other arguments mirror the UMAP configuration
# BERTopic builds when no umap_model is supplied.
new_umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=42,
)

# Create a new BERTopic model with custom topics
new_topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=new_umap_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=15,
    zeroshot_topic_list=zeroshot_common_topic_list,
    zeroshot_min_similarity=.82,
    representation_model=KeyBERTInspired()
)

# Fit the new BERTopic model on the combined cluster texts (embeddings for
# these pseudo-documents are computed by the model itself)
new_topics, _ = new_topic_model.fit_transform(cluster_texts['Document'].tolist())

# Get topic summaries for each cluster
topic_info = new_topic_model.get_topic_info()

In [None]:
# View topic summary of the second (BIRCH-based) model, first 40 rows at most
topic_info.head(40)

In [None]:
# Wrap the second model's topic summary in a DataFrame.
# NOTE(review): topic_info is already used as a DataFrame above (.head),
# so this wrapper looks redundant; confirm before removing.
df_topic_info = pd.DataFrame(topic_info)

# Serialize the table to CSV text in memory. Nothing is written to disk here.
# This overwrites the `csv_data` string produced for the first model.
csv_data = df_topic_info.to_csv(index=False)

In [None]:
# Specify the file path and name in the bucket
# NOTE: empty placeholder; set the destination object name before running,
# and use a different name from the first model's export to avoid overwriting it.
file_name = ""

# Create a blob (a file in GCS) in the bucket opened at the top of the notebook
blob = bucket.blob(file_name)

# Upload the CSV string to GCS
blob.upload_from_string(csv_data, content_type='text/csv')