### Part 1. Data loading and environment setting

In [None]:
%%capture
!pip install bertopic
#!pip install cohere
!pip install altair

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import requests
import re
from tabulate import tabulate
from matplotlib.pyplot import figure
import seaborn.objects as so
import scipy.stats
from sklearn.cluster import KMeans

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
#import cohere
from bertopic.representation import Cohere
from bertopic.backend import CohereBackend
import umap
import altair as alt
from sklearn.feature_extraction.text import CountVectorizer
from scipy.cluster import hierarchy

import bigframes.pandas as bpd
import tensorflow_hub
from bertopic.representation import KeyBERTInspired
from transformers.pipelines import pipeline
from sentence_transformers import SentenceTransformer

from sklearn.feature_extraction.text import CountVectorizer
import colorcet as cc

In [None]:
# additional
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors


# stopwords
import nltk
nltk.download(['wordnet', 'stopwords', 'punkt'])
nltk.download('vader_lexicon')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# load NLTK's list of English stopwords
stop_words = set(stopwords.words('english'))

### Part 2. Loading message data

In [None]:
!pip install nltk
!pip install wordcloud matplotlib

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
nltk.download('punkt_tab')

def download_nltk_resources():
    """
    Make sure the NLTK data packages used by the text pipeline are installed.

    Each package is first looked up in the local NLTK data path and is
    downloaded only when that lookup raises ``LookupError``.
    """
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/wordnet', 'wordnet'),
        ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    )

    for resource_path, package_name in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            # Not available locally yet: fetch it once.
            nltk.download(package_name)

# Call the function to check and download resources
download_nltk_resources()


In [None]:
# Removing special characters is optional. We did this because some messages were saved in RTF (Rich Text Format).

In [None]:
def remove_special_patterns(text):
    """
    Replace leftover markup fragments in ``text`` with a single space.

    Every pattern consumes its fragment up to and including the next space.
    The patterns are applied in the order listed; extend the tuple if other
    unwanted patterns are observed in the data.
    """
    fragment_patterns = (
        r'\{[^ ]* ',        # token starting with an opening brace
        r'\\[^ ]* ',        # token starting with a backslash
        r'Arial;}}[^ ]* ',  # font-table remnant
        r';[^ ]* ',         # token starting with a semicolon
        r',\\[^ ]* ',       # comma immediately followed by a backslash token
    )
    for pattern in fragment_patterns:
        text = re.sub(pattern, ' ', text)
    return text

def clean_and_remove_rtf(text):
    """
    Strip RTF control words and table/font residue from a message.

    Args:
        text: Raw (or partially cleaned) message text.

    Returns:
        The cleaned text with runs of whitespace collapsed to one space and
        no leading/trailing whitespace.

    Note:
        Words of 1-3 alphanumeric characters and all non-ASCII characters
        are removed as part of the clean-up.
    """
    text = re.sub(r'\\[a-zA-Z]+\d* ?', '', text)  # Control words with optional numeric argument
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # Any remaining control words
    text = re.sub(r'{[^{}]*}', '', text)  # Innermost curly-brace groups and their content
    text = re.sub(r'\btsWidth\d*\b', '', text)  # 'tsWidth' followed by any digits
    text = re.sub(r'\bcl[a-zA-Z0-9]+\b', '', text)  # Words starting with 'cl' (cell definitions)
    # Anchor at a word boundary like the sibling patterns: without it the
    # pattern also matched inside ordinary words ("brown" -> "b",
    # "growth" -> "g"), which the short-word rule below then deleted.
    text = re.sub(r'\brow[a-zA-Z0-9]+\b', '', text)  # Words starting with 'row' (table definitions)
    text = re.sub(r'\brd[a-zA-Z0-9]+', '', text)  # 'rd'-prefixed residue such as 'rdrnone'
    text = re.sub(r'\b[a-zA-Z0-9]{1,3}\b', '', text)  # Isolated short words
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Non-ASCII characters
    text = re.sub(r'\bArial\b', '', text, flags=re.IGNORECASE)  # The font name 'Arial'
    # Normalise whitespace last so that no removal above can leave double
    # or trailing spaces behind.
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text


def get_wordnet_pos(tag):
    """
    Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the first character of ``tag`` is inspected (case-insensitively);
    anything other than J, N, V or R falls back to the noun constant.
    """
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN


def lemmatize_text(text):
    """
    Tokenize ``text`` and lemmatize every token using its part of speech.

    Args:
        text: Cleaned message text.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    # Tag the tokens first: get_wordnet_pos expects a POS tag such as "VBD",
    # not the word itself. Passing the word made the POS depend on the
    # word's first letter.
    # NOTE(review): pos_tag needs the NLTK perceptron tagger data; recent
    # NLTK releases may ask for 'averaged_perceptron_tagger_eng' - confirm.
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged_tokens
    ]
    return ' '.join(lemmatized_tokens)


def process_msg_txt(msg):
    """
    Standardize one message: strip markup residue, clean RTF, lemmatize.

    Non-string input, and messages that are empty after cleaning, yield
    an empty string.
    """
    if not isinstance(msg, str):
        return ""

    cleaned = clean_and_remove_rtf(remove_special_patterns(msg))
    if not cleaned:
        return ""
    return lemmatize_text(cleaned)

In [None]:
def filter_msg_txt(row):
    """
    Flag whether a message row is relevant (1) or should be dropped (0).

    Criteria for a relevant message:
      - the processed text is longer than 50 characters
      - the message is not the standard "unable to reach you by phone"
        notice

    Args:
        row: Mapping-like row (pandas Series or dict) with a
            "msg_txt_processed" entry and, optionally, the raw "msg_txt".

    Returns:
        1 if the message meets the criteria, otherwise 0.
    """
    boilerplate_prefix = "We were unable to reach you by phone."

    processed = row["msg_txt_processed"]
    if len(processed) <= 50:
        return 0

    # The processed text has had 1-3 character words ("We", "to", "you",
    # "by") removed, so it can never start with the boilerplate sentence.
    # Check the raw message as well when it is available.
    # NOTE(review): a raw message that still begins with RTF markup will not
    # match this prefix check.
    raw = row.get("msg_txt", "")
    raw = raw.lstrip() if isinstance(raw, str) else ""

    if raw.startswith(boilerplate_prefix) or processed.startswith(boilerplate_prefix):
        return 0
    return 1

def process_filter_msg_df(msg_df):
    """
    Process one message file and keep only relevant, unique messages.

    In: msg_df in the form of ABC_0000000000{0:0=2d}.csv (needs a "msg_txt"
        column). The columns "msg_txt_processed" and "msg_txt_flag" are
        added to it in place.
    Out: new dataframe with the rows flagged 1, de-duplicated on the
         processed text (first occurrence kept).
    """
    msg_df["msg_txt_processed"] = msg_df["msg_txt"].apply(process_msg_txt)
    msg_df["msg_txt_flag"] = msg_df.apply(filter_msg_txt, axis=1)

    is_relevant = msg_df["msg_txt_flag"] == 1
    relevant_msgs = msg_df[is_relevant]
    return relevant_msgs.drop_duplicates(subset=["msg_txt_processed"], keep="first")



def filter_by_id(msg_df, id_arr):
    """
    Keep only the messages whose "anon_id" appears in ``id_arr``.

    In: msg_df and array of IDs to filter by (i.e., ID arr of patients)
    Out: the rows of msg_df belonging to those IDs (original index kept)
    """
    belongs_to_cohort = msg_df["anon_id"].isin(id_arr)
    return msg_df.loc[belongs_to_cohort]


In [None]:
import pandas as pd

def get_diagnosis(row):
  """
  Translate the ICD-10 code in row["icd10"] into a diagnosis label.

  The code is matched by prefix (C43, C44, C46). Any other code is printed
  and an Exception is raised.
  """
  prefix_to_diagnosis = (
    ("C43", "Melanoma"),
    ("C44", "Other malignant of skin"),
    ("C46", "Kaposi Sarcoma"),
  )

  code = row["icd10"]
  for prefix, diagnosis in prefix_to_diagnosis:
    if code.startswith(prefix):
      return diagnosis

  # No known prefix matched: show the offending code, then fail.
  print(code)
  raise Exception("Invalid ICD-10 code")

def get_diagnostic_grouping(row):
  """
  Map the diagnosis in row["diagnosis"] to its diagnostic group.

  All three known diagnoses belong to "Skin cancer". Any other value is
  printed and an Exception is raised.
  """
  skin_cancer_diagnoses = ("Melanoma", "Other malignant of skin", "Kaposi Sarcoma")

  diagnosis = row["diagnosis"]
  if diagnosis not in skin_cancer_diagnoses:
    print(diagnosis)
    raise Exception("Invalid Diagnosis")

  return "Skin cancer"

In [None]:
# Retrieve the anonymized IDs (anon_id) of patients with skin cancer ICD-10 codes

from google.cloud import storage
import pandas as pd
import os

# Download the cohort file from Google Cloud Storage and load it into
# `com_id_df`.
# Configure your GCS bucket and file
bucket_name = "Your-bucket-name"  # Replace with your GCS bucket name
file_path = "ABC.csv"  # Replace with your file's path in the bucket
local_file_path = "path/ABC.csv"  # Local file path to save the downloaded file

# Ensure the local directory exists
local_directory = os.path.dirname(local_file_path)
os.makedirs(local_directory, exist_ok=True)

# Download file from GCS
# NOTE: failures are only printed, not re-raised, so execution continues
# with the load step below even if the download did not succeed.
try:
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(file_path)
    blob.download_to_filename(local_file_path)  # Save locally
    print(f"File downloaded to {local_file_path}.")
except Exception as e:
    print(f"An error occurred: {e}")

# Load the dataset
# NOTE: if reading fails, the error is printed and this cell does not
# define `com_id_df`; the cells that follow rely on it.
try:
    com_id_df = pd.read_csv(local_file_path)
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"Failed to load dataset: {e}")

In [None]:

# Map each row's ICD-10 code to its diagnosis label
com_id_df["diagnosis"] = com_id_df.apply(get_diagnosis, axis=1)

# Roll each diagnosis up into its diagnostic grouping
com_id_df["Diagnostic Group"] = com_id_df.apply(get_diagnostic_grouping, axis=1)

In [None]:
# If patient ID with same diagnosis is listed multiple times,
# only keep one entry.
com_id_df = com_id_df.drop_duplicates(subset=["anon_id", "diagnosis"], keep="first")
print(com_id_df.shape)

# If patient ID exists for 2+ diagnoses, drop the patient ID.
# keep=False removes every row of a duplicated anon_id, so after the step
# above only patients with exactly one distinct diagnosis remain.
com_id_df = com_id_df.drop_duplicates(subset=["anon_id"], keep=False)
print(com_id_df.shape)

# Get valid IDs (used further down to filter the message files)
final_valid_ids = com_id_df["anon_id"].values
print(final_valid_ids)

In [None]:
# Load the message data from the Google Cloud Storage (GCS) bucket

import pandas as pd
from io import StringIO  # Import StringIO from io module
from google.cloud import storage
from google.api_core.exceptions import NotFound

# Initialize Google Cloud Storage client
client = storage.Client()
bucket_name = 'Your-bucket-name'
bucket = client.get_bucket(bucket_name)

# Define a function to download CSV file and handle errors
def download_blob(file_name):
    """
    Read one CSV object from the module-level ``bucket`` into a DataFrame.

    Any failure is reported with ``print`` and an empty DataFrame is
    returned instead of raising, so callers can skip the file.
    """
    try:
        csv_text = bucket.blob(file_name).download_as_text()  # whole object as one string
        frame = pd.read_csv(StringIO(csv_text))
    except NotFound:
        print(f"Error: The file '{file_name}' does not exist in the bucket '{bucket_name}'.")
        return pd.DataFrame()  # Empty frame signals "nothing to process"
    except Exception as e:
        print(f"An error occurred while downloading '{file_name}': {e}")
        return pd.DataFrame()
    return frame

# Empty DataFrame for concatenating results
msg_com_all_df = pd.DataFrame()

# Run pipeline for all 15 subsets of the dataset
for num in range(15):  # 0 to 14
    # Construct the file name based on the loop index (zero-padded to two digits)
    file_name = f"ABC0000000000{num:02}.csv"

    # Download the DataFrame from the bucket
    temp_msg_all_df = download_blob(file_name)

    # download_blob returns an empty DataFrame when the file is missing or unreadable
    if temp_msg_all_df.empty:
        print(f"No data to process for {file_name}. Skipping...")
        continue  # Skip to the next file if the DataFrame is empty

    # Perform the processing on the DataFrame:
    # clean/flag/de-duplicate the messages, then keep only cohort patients
    temp_msg_all_df = process_filter_msg_df(temp_msg_all_df)
    temp_msg_com_df = filter_by_id(temp_msg_all_df, final_valid_ids)

    # Concatenate the processed DataFrame
    msg_com_all_df = pd.concat([msg_com_all_df, temp_msg_com_df], ignore_index=True)

    # Print the running shape of the combined DataFrame after this file
    print(f'Processed shape after loading {file_name}: {msg_com_all_df.shape}')

# Final DataFrame is now in msg_com_all_df

In [None]:
# Add ICD-10 diagnosis and diagnostic group to the dataframe of messages.
# The join key is anon_id on both sides; no `how` is given, so pandas'
# default join type applies.
msg_com_all_df = msg_com_all_df.merge(com_id_df[["anon_id", "diagnosis", "Diagnostic Group"]], left_on="anon_id", right_on="anon_id")
# Number of messages per diagnosis
msg_com_all_df["diagnosis"].value_counts()

In [None]:
print(msg_com_all_df.shape)
print(msg_com_all_df.columns.values)

In [None]:
#check the first 5 lines of data
msg_com_all_df.head(5)

In [None]:
# Create columns holding the processed text and a flag for rows that meet the criteria
# NOTE(review): process_filter_msg_df already computed both columns per file;
# this recomputes them on the combined dataframe - confirm it is still needed.
msg_com_all_df["msg_txt_processed"] = msg_com_all_df.apply(lambda row: process_msg_txt(row["msg_txt"]), axis=1)
msg_com_all_df["msg_txt_flag"] = msg_com_all_df.apply(lambda row: filter_msg_txt(row), axis=1)

# Create dataframe containing only messages that meet processed criteria
msg_processed_df = msg_com_all_df[msg_com_all_df["msg_txt_flag"] == 1]

# Drop rows with duplicate messages (per-file de-duplication cannot catch
# duplicates that sit in different files)
msg_processed_df = msg_processed_df.drop_duplicates(subset=["msg_txt_processed"], keep="first")

In [None]:
msg_com_all_df.shape

In [None]:
# Create list of messages for formatting into model
short_docs = msg_processed_df["msg_txt_processed"].values.tolist()

In [None]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(short_docs, show_progress_bar=True)

In [None]:
# Save the embeddings to a CSV file so we don't have to recompute them every time
embeddings_df = pd.DataFrame(embeddings)
embeddings_file_path = "embeddings.csv"
embeddings_df.to_csv(embeddings_file_path, index=False)

def upload_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """
    Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object path to write inside the bucket.
    """
    gcs_client = storage.Client()

    # Resolve the destination object, then push the local file into it.
    target_blob = gcs_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

# Define your bucket name
bucket_name = "Your-bucket-name"

# File paths
source_file_name = embeddings_file_path  # Use the path of the generated embeddings file
destination_blob_name = "path/emb_ABC.csv"  # Change the path if needed

# Upload the file
upload_to_bucket(bucket_name, source_file_name, destination_blob_name)

In [None]:
# Re-load the saved embeddings

import numpy as np
import pandas as pd
from google.cloud import storage

def download_from_bucket(bucket_name, source_blob_name, destination_file_name):
    """
    Download one object from a Google Cloud Storage bucket to a local file.

    Args:
        bucket_name: Name of the bucket holding the object.
        source_blob_name: Object path inside the bucket.
        destination_file_name: Local path to write the object to.
    """
    gcs_client = storage.Client()

    # Resolve the source object, then write its content to the local path.
    source_blob = gcs_client.bucket(bucket_name).blob(source_blob_name)
    source_blob.download_to_filename(destination_file_name)

    print(f"File {source_blob_name} downloaded to {destination_file_name}.")

# Define your bucket name and file paths
bucket_name = "Your-bucket-name"
source_blob_name = "path/emb_ABC.csv"  # The path you uploaded to
destination_file_name = "loaded_embeddings.csv"  # Local path where you want to save the downloaded file

# Download the file from the bucket
download_from_bucket(bucket_name, source_blob_name, destination_file_name)

# Load embeddings from the downloaded CSV file
loaded_embeddings_df = pd.read_csv(destination_file_name)

# Convert DataFrame back to a NumPy array if needed
loaded_embeddings = loaded_embeddings_df.to_numpy()

# Verify the shape of the loaded embeddings
print(f"Loaded embeddings shape: {loaded_embeddings.shape}")

### Part 3. a. Topic model (1st BERTopic model)###

In [None]:
from sklearn.cluster import Birch
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

# Define your stop words and zero-shot topic list
zeroshot_common_topic_list = ["cancer"]
stop_words_list = ['Your-custom-stopwords-here']  # Placeholder: replace with your own stop words

# Vectorizer used for the topic representations; drops the custom stop words
vectorizer_model = CountVectorizer(stop_words=stop_words_list)

# Define your BERTopic model
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=15,
    zeroshot_topic_list=zeroshot_common_topic_list,
    zeroshot_min_similarity=.82,
    representation_model=KeyBERTInspired()
)

# Fit the BERTopic model on the documents, reusing the pre-computed embeddings.
# The first returned value is the topic assigned to each document; the second
# returned value is discarded here.
topics, _ = topic_model.fit_transform(short_docs, embeddings)



In [None]:
# Get topic summary
topic_info_df = topic_model.get_topic_info()

In [None]:
# View topic summary
topic_info_df.head(20)

In [None]:
# Save topic summary to a CSV file

topic_info_df = pd.DataFrame(topic_info_df)
topic_info_df_file_path = "summary_ABC.csv"
topic_info_df.to_csv(topic_info_df_file_path, index=False)

def upload_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """
    Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object path to write inside the bucket.
    """
    gcs_client = storage.Client()

    # Resolve the destination object, then push the local file into it.
    target_blob = gcs_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

# Define your bucket name
bucket_name = "Your-bucket-name"

# File paths
source_file_name = topic_info_df_file_path  # Local path of the topic summary CSV written above
destination_blob_name = "path/summary_ABC.csv"

# Upload the file
upload_to_bucket(bucket_name, source_file_name, destination_blob_name)

### Part 3. b. BIRCH Topic model (2nd BERTopic model) ###

In [None]:
# Optionally, you can create a new BERTopic model using the BIRCH clusters as topics

from sklearn.cluster import Birch
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
import numpy as np


# Now, use BIRCH for clustering
birch_model = Birch(threshold=0.5, n_clusters=None)  # Adjust threshold and n_clusters as needed
birch_model.fit(embeddings)

# Predict clusters using BIRCH (one cluster label per document embedding)
birch_clusters = birch_model.predict(embeddings)


# Create a mapping from BIRCH clusters to documents
# (cluster label -> list of positions in short_docs / embeddings)
cluster_to_docs = {}
for doc_idx, cluster in enumerate(birch_clusters):
    if cluster not in cluster_to_docs:
        cluster_to_docs[cluster] = []
    cluster_to_docs[cluster].append(doc_idx)

# Collect per-cluster document indices, words and sizes
# NOTE(review): `topic_model` is the first BERTopic model, whose topic ids
# come from its own clustering. Looking it up with a BIRCH cluster label
# does not obviously return words for that BIRCH cluster - confirm intent.
# NOTE(review): custom_topics / custom_topic_words / custom_topic_sizes are
# not read anywhere else in the visible notebook.
custom_topics = []
custom_topic_words = []
custom_topic_sizes = []

for cluster, doc_indices in cluster_to_docs.items():
    custom_topics.append(doc_indices)
    words = topic_model.get_topic(cluster)  # Get words for the cluster topic
    custom_topic_words.append(words)
    custom_topic_sizes.append(len(doc_indices))

# Create a new BERTopic model, configured exactly like the first one.
# The BIRCH clusters are not passed to it here; they only enter later
# through the per-cluster concatenated texts it is fitted on.
new_topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=15,
    zeroshot_topic_list=zeroshot_common_topic_list,
    zeroshot_min_similarity=.82,
    representation_model=KeyBERTInspired()
)

In [None]:
import pandas as pd

# Assuming short_docs is your list of document texts
df = pd.DataFrame({
    'Document': short_docs,
    'BIRCH_Cluster': birch_clusters
})

In [None]:
# Combine texts by BIRCH cluster
cluster_texts = df.groupby('BIRCH_Cluster')['Document'].apply(lambda docs: ' '.join(docs)).reset_index()


In [None]:
# Fit the new BERTopic model on the combined cluster texts
new_topics, _ = new_topic_model.fit_transform(cluster_texts['Document'].tolist())

# Get topic summaries for each cluster
topic_info = new_topic_model.get_topic_info()

In [None]:
# View topic summary
topic_info.head(20)

In [None]:
# Save topic summary to a CSV file

topic_info = pd.DataFrame(topic_info)
topic_info_file_path = "summary_BIRCH_ABC.csv"
topic_info.to_csv(topic_info_file_path, index=False)

def upload_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """
    Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object path to write inside the bucket.
    """
    gcs_client = storage.Client()

    # Resolve the destination object, then push the local file into it.
    target_blob = gcs_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

# Define your bucket name
bucket_name = "Your-bucket-name"

# File paths
source_file_name = topic_info_file_path  # Local path of the BIRCH topic summary CSV written above
destination_blob_name = "path/summary_BIRCH_ABC.csv"

# Upload the file
upload_to_bucket(bucket_name, source_file_name, destination_blob_name)


### Part 3. c. BIRCH BERTopic modeling with reduced embedding using PCA  ###

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import Birch
import numpy as np

# Reduce dimensions of embeddings
pca = PCA(n_components=200)  # Adjust the number of components as needed (we tested 180-220)
reduced_embeddings = pca.fit_transform(embeddings)

# Initialize BIRCH (it is fitted incrementally in the loop below)
birch_model = Birch(threshold=0.5, n_clusters=None)

# Fit BIRCH model incrementally, 1000 reduced embeddings at a time
for i in range(0, len(reduced_embeddings), 1000):  # Adjust batch size as needed
    batch = reduced_embeddings[i:i + 1000]
    birch_model.partial_fit(batch)

# Predict clusters using BIRCH (one cluster label per document)
birch_clusters = birch_model.predict(reduced_embeddings)

# Create a mapping from BIRCH clusters to documents
# NOTE(review): cluster_to_docs is not read again in this cell.
cluster_to_docs = {}
for doc_idx, cluster in enumerate(birch_clusters):
    if cluster not in cluster_to_docs:
        cluster_to_docs[cluster] = []
    cluster_to_docs[cluster].append(doc_idx)

# Combine texts by BIRCH cluster: one space-joined document per cluster
df = pd.DataFrame({
    'Document': short_docs,
    'BIRCH_Cluster': birch_clusters
})
cluster_texts = df.groupby('BIRCH_Cluster')['Document'].apply(lambda docs: ' '.join(docs)).reset_index()

# Create a new BERTopic model, configured like the earlier models.
# The BIRCH clusters only enter through the combined texts it is fitted on.
new_topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=15,
    zeroshot_topic_list=zeroshot_common_topic_list,
    zeroshot_min_similarity=.82,
    representation_model=KeyBERTInspired()
)

# Fit the new BERTopic model on the combined cluster texts.
# No pre-computed embeddings are passed here, unlike the first model's fit.
new_topics, _ = new_topic_model.fit_transform(cluster_texts['Document'].tolist())

# Get topic summaries for each cluster
topic_info = new_topic_model.get_topic_info()



In [None]:
# View topic summary (BIRCH BERTopic with PCA)
topic_info.head(30)

In [None]:
# Calculate cumulative explained variance of the fitted PCA components
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Find the number of components that explain at least 90% variance.
# The retained components may never reach 90%; handle that case explicitly
# instead of failing with an IndexError on an empty result.
components_reaching_90 = np.flatnonzero(cumulative_explained_variance >= 0.90)

if components_reaching_90.size:
    n_components_90 = components_reaching_90[0] + 1  # index -> component count

    # Print the number of components
    print(f"Number of components explaining 90% variance: {n_components_90}")
else:
    n_components_90 = None
    print(
        f"The {len(explained_variance_ratio)} retained components explain only "
        f"{cumulative_explained_variance[-1]:.2%} of the variance (< 90%)."
    )


In [None]:
# Save topic summary to a CSV file

topic_info = pd.DataFrame(topic_info)
topic_info_file_path = "summary_BIRCH_PCA_ABC.csv"
topic_info.to_csv(topic_info_file_path, index=False)

def upload_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """
    Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object path to write inside the bucket.
    """
    gcs_client = storage.Client()

    # Resolve the destination object, then push the local file into it.
    target_blob = gcs_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

# Define your bucket name
bucket_name = "Your-bucket-name"

# File paths
source_file_name = topic_info_file_path  # Local path of the BIRCH + PCA topic summary CSV written above
destination_blob_name = "path/summary_BIRCH_PCA_ABC.csv"

# Upload the file
upload_to_bucket(bucket_name, source_file_name, destination_blob_name)

### Correlation test

In [None]:
import pandas as pd
from scipy import stats
import numpy as np

# Sample data
data = {
    'ID': list(range(1, 31)),
    'search': [1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0],
    'score': [3.67, 2.67, 3.00, 3.00, 3.33, 2.67, 3.67, 2.33, 2.33, 5.00, 4.33, 3.00, 3.33, 4.00, 3.00, 3.67, 4.33, 3.67, 4.00, 2.67, 3.33, 3.00, 2.67, 3.33, 3.00, 2.33, 2.67, 3.33, 1.67, 2.67]
}

# Create DataFrame
df = pd.DataFrame(data)

# Perform Spearman correlation test
correlation, p_value = stats.spearmanr(df['search'], df['score'])

# Function to calculate 95% CI using bootstrapping
def bootstrap_ci(data1, data2, num_samples=1000, alpha=0.05):
    """
    Percentile-bootstrap confidence interval for the Spearman correlation.

    Observations are resampled as (data1[i], data2[i]) pairs. Resampling the
    two variables independently would destroy their association and yield
    the null distribution around zero instead of an interval for the
    observed correlation.

    Args:
        data1: First variable (array-like).
        data2: Second variable (array-like), same length as ``data1``.
        num_samples: Number of bootstrap resamples.
        alpha: Significance level; the interval covers ``1 - alpha``.

    Returns:
        Tuple ``(lower_bound, upper_bound)``.

    Raises:
        ValueError: If the inputs differ in length.
    """
    x = np.asarray(data1)
    y = np.asarray(data2)
    if len(x) != len(y):
        raise ValueError("data1 and data2 must have the same length")

    n_obs = len(x)
    correlations = []
    for _ in range(num_samples):
        # One set of indices for both variables keeps each pair together.
        idx = np.random.randint(0, n_obs, size=n_obs)
        corr, _p_value = stats.spearmanr(x[idx], y[idx])
        correlations.append(corr)

    # A resample in which one variable is constant has an undefined (NaN)
    # correlation; ignore those when taking the percentiles.
    lower_bound = np.nanpercentile(correlations, 100 * alpha / 2)
    upper_bound = np.nanpercentile(correlations, 100 * (1 - alpha / 2))
    return lower_bound, upper_bound

# Calculate 95% CI using bootstrap
ci_lower, ci_upper = bootstrap_ci(df['search'], df['score'])

# Print results
print(f"Spearman correlation coefficient: {correlation:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"95% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")


Spearman correlation coefficient: 0.2849
P-value: 0.1270
95% Confidence Interval: [-0.3713, 0.3594]
