### Loading the data

In [None]:
from google.cloud import storage
import pandas as pd

# Configure your GCS bucket and file.
# Both values are empty placeholders and must be filled in before this cell can run.
bucket_name = ""  # Replace with your GCS bucket name
file_path = ""  # Replace with your file's path in the bucket

# Download file from GCS
# `client` is bound to a Cloud Storage client here; it is a notebook-global name.
client = storage.Client()
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(file_path)
blob.download_to_filename("")  # Save locally (placeholder: local destination path)

# Load the dataset
# NOTE(review): presumably this should read the same local path that was passed to
# download_to_filename above; both are empty placeholders here, so confirm.
data = pd.read_csv("")

### Screening simulation with threshold = 1

In [None]:
# Llama 3.1 8B

In [None]:
import pandas as pd
import re
import time
from typing import Tuple, Dict

# Define a function to call the Llama model
def llama_model_fn(text: str) -> Tuple[int, str]:
    """Ask the chat model whether a conversation shows depressive symptoms.

    Builds a single-turn prompt around ``text``, sends it with
    ``client.chat.completions.create(model=MODEL_ID, ...)`` and parses the
    reply for a ``Classification: <0 or 1>`` label and a ``Reason: ...``
    section.

    NOTE(review): ``client`` and ``MODEL_ID`` are read from notebook globals.
    The only ``client`` assigned in the visible cells is a
    ``google.cloud.storage.Client`` and ``MODEL_ID`` is never assigned there;
    confirm that a chat-completions client and a model id are defined before
    this cell runs, otherwise every call takes the error path below.

    Args:
        text: Conversation text to classify.

    Returns:
        A ``(classification, reasoning)`` tuple. ``classification`` is 0 or 1
        and falls back to 0 when the reply contains no usable label.
        ``reasoning`` falls back to ``"No reason provided."`` when the reply
        has no reason text. Any exception raised while calling the model or
        reading its reply is printed and reported as
        ``(0, "Error processing message.")`` instead of being propagated.
    """
    try:
        system_instruction = "Message is a conversation between a patient and a healthcare professional"
        full_prompt = (
            f"{system_instruction}\n"
            # Trailing space matters: without it the prompt reads "once,otherwise".
            "Classify the following message as 1 if the patient shows depressive symptoms at least once, "
            "otherwise 0, and provide the reasoning.\n"
            "Output format:\n"
            "Classification: <0 or 1>\n"
            "Reason: <reason_text>\n\n"
            f"Message:\n{text}"
        )

        response = client.chat.completions.create(
            model=MODEL_ID,
            messages=[{"role": "user", "content": full_prompt}],
        )

        content = response.choices[0].message.content.strip()
        print(f"Extracted content: {content}")  # Debug: Print extracted content

        # Defaults used when the reply does not follow the requested format
        classification = 0
        reasoning = "No reason provided."

        # Accept decorated labels such as "**Classification:** 1" or
        # "Classification: 1." instead of silently falling back to 0.
        label_match = re.search(r"Classification:\W*([01])\b", content, re.IGNORECASE)
        if label_match:
            classification = int(label_match.group(1))

        # Keep the whole (possibly multi-line) reason, stopping only if a
        # "Classification:" line follows it.
        reason_match = re.search(
            r"Reason:[\s*]*(.+?)(?=\n[\s*]*Classification:|\Z)",
            content,
            re.IGNORECASE | re.DOTALL,
        )
        if reason_match:
            reasoning = reason_match.group(1).strip()

        print(f"Parsed values -> Classification: {classification}, Reason: {reasoning}")
        return classification, reasoning

    except Exception as e:
        # Best-effort by design: one failed request must not abort a long batch.
        print(f"Error processing message: {e}")
        return 0, "Error processing message."

# Collect one prediction and one explanation per dataset row, in row order
classifications = []
reasons = []

total_rows = len(data)

# Process each entry in the dataset.
# `position` is a 1-based counter, so progress reporting does not depend on
# the DataFrame index being 0..n-1 integers.
for position, (index, row) in enumerate(data.iterrows(), start=1):
    message = row.get("combined_msg_txt", "")  # Missing column -> empty string

    # Empty CSV cells load as NaN, which is truthy, so `not message` alone would
    # send the literal NaN to the model. Whitespace-only text is skipped too.
    if pd.isna(message) or not str(message).strip():
        print(f"Row {index} has no message. Skipping.")
        classifications.append(0)
        reasons.append("No message provided.")
        continue

    classification, reasoning = llama_model_fn(message)
    classifications.append(classification)
    reasons.append(reasoning)
    time.sleep(1)  # Sleep for 1 second to avoid too many requests

    # Optional: Print progress
    if position % 10 == 0 or position == total_rows:
        print(f"Processed {position}/{total_rows} messages.")

# Add classifications and reasons to the dataframe
data["predictions"] = classifications
data["reasoning"] = reasons

# Handle cases where classification was None or NaN.
# Assign the result back: an in-place fillna on a column selection is a chained
# write that pandas warns about and that does not update `data` under Copy-on-Write.
data["predictions"] = data["predictions"].fillna(0)

# Save the results to a CSV for review (placeholder path: fill in before running)
output_file = ""
data.to_csv(output_file, index=False)
print(f"Classifications saved to '{output_file}'.")

# Preview only: the frame holds message text, so avoid dumping every row into the output
print(data.head())


In [None]:
# Read a CSV back into `saved_data` (empty placeholder path: fill in before running).
# NOTE(review): presumably the predictions file written by the screening cell; confirm.
saved_data=pd.read_csv("")

In [None]:
from google.cloud import storage

def upload_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to write inside the bucket.

    Prints a confirmation line after the upload call returns.
    """
    # Resolve client -> bucket handle -> blob handle in a single chain
    target_blob = (
        storage.Client()
        .bucket(bucket_name)
        .blob(destination_blob_name)
    )

    # Push the local file's contents to that blob
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")


# Define your bucket name (empty placeholder: fill in before running)
bucket_name = ""

# File paths (empty placeholders)
source_file_name = ""  # Local file to upload
destination_blob_name = ""  # Object path inside the bucket; change the path if needed

# Upload the file
upload_to_bucket(bucket_name, source_file_name, destination_blob_name)

### Typical time from CVD diagnosis to depression (DEP) diagnosis

In [None]:
import pandas as pd

# Load the data (empty placeholder path: fill in before running)
df_1 = pd.read_csv("")

# Convert 'cvd_start_date_jittered' and 'dep_start_date_jittered' to datetime
df_1['cvd_start_date_jittered'] = pd.to_datetime(df_1['cvd_start_date_jittered'], format='%Y-%m-%d %H:%M:%S')
df_1['dep_start_date_jittered'] = pd.to_datetime(df_1['dep_start_date_jittered'], format='%Y-%m-%d %H:%M:%S')

# Time from the CVD start date to the DEP start date.
# Subtracting two datetime columns already yields a timedelta column, so no
# extra pd.to_timedelta conversion is needed.
df_1['difference'] = df_1['dep_start_date_jittered'] - df_1['cvd_start_date_jittered']

# Convert 'difference' column to total hours
df_1['difference_hours'] = df_1['difference'].dt.total_seconds() / 3600

# Calculate the average of the 'difference_hours' column
average_difference_hours = df_1['difference_hours'].mean()

# Convert the average back to Timedelta in hours
average_difference = pd.to_timedelta(average_difference_hours, unit='h')

# Add the average as a new column named 'average' (same value on every row)
df_1['average'] = average_difference

# Save the frame computed in this cell. The original wrote `data`, a different
# DataFrame loaded by an earlier cell, so these results were never persisted.
df_1.to_csv("", index=False)

# Display a preview of the updated DataFrame (once, and without dumping every row)
print(df_1.head())


### Average number of messages per patient

In [None]:

import pandas as pd

# Load the data (empty placeholder path: fill in before running).
# Note: this rebinds `df_1`, which the previous section used for a different file.
df_1 = pd.read_csv("")


# Calculate the average of the 'message_id_count' column
# NOTE(review): presumably one row per patient, so this is messages per patient; confirm.
msg_freq_ave = df_1['message_id_count'].mean()


# Display the computed mean (a single number, not a DataFrame)
print(msg_freq_ave)


### Accuracy computation (threshold = 1)

In [None]:
# CASE

In [None]:
from google.cloud import storage
import pandas as pd

# One Cloud Storage client serves every download in this cell
client = storage.Client()

# Remote object names and the local filenames they are saved under (parts 1-5)
bucket_name = ""
part_numbers = range(1, 6)
gcs_file_paths = [f'path/_{i}.csv' for i in part_numbers]
local_file_paths = [f"file{i}.csv" for i in part_numbers]

# Fetch each remote part to its matching local file
bucket = client.get_bucket(bucket_name)
for remote_name, local_name in zip(gcs_file_paths, local_file_paths):
    bucket.blob(remote_name).download_to_filename(local_name)

# Load every part, stack them with a fresh 0..n-1 index, and tag all rows as cases (label = 1)
dataframes = list(map(pd.read_csv, local_file_paths))
case_df = pd.concat(dataframes, ignore_index=True).assign(label=1)

# Persist the labelled case set (empty placeholder path)
case_df.to_csv("", index=False)

print("Datasets combined and label column added successfully!")

In [None]:
# CONTROL

In [None]:
from google.cloud import storage
import pandas as pd

# One Cloud Storage client serves every download in this cell
client = storage.Client()

# Remote object names and the local filenames they are saved under (parts 1-5).
# NOTE(review): these patterns are identical to the ones in the CASE cell, so the
# downloads land on the same local filenames; confirm the real paths differ.
bucket_name = ""
part_numbers = range(1, 6)
gcs_file_paths = [f'path/_{i}.csv' for i in part_numbers]
local_file_paths = [f"file{i}.csv" for i in part_numbers]

# Fetch each remote part to its matching local file
bucket = client.get_bucket(bucket_name)
for remote_name, local_name in zip(gcs_file_paths, local_file_paths):
    bucket.blob(remote_name).download_to_filename(local_name)

# Load every part, stack them with a fresh 0..n-1 index, and tag all rows as controls (label = 0)
dataframes = list(map(pd.read_csv, local_file_paths))
control_df = pd.concat(dataframes, ignore_index=True).assign(label=0)

# Persist the labelled control set (empty placeholder path)
control_df.to_csv("", index=False)

print("Datasets combined and label column added successfully!")

In [None]:
# combined case and control (when, th=1)

import pandas as pd

# Input CSVs to stack (empty placeholders: fill in before running)
file1 = ""
file2 = ""

# Load both inputs in order
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1, file2))

# Stack the rows of the two frames under a fresh 0..n-1 index.
# No column is added here; any 'label' column must already be in the inputs.
comb_df = pd.concat([df1, df2], ignore_index=True)

# Write the combined frame out (empty placeholder path)
comb_df.to_csv("", index=False)

print("Files concatenated and label column added successfully!")

In [None]:
# upload the file in the bucket

from google.cloud import storage

def upload_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    NOTE(review): a function with this same name and signature is already
    defined earlier in the notebook; consider keeping a single definition.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to write inside the bucket.

    Prints a confirmation line after the upload call returns.
    """
    # Resolve client -> bucket handle -> blob handle in a single chain
    target_blob = (
        storage.Client()
        .bucket(bucket_name)
        .blob(destination_blob_name)
    )

    # Push the local file's contents to that blob
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")


# Define your bucket name (empty placeholder: fill in before running)
bucket_name = ""

# File paths (empty placeholders)
source_file_name = ""  # Local file to upload
destination_blob_name = ""  # Object path inside the bucket; change the path if needed

# Upload the file
upload_to_bucket(bucket_name, source_file_name, destination_blob_name)

In [None]:
# classification test

In [None]:
# WITH 95% CI

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Load the data from the CSV file (empty placeholder path: fill in before running)
df = pd.read_csv("")

# Extract the 'label' (ground truth) and 'predictions' (model output) columns
y_true = df['label']
y_pred = df['predictions']

def bootstrap_confidence_interval(y_true, y_pred, metric_func, n_bootstraps=1000, ci=95, random_state=None, **kwargs):
    """
    Calculates a percentile confidence interval for a metric using bootstrapping.

    Rows are resampled with replacement (the same positions are taken from
    ``y_true`` and ``y_pred``), the metric is evaluated on every resample, and
    the interval is read off the percentiles of the resulting scores.

    Parameters:
        y_true (array-like): True labels (pandas Series, list or NumPy array).
        y_pred (array-like): Predicted labels, same length as ``y_true``.
        metric_func (function): Metric called as ``metric_func(y_true, y_pred, **kwargs)``
            (e.g., f1_score). It receives NumPy arrays.
        n_bootstraps (int): Number of bootstrap samples.
        ci (float): Confidence level in percent (e.g., 95 for 95% CI).
        random_state (int | None): Seed for a dedicated random generator, making the
            interval reproducible. When None (default) the resamples are drawn from
            NumPy's global random state, exactly as before this parameter existed.
        **kwargs: Additional keyword arguments for the metric function.

    Returns:
        tuple: Lower and upper bounds of the confidence interval.

    Raises:
        ValueError: If the inputs are empty or differ in length, if ``ci`` is
            outside (0, 100], or if the metric could not be computed on any
            bootstrap sample.
    """
    # Positional NumPy indexing works for Series (any index), lists and arrays alike
    y_true_arr = np.asarray(y_true)
    y_pred_arr = np.asarray(y_pred)

    n = len(y_true_arr)
    if n == 0:
        raise ValueError("Cannot bootstrap an empty sample.")
    if len(y_pred_arr) != n:
        raise ValueError(
            f"y_true and y_pred must have the same length (got {n} and {len(y_pred_arr)})."
        )
    if not 0 < ci <= 100:
        raise ValueError(f"ci must be in the interval (0, 100], got {ci}.")

    # Only create a private generator when a seed is requested, so callers that
    # seed np.random globally keep getting the same draws as before.
    rng = None if random_state is None else np.random.default_rng(random_state)

    boot_scores = []
    for _ in range(n_bootstraps):
        # Sample row positions with replacement
        if rng is None:
            indices = np.random.randint(0, n, n)
        else:
            indices = rng.integers(0, n, n)

        try:
            boot_scores.append(metric_func(y_true_arr[indices], y_pred_arr[indices], **kwargs))
        except ValueError:
            continue  # Skip this bootstrap sample if metric calculation fails

    if not boot_scores:
        raise ValueError("The metric could not be computed on any bootstrap sample.")

    # Calculate percentiles for the confidence interval
    tail = (100 - ci) / 2
    lower, upper = np.percentile(boot_scores, [tail, 100 - tail])
    return lower, upper

# Seed NumPy's global random state so the bootstrap intervals below come out
# the same on every run (the resampling draws from np.random).
BOOTSTRAP_SEED = 42
np.random.seed(BOOTSTRAP_SEED)

# Calculate the point estimates for the metrics
f1 = f1_score(y_true, y_pred, average='binary')  # Adjust 'average' as needed
precision = precision_score(y_true, y_pred, average='binary')
recall = recall_score(y_true, y_pred, average='binary')
accuracy = accuracy_score(y_true, y_pred)

# Calculate the 95% confidence intervals using bootstrapping.
# Each call resamples independently, continuing the seeded random stream.
f1_ci = bootstrap_confidence_interval(y_true, y_pred, f1_score, average='binary')
precision_ci = bootstrap_confidence_interval(y_true, y_pred, precision_score, average='binary')
recall_ci = bootstrap_confidence_interval(y_true, y_pred, recall_score, average='binary')
accuracy_ci = bootstrap_confidence_interval(y_true, y_pred, accuracy_score)

# Print the results as "point estimate (95% CI: lower - upper)"
print(f'F1 Score: {f1:.4f} (95% CI: {f1_ci[0]:.4f} - {f1_ci[1]:.4f})')
print(f'Precision: {precision:.4f} (95% CI: {precision_ci[0]:.4f} - {precision_ci[1]:.4f})')
print(f'Recall: {recall:.4f} (95% CI: {recall_ci[0]:.4f} - {recall_ci[1]:.4f})')
print(f'Accuracy: {accuracy:.4f} (95% CI: {accuracy_ci[0]:.4f} - {accuracy_ci[1]:.4f})')

### Changes in time to diagnosis (threshold=3)

In [None]:
# when threshold = 3

In [None]:
import pandas as pd

# Number of positive predictions a patient must accumulate before being flagged
OCCURRENCE_THRESHOLD = 3

# Load the per-message predictions (empty placeholder path: fill in before running)
data = pd.read_csv("")

# Work on an explicit copy so the column assignments below cannot leak back into `data`
df_CVD_Dep = data.copy()

# Convert 'created_time_jittered' and 'dep_start_date_jittered' to datetime
df_CVD_Dep['created_time_jittered'] = pd.to_datetime(df_CVD_Dep['created_time_jittered'], format='%Y-%m-%d %H:%M:%S')
df_CVD_Dep['dep_start_date_jittered'] = pd.to_datetime(df_CVD_Dep['dep_start_date_jittered'], format='%Y-%m-%d %H:%M:%S')

# Number each patient's positive predictions 1, 2, 3, ... in message-time order.
# The stable sort keeps file order for equal timestamps, and the result is aligned
# back on the index, so the row order of df_CVD_Dep does not change. Rows whose
# prediction is not 1 get NaN.
is_positive = df_CVD_Dep['predictions'] == 1
df_CVD_Dep['occurrence'] = (
    df_CVD_Dep[is_positive]
    .sort_values('created_time_jittered', kind='stable')
    .groupby('anon_id')
    .cumcount()
    + 1
)

# Time from the threshold-th positive message to the DEP start date.
# Only the row holding that occurrence gets a value; every other row is NaT.
at_threshold = is_positive & (df_CVD_Dep['occurrence'] == OCCURRENCE_THRESHOLD)
df_CVD_Dep['difference'] = (
    df_CVD_Dep['dep_start_date_jittered'] - df_CVD_Dep['created_time_jittered']
).where(at_threshold)

# Save the DataFrame to a CSV file (empty placeholder path)
df_CVD_Dep.to_csv("", index=False)

# Display a preview of the DataFrame (avoid dumping every row)
print(df_CVD_Dep.head())



In [None]:
# Keep only the rows that carry a value in the 'difference' column.
# .copy() makes this an independent frame, so the column assignments below do
# not trigger SettingWithCopyWarning or write through to df_CVD_Dep.
real_difference_CVD_Dep = df_CVD_Dep.dropna(subset=['difference']).copy()

# Convert 'difference' column to Timedelta
real_difference_CVD_Dep['difference'] = pd.to_timedelta(real_difference_CVD_Dep['difference'])

# Convert 'difference' column to total hours
real_difference_CVD_Dep['difference_hours'] = real_difference_CVD_Dep['difference'].dt.total_seconds() / 3600

# Calculate the average of the 'difference_hours' column
average_difference_hours = real_difference_CVD_Dep['difference_hours'].mean()

# Convert the average back to Timedelta in hours
average_difference = pd.to_timedelta(average_difference_hours, unit='h')

# Add the average as a new column named 'average' (same value on every row)
real_difference_CVD_Dep['average'] = average_difference

# Save the frame computed in this cell. The original wrote `data`, the frame
# loaded from CSV in the previous cell, so these results were never persisted.
real_difference_CVD_Dep.to_csv("", index=False)

# Display a preview of the updated DataFrame (avoid dumping every row)
print(real_difference_CVD_Dep.head())
