In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
client = MongoClient('mongodb://localhost:27017/')
db = client['recommender_system']
admissions = db['admissions']
patients = db['patients']
diagnoses_icd = db['diagnoses_icd']
procedures_icd = db['procedures_icd']
d_icd_diagnoses = db['d_icd_diagnoses']
d_icd_procedures = db['d_icd_procedures']
nies = db['nies']

In [3]:
df_admissions = pd.DataFrame(list(admissions.find()))
df_patients = pd.DataFrame(list(patients.find()))
df_diagnoses_icd = pd.DataFrame(list(diagnoses_icd.find()))
df_procedures_icd = pd.DataFrame(list(procedures_icd.find()))
df_d_icd_diagnoses = pd.DataFrame(list(d_icd_diagnoses.find()))
df_d_icd_procedures = pd.DataFrame(list(d_icd_procedures.find()))
df_nies = pd.DataFrame(list(nies.find()))

In [4]:
merged_data = pd.merge(df_admissions, df_patients, on='subject_id', how='left')
diagnoses_merged = pd.merge(merged_data, df_diagnoses_icd, on=['hadm_id', 'subject_id'], how='left')

In [5]:
icd_to_category = {
    '250': 'Q45_QO9',  # Diabetes and related problems
    '390-459': 'Q45_QO2',  # Heart condition
    '460-519': 'Q45_QO3',  # Lung condition
    '320-389': 'Q45_QO4',  # Neurological condition
    '710-739': 'Q45_QO5',  # Orthopaedic condition
    'U07.1': 'Q45_QO6',  # COVID-19
    '001-139': 'Q45_QO7',  # Infection (other than COVID-19)
    '520-579': 'Q45_QO8',  # Digestive system condition
    '140-239': 'Q45_QO1',  # Tumour or cancer
    '960-989': 'Q45_QO10',  # Adverse reaction or poisoning
    '800-999': 'Q45_QO11',  # Injury and or accident
    '290-319': 'Q45_QO12',  # Mental health issue
    'V01-V91': 'Q45_QO13',  # Tests and or investigations
    'Unknown': 'Q45_QO14'  # Dont know or wasnt told
}
category_labels = {
    'Q45_QO1': 'Tumour or cancer',
    'Q45_QO2': 'Heart condition',
    'Q45_QO3': 'Lung condition',
    'Q45_QO4': 'Neurological condition',
    'Q45_QO5': 'Orthopaedic condition',
    'Q45_QO6': 'COVID 19',
    'Q45_QO7': 'Infection (other than COVID 19)',
    'Q45_QO8': 'Digestive system condition',
    'Q45_QO9': 'Diabetes and related problems',
    'Q45_QO10': 'Adverse reaction or poising',
    'Q45_QO11': 'Injury and or accident',
    'Q45_QO12': 'Mental health issue',
    'Q45_QO13': 'Tests and or investigations',
    'Q45_QO14': 'Dont know or wasnt told',
    'Q45_QO15': 'Other'
}

In [6]:
def classify_diagnosis(icd_code):
    if pd.isna(icd_code):
        return category_labels['Q45_QO15']
    for code_range, category in icd_to_category.items():
        if '-' in code_range:
            start, end = code_range.split('-')
            try:
                if start <= str(icd_code) <= end:
                    return category_labels[category]
            except:
                continue
        elif code_range == str(icd_code):
            return category_labels[category]
    return category_labels['Q45_QO15']

In [7]:
diagnoses_merged['condition_label'] = diagnoses_merged['icd_code'].apply(classify_diagnosis)
diagnoses_grouped = diagnoses_merged.groupby(['hadm_id', 'subject_id'])['condition_label'].apply(lambda x: ', '.join(set(x))).reset_index()
patient_data = pd.merge(merged_data, diagnoses_grouped, on=['hadm_id', 'subject_id'], how='left')

In [8]:
df_nies['gender'] = df_nies['Gender'].map({1: 'M', 2: 'F'})
admission_type_map = {
    'EU OBSERVATION': 0.0,
    'URGENT': 0.0,
    'EW EMER.': 1.0,
    'SURGICAL SAME DAY ADMISSION': 0.0,
    'DIRECT EMER.': 0.0
}
patient_data['AdmTypeBinary'] = patient_data['admission_type'].map(admission_type_map).fillna(0.0)
nies_avg_scores = df_nies.groupby(['AdmTypeBinary', 'gender', 'condition_label'])['satisfaction_score'].mean().reset_index()

In [9]:
patient_data = patient_data.assign(condition_label=patient_data['condition_label'].str.split(', ')).explode('condition_label')
patient_data = pd.merge(
    patient_data,
    nies_avg_scores,
    left_on=['AdmTypeBinary', 'gender', 'condition_label'],
    right_on=['AdmTypeBinary', 'gender', 'condition_label'],
    how='left'
)
patient_data = patient_data.groupby(['hadm_id', 'subject_id', 'admission_type', 'insurance', 'gender', 'anchor_age', 'AdmTypeBinary'])['satisfaction_score'].mean().reset_index()

In [10]:
procedures_merged = pd.merge(patient_data, df_procedures_icd, on=['hadm_id', 'subject_id'], how='left')
procedures_merged = procedures_merged.dropna(subset=['icd_code'])

In [11]:
all_procedures = set()
for codes in procedures_merged['icd_code'].dropna():
    all_procedures.add(str(codes))
matrix = pd.pivot_table(
    procedures_merged,
    values='satisfaction_score',
    index=['hadm_id', 'subject_id', 'gender', 'anchor_age', 'admission_type', 'insurance'],
    columns='icd_code',
    aggfunc='mean',
    fill_value=0
)
logger.info(f"Matrix shape: {matrix.shape}")
num_procedures_per_admission = (matrix > 0).sum(axis=1)
logger.info(f"Number of procedures per admission:\n{num_procedures_per_admission.describe()}")

INFO:__main__:Matrix shape: (163, 309)
INFO:__main__:Number of procedures per admission:
count    163.000000
mean       2.963190
std        2.557703
min        1.000000
25%        1.000000
50%        2.000000
75%        4.000000
max       17.000000
dtype: float64


In [12]:
# Create train and test matrices by masking some procedures
train_matrix = matrix.copy()
test_matrix = pd.DataFrame(0.0, index=matrix.index, columns=matrix.columns)

for idx in matrix.index:
    row = matrix.loc[idx]
    non_zero_cols = row[row > 0].index
    if len(non_zero_cols) >= 2:
        num_test = max(1, int(len(non_zero_cols) * 0.2))
        test_procs = np.random.choice(non_zero_cols, num_test, replace=False)
        logger.debug(f"Admission {idx}: Masking {num_test} procedures: {test_procs}")
        train_matrix.loc[idx, test_procs] = 0.0
        test_matrix.loc[idx, test_procs] = row[test_procs]

# Verify test_matrix has non-zero values
num_test_procs = (test_matrix > 0).sum().sum()
num_admissions_with_test = (test_matrix.sum(axis=1) > 0).sum()
logger.info(f"Number of test procedures: {num_test_procs}")
logger.info(f"Number of admissions with test procedures: {num_admissions_with_test}")
assert num_test_procs > 0, "No procedures were masked for testing. Check data or masking logic."
assert train_matrix.index.equals(test_matrix.index), "Train and test matrices have different indices."
assert not test_matrix.isna().any().any(), "Test matrix contains NaN values."
assert not train_matrix.isna().any().any(), "Train matrix contains NaN values."

INFO:__main__:Number of test procedures: 113
INFO:__main__:Number of admissions with test procedures: 107


In [None]:
# Stage 1: Collaborative Filtering (User-User)
def collaborative_filtering_recommendations(train_matrix, patient_idx, n_neighbors=3, n_recommendations=5):
    patient_features = train_matrix.reset_index()[['gender', 'anchor_age', 'admission_type', 'insurance']]
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_features = encoder.fit_transform(patient_features[['gender', 'admission_type', 'insurance']])
    scaler = StandardScaler()
    scaled_age = scaler.fit_transform(patient_features[['anchor_age']])
    patient_features_combined = np.hstack([encoded_features, scaled_age])
    similarity_matrix = cosine_similarity(patient_features_combined)
    similarities = similarity_matrix[patient_idx]
    neighbor_indices = np.argsort(similarities)[-n_neighbors-1:-1][::-1]
    neighbor_procedures = train_matrix.iloc[neighbor_indices].mean(axis=0)
    patient_procedures = train_matrix.iloc[patient_idx]
    neighbor_procedures[patient_procedures > 0] = 0
    recommendations = neighbor_procedures.sort_values(ascending=False).head(n_recommendations)
    return recommendations.index.tolist()

In [None]:
# Stage 2: Matrix Factorization (SVD)
def svd_recommendations(train_matrix, n_components=10, n_recommendations=5):
    n_components = min(n_components, train_matrix.shape[1]-1, train_matrix.shape[0]-1)
    svd = TruncatedSVD(n_components=n_components)
    matrix_sparse = csr_matrix(train_matrix.values)
    latent_factors = svd.fit_transform(matrix_sparse)
    reconstructed_matrix = np.dot(latent_factors, svd.components_)
    recommendations = []
    for patient_idx in range(train_matrix.shape[0]):
        predicted_scores = reconstructed_matrix[patient_idx]
        patient_procedures = train_matrix.iloc[patient_idx]
        predicted_scores[patient_procedures > 0] = 0
        top_procedures = np.argsort(predicted_scores)[-n_recommendations:][::-1]
        procedures = train_matrix.columns[top_procedures].tolist()
        recommendations.append(procedures)
    return recommendations

In [None]:
# Stage 3: Content-Based Filtering
def content_based_recommendations(procedures_merged, train_matrix, df_d_icd_procedures, n_recommendations=5):
    tfidf = TfidfVectorizer(stop_words='english')
    procedure_texts = df_d_icd_procedures['long_title'].fillna('')
    tfidf_matrix = tfidf.fit_transform(procedure_texts)
    code_to_idx = {code: idx for idx, code in enumerate(df_d_icd_procedures['icd_code'])}
    recommendations = []
    for idx in train_matrix.index:
        hadm_id, subject_id = idx[0], idx[1]
        patient_procs = train_matrix.loc[idx][train_matrix.loc[idx] > 0].index.tolist()
        patient_vectors = np.zeros(tfidf_matrix.shape[1])
        valid_procedures = 0
        for proc in patient_procs:
            if proc in code_to_idx:
                patient_vectors += tfidf_matrix[code_to_idx[proc]].toarray()[0]
                valid_procedures += 1
        if valid_procedures > 0:
            patient_vectors /= valid_procedures
        similarities = cosine_similarity([patient_vectors], tfidf_matrix)[0]
        top_indices = np.argsort(similarities)[-n_recommendations-1:-1][::-1]
        rec_procedures = df_d_icd_procedures.iloc[top_indices]['icd_code'].tolist()
        recommendations.append(rec_procedures)
    return recommendations

In [None]:
# Stage 4: NMF (Non-negative Matrix Factorization)
def nmf_recommendations(train_matrix, n_components=10, n_recommendations=5):
    n_components = min(n_components, train_matrix.shape[1]-1, train_matrix.shape[0]-1)
    nmf = NMF(n_components=n_components, init='random', random_state=42)
    matrix_sparse = csr_matrix(train_matrix.values)
    W = nmf.fit_transform(matrix_sparse)
    H = nmf.components_
    reconstructed_matrix = np.dot(W, H)
    recommendations = []
    for patient_idx in range(train_matrix.shape[0]):
        predicted_scores = reconstructed_matrix[patient_idx]
        patient_procedures = train_matrix.iloc[patient_idx]
        predicted_scores[patient_procedures > 0] = 0
        top_procedures = np.argsort(predicted_scores)[-n_recommendations:][::-1]
        recommendations.append(train_matrix.columns[top_procedures].tolist())
    return recommendations

In [17]:
# Evaluation Metrics
def evaluate_recommendations(train_matrix, recommendations, test_matrix):
    hit_count = 0
    total = 0
    rmse, mae = 0, 0
    satisfaction_lift = []
    for i, idx in enumerate(train_matrix.index):
        if idx not in test_matrix.index:
            logger.debug(f"Skipping {idx}: Not in test matrix")
            continue
        test_procs = test_matrix.loc[idx][test_matrix.loc[idx] > 0].index.tolist()
        if not test_procs:
            logger.debug(f"Skipping {idx}: No test procedures")
            continue
        recs = recommendations[i]
        hits = len(set(recs).intersection(set(test_procs)))
        if hits > 0:
            hit_count += 1
        total += 1
        logger.debug(f"Admission {idx}: Test procs {test_procs}, Recs {recs}, Hits {hits}")
        # Compute RMSE and MAE for SVD and NMF
        predicted_scores = np.zeros(len(train_matrix.columns))
        for proc in recs:
            if proc in train_matrix.columns:
                predicted_scores[train_matrix.columns.get_loc(proc)] = 1
        actual_scores = test_matrix.loc[idx].values
        mse = mean_squared_error(actual_scores, predicted_scores)
        rmse += np.sqrt(mse)
        mae += mean_absolute_error(actual_scores, predicted_scores)
        # Satisfaction lift
        rec_scores = [test_matrix.loc[idx][proc] for proc in recs if proc in test_matrix.columns and test_matrix.loc[idx][proc] > 0]
        if rec_scores:
            mean_test_scores = test_matrix.loc[idx][test_matrix.loc[idx] > 0].mean()
            if not np.isnan(mean_test_scores):
                satisfaction_lift.append(np.mean(rec_scores) - mean_test_scores)
    hit_rate = hit_count / total if total > 0 else float('nan')
    rmse = rmse / total if total > 0 else float('nan')
    mae = mae / total if total > 0 else float('nan')
    mean_satisfaction_lift = np.mean(satisfaction_lift) if satisfaction_lift else float('nan')
    logger.info(f"Total evaluations: {total}, Hit count: {hit_count}, Hit rate: {hit_rate}")
    return {'RMSE': rmse, 'MAE': mae, 'Hit Rate': hit_rate, 'Mean Satisfaction Lift': mean_satisfaction_lift}

In [18]:
# Interactive Visualization: Procedure-to-Procedure Transition Effectiveness
def plot_transition_heatmap(train_matrix):
    procedure_counts = train_matrix.sum().sort_values(ascending=False)
    top_procedures = procedure_counts.head(20).index
    filtered_matrix = train_matrix[top_procedures]
    transition_matrix = np.zeros((len(top_procedures), len(top_procedures)))
    for i in range(len(filtered_matrix)):
        procedures = filtered_matrix.iloc[i]
        proc_indices = np.where(procedures > 0)[0]
        for j in proc_indices:
            for k in proc_indices:
                if j != k:
                    transition_matrix[j, k] += procedures.iloc[j]
    fig = px.imshow(transition_matrix,
                    labels=dict(x="Procedure", y="Procedure", color="Transition Frequency"),
                    x=top_procedures,
                    y=top_procedures,
                    color_continuous_scale='Blues',
                    title="Procedure-to-Procedure Transition Effectiveness")
    fig.update_layout(autosize=False, width=800, height=800, margin=dict(l=50, r=50, t=100, b=50))
    fig.show()

In [25]:
# Generate recommendations
cf_recommendations = [collaborative_filtering_recommendations(train_matrix, i) for i in range(len(train_matrix))]
cf_metrics = evaluate_recommendations(train_matrix, cf_recommendations, test_matrix)
print('Collaborative Filtering Metrics:', cf_metrics)

svd_recs = svd_recommendations(train_matrix)
svd_metrics = evaluate_recommendations(train_matrix, svd_recs, test_matrix)
print('SVD Metrics:', svd_metrics)
# Run Content-Based Filtering
cb_recs = content_based_recommendations(procedures_merged, train_matrix, df_d_icd_procedures)
cb_metrics = evaluate_recommendations(train_matrix, cb_recs, test_matrix)
print('Content-Based Metrics:', cb_metrics)

# Run NMF
nmf_recs = nmf_recommendations(train_matrix)
nmf_metrics = evaluate_recommendations(train_matrix, nmf_recs, test_matrix)
print('NMF Metrics:', nmf_metrics)

# Plot interactive heatmap
plot_transition_heatmap(train_matrix)

INFO:__main__:Total evaluations: 107, Hit count: 3, Hit rate: 0.028037383177570093
INFO:__main__:Total evaluations: 107, Hit count: 10, Hit rate: 0.09345794392523364


Collaborative Filtering Metrics: {'RMSE': 0.10849146707459321, 'MAE': 0.01232019284580152, 'Hit Rate': 0.028037383177570093, 'Mean Satisfaction Lift': 0.0}
SVD Metrics: {'RMSE': 0.10667981759308062, 'MAE': 0.01196661419872068, 'Hit Rate': 0.09345794392523364, 'Mean Satisfaction Lift': 1.1102230246251566e-17}


INFO:__main__:Total evaluations: 107, Hit count: 2, Hit rate: 0.018691588785046728
INFO:__main__:Total evaluations: 107, Hit count: 6, Hit rate: 0.056074766355140186


Content-Based Metrics: {'RMSE': 0.0758516350178299, 'MAE': 0.006812027154722522, 'Hit Rate': 0.018691588785046728, 'Mean Satisfaction Lift': 0.0}
NMF Metrics: {'RMSE': 0.10771345936068843, 'MAE': 0.0121630848413627, 'Hit Rate': 0.056074766355140186, 'Mean Satisfaction Lift': 1.850371707708594e-17}


In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import joblib
import os
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def collaborative_filtering_recommendations(train_matrix, patient_idx, n_neighbors=5, n_recommendations=5, encoder=None, scaler=None):
    """
    Collaborative Filtering recommendations for a patient, returning encoder and scaler.
    
    Parameters:
    - train_matrix (pd.DataFrame): Training matrix with admissions as rows, procedure ICD codes as columns.
    - patient_idx (int): Index of the patient in train_matrix.
    - n_neighbors (int): Number of similar patients to consider.
    - n_recommendations (int): Number of procedures to recommend.
    - encoder (OneHotEncoder, optional): Pre-trained encoder for categorical features.
    - scaler (StandardScaler, optional): Pre-trained scaler for age.
    
    Returns:
    - list: Recommended procedure ICD codes.
    - OneHotEncoder: Encoder used for categorical features (new or provided).
    - StandardScaler: Scaler used for age (new or provided).
    """
    patient_features = train_matrix.reset_index()[['gender', 'anchor_age', 'admission_type', 'insurance']]
    
    # Initialize encoder and scaler if not provided
    if encoder is None:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_features = encoder.fit_transform(patient_features[['gender', 'admission_type', 'insurance']])
    else:
        encoded_features = encoder.transform(patient_features[['gender', 'admission_type', 'insurance']])
    
    if scaler is None:
        scaler = StandardScaler()
        scaled_age = scaler.fit_transform(patient_features[['anchor_age']])
    else:
        scaled_age = scaler.transform(patient_features[['anchor_age']])
    
    patient_features_combined = np.hstack([encoded_features, scaled_age])
    similarity_matrix = cosine_similarity(patient_features_combined)
    similarities = similarity_matrix[patient_idx]
    neighbor_indices = np.argsort(similarities)[-n_neighbors-1:-1][::-1]
    neighbor_procedures = train_matrix.iloc[neighbor_indices].mean(axis=0)
    patient_procedures = train_matrix.iloc[patient_idx]
    neighbor_procedures[patient_procedures > 0] = 0
    recommendations = neighbor_procedures.sort_values(ascending=False).head(n_recommendations).index.tolist()
    
    return recommendations, encoder, scaler

def svd_recommendations(train_matrix, n_components=10, n_recommendations=5, save_model=True):
    """
    SVD recommendations, returning the trained model and saving it optionally.
    """
    n_components = min(n_components, train_matrix.shape[1]-1, train_matrix.shape[0]-1)
    svd = TruncatedSVD(n_components=n_components)
    matrix_sparse = csr_matrix(train_matrix.values)
    latent_factors = svd.fit_transform(matrix_sparse)
    reconstructed_matrix = np.dot(latent_factors, svd.components_)
    recommendations = []
    for patient_idx in range(train_matrix.shape[0]):
        predicted_scores = reconstructed_matrix[patient_idx]
        patient_procedures = train_matrix.iloc[patient_idx]
        predicted_scores[patient_procedures > 0] = 0
        top_procedures = np.argsort(predicted_scores)[-n_recommendations:][::-1]
        procedures = train_matrix.columns[top_procedures].tolist()
        recommendations.append(procedures)
    
    # Save SVD model
    if save_model:
        joblib.dump(svd, 'svd_model.joblib')
    return recommendations, svd

def recommend_procedures_for_new_patient(new_patient, train_matrix, df_d_icd_procedures, svd_model=None, encoder=None, scaler=None, n_recommendations=5, n_neighbors=5):
    """
    Recommend procedures for a new patient using Collaborative Filtering and SVD.
    
    Parameters:
    - new_patient (dict): Dictionary with patient data (subject_id, hadm_id, gender, anchor_age, admission_type, insurance, condition_label, known_procedures).
    - train_matrix (pd.DataFrame): Training matrix with admissions as rows, procedure ICD codes as columns.
    - df_d_icd_procedures (pd.DataFrame): DataFrame with procedure ICD codes and descriptions.
    - svd_model (TruncatedSVD, optional): Trained SVD model. Loads from 'svd_model.joblib' if None.
    - encoder (OneHotEncoder, optional): Trained encoder. Loads from 'onehot_encoder.joblib' if None.
    - scaler (StandardScaler, optional): Trained scaler. Loads from 'standard_scaler.joblib' if None.
    - n_recommendations (int): Number of procedures to recommend.
    - n_neighbors (int): Number of neighbors for Collaborative Filtering.
    
    Returns:
    - list: List of dictionaries with recommended procedure ICD codes and descriptions.
    """
    # Load models if not provided
    if encoder is None:
        if os.path.exists('onehot_encoder.joblib'):
            encoder = joblib.load('onehot_encoder.joblib')
        else:
            raise FileNotFoundError("OneHotEncoder not provided and 'onehot_encoder.joblib' not found.")
    if scaler is None:
        if os.path.exists('standard_scaler.joblib'):
            scaler = joblib.load('standard_scaler.joblib')
        else:
            raise FileNotFoundError("StandardScaler not provided and 'standard_scaler.joblib' not found.")
    if svd_model is None:
        if os.path.exists('svd_model.joblib'):
            svd_model = joblib.load('svd_model.joblib')
        else:
            raise FileNotFoundError("SVD model not provided and 'svd_model.joblib' not found.")
    
    # Prepare new patient data
    patient_features = pd.DataFrame({
        'gender': [new_patient['gender']],
        'anchor_age': [new_patient['anchor_age']],
        'admission_type': [new_patient['admission_type']],
        'insurance': [new_patient['insurance']]
    })
    
    # Encode categorical features and scale age
    try:
        encoded_features = encoder.transform(patient_features[['gender', 'admission_type', 'insurance']])
        scaled_age = scaler.transform(patient_features[['anchor_age']])
    except ValueError as e:
        logger.error(f"Feature encoding/scaling error: {e}")
        raise ValueError("New patient data contains values not seen during training (e.g., unknown admission_type or insurance).")
    
    patient_features_combined = np.hstack([encoded_features, scaled_age])
    
    # Collaborative Filtering: Find similar patients
    train_features = train_matrix.reset_index()[['gender', 'anchor_age', 'admission_type', 'insurance']]
    train_encoded = encoder.transform(train_features[['gender', 'admission_type', 'insurance']])
    train_scaled_age = scaler.transform(train_features[['anchor_age']])
    train_features_combined = np.hstack([train_encoded, train_scaled_age])
    
    similarities = cosine_similarity(patient_features_combined, train_features_combined)[0]
    neighbor_indices = np.argsort(similarities)[-n_neighbors:][::-1]
    neighbor_procedures = train_matrix.iloc[neighbor_indices].mean(axis=0)
    
    # Remove known procedures
    known_procs = new_patient.get('known_procedures', [])
    neighbor_procedures[known_procs] = 0
    cf_recommendations = neighbor_procedures.sort_values(ascending=False).head(n_recommendations).index.tolist()
    
    # SVD: Predict procedure scores
    new_patient_vector = pd.Series(0.0, index=train_matrix.columns)
    for proc in known_procs:
        if proc in train_matrix.columns:
            new_patient_vector[proc] = 1.0
    matrix_sparse = csr_matrix(new_patient_vector.values.reshape(1, -1))
    latent_factors = svd_model.transform(matrix_sparse)
    predicted_scores = np.dot(latent_factors, svd_model.components_)[0]
    
    # Remove known procedures
    predicted_scores[new_patient_vector > 0] = 0
    svd_recommendations = train_matrix.columns[np.argsort(predicted_scores)[-n_recommendations:][::-1]].tolist()
    
    # Combine recommendations
    combined_recommendations = list(set(cf_recommendations + svd_recommendations))
    if len(combined_recommendations) > n_recommendations:
        cf_scores = neighbor_procedures[combined_recommendations] / (neighbor_procedures.max() or 1)
        svd_scores = pd.Series(predicted_scores, index=train_matrix.columns)[combined_recommendations] / (predicted_scores.max() or 1)
        combined_scores = (cf_scores + svd_scores).fillna(0)
        combined_recommendations = combined_scores.sort_values(ascending=False).head(n_recommendations).index.tolist()
    
    # Map ICD codes to descriptions
    recommendations = []
    for icd_code in combined_recommendations:
        description = df_d_icd_procedures[df_d_icd_procedures['icd_code'] == icd_code]['long_title'].iloc[0] if icd_code in df_d_icd_procedures['icd_code'].values else "Unknown"
        recommendations.append({"icd_code": icd_code, "description": description})
    
    return recommendations


In [33]:

# Example execution (assumes train_matrix, df_d_icd_procedures, etc. from notebook)
if __name__ == "__main__":
    # Initialize encoder and scaler with first patient
    first_recs, encoder, scaler = collaborative_filtering_recommendations(train_matrix, 0)
    
    # Generate recommendations for all patients
    cf_recommendations = [first_recs] + [
        collaborative_filtering_recommendations(train_matrix, i, encoder=encoder, scaler=scaler)[0]
        for i in range(1, len(train_matrix))
    ]
    cf_metrics = evaluate_recommendations(train_matrix, cf_recommendations, test_matrix)
    print('Collaborative Filtering Metrics:', cf_metrics)
    
    # Generate SVD recommendations
    svd_recs, svd_model = svd_recommendations(train_matrix)
    svd_metrics = evaluate_recommendations(train_matrix, svd_recs, test_matrix)
    print('SVD Metrics:', svd_metrics)
    
    # Save encoder and scaler (already saved in first call, but ensure consistency)
    joblib.dump(encoder, 'onehot_encoder.joblib')
    joblib.dump(scaler, 'standard_scaler.joblib')
    
    # Sample new patient
    new_patient = {
        "subject_id": 999999,
        "hadm_id": 2999999,
        "gender": "W",
        "anchor_age": 75,
        "admission_type": "EW EMER.",
        "insurance": "Medicare",
        "condition_label": ["Lung condition", "Diabetes and related problems"],
        "known_procedures": []
    }
    
    # Recommend procedures for new patient
    try:
        recommendations = recommend_procedures_for_new_patient(
            new_patient, train_matrix, df_d_icd_procedures, svd_model, encoder, scaler
        )
        print("\nRecommended Procedures for New Patient:")
        for rec in recommendations:
            print(f"ICD Code: {rec['icd_code']}, Description: {rec['description']}")
    except Exception as e:
        logger.error(f"Error recommending procedures: {e}")

INFO:__main__:Total evaluations: 107, Hit count: 6, Hit rate: 0.056074766355140186
INFO:__main__:Total evaluations: 107, Hit count: 10, Hit rate: 0.09345794392523364


Collaborative Filtering Metrics: {'RMSE': 0.1345558347893827, 'MAE': 0.018655070776233795, 'Hit Rate': 0.056074766355140186, 'Mean Satisfaction Lift': 0.0}
SVD Metrics: {'RMSE': 0.13371637216392432, 'MAE': 0.018439106108105785, 'Hit Rate': 0.09345794392523364, 'Mean Satisfaction Lift': 1.1102230246251566e-17}

Recommended Procedures for New Patient:
ICD Code: 966, Description: Enteral infusion of concentrated nutritional substances
ICD Code: 3783, Description: Initial insertion of dual-chamber device
ICD Code: 3897, Description: Central venous catheter placement with guidance
ICD Code: 9904, Description: Transfusion of packed cells
ICD Code: 3323, Description: Other bronchoscopy


# Improvment 

In [45]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import joblib
import os
import logging
from pymongo.collection import Collection
import re

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Category labels from notebook (Cell 5), extended for ICD-10 diabetes codes
category_labels = {
    '001-139': 'Infectious and parasitic diseases',
    '140-239': 'Tumour or cancer',
    '240-279': 'Endocrine, nutritional and metabolic diseases',
    '250': 'Diabetes and related problems',
    'E10-E14': 'Diabetes and related problems',  # ICD-10 diabetes codes
    '280-289': 'Diseases of the blood and blood-forming organs',
    '290-319': 'Mental disorders',
    '320-389': 'Nervous system and sense organs diseases',
    '390-459': 'Heart condition',
    '460-519': 'Respiratory system diseases',
    '520-579': 'Digestive system diseases',
    '580-629': 'Genitourinary system diseases',
    '630-679': 'Complications of pregnancy, childbirth, and the puerperium',
    '680-709': 'Skin and subcutaneous tissue diseases',
    '710-739': 'Musculoskeletal system and connective tissue diseases',
    '740-759': 'Congenital anomalies',
    '760-779': 'Conditions originating in the perinatal period',
    '780-799': 'Symptoms, signs, and ill-defined conditions',
    '800-999': 'Injury and poisoning',
    'U07.1': 'COVID-19',
    'V01-Z99': 'Health status and contact with health services'
}

def normalize_icd_code(code):
    """
    Normalize ICD code for consistent comparison.
    """
    if pd.isna(code):
        return ''
    code = str(code).strip().replace('.', '').upper()
    return re.sub(r'[^A-Za-z0-9]', '', code)

def map_procedure_to_conditions(procedure_icd_code, diagnoses_icd, d_icd_diagnoses, procedures_icd, category_labels):
    """
    Map a procedure ICD code to condition labels via associated diagnoses.
    """
    if isinstance(diagnoses_icd, Collection):
        diagnoses_icd = pd.DataFrame(list(diagnoses_icd.find({}, {'hadm_id': 1, 'subject_id': 1, 'icd_code': 1, '_id': 0})))
    if isinstance(d_icd_diagnoses, Collection):
        d_icd_diagnoses = pd.DataFrame(list(d_icd_diagnoses.find({}, {'icd_code': 1, 'long_title': 1, '_id': 0})))
    if isinstance(procedures_icd, Collection):
        procedures_icd = pd.DataFrame(list(procedures_icd.find({}, {'hadm_id': 1, 'icd_code': 1, '_id': 0})))

    if diagnoses_icd.empty or procedures_icd.empty:
        logger.warning(f"No data for procedure {procedure_icd_code}. Returning empty conditions.")
        return []

    try:
        procedure_icd_code = normalize_icd_code(procedure_icd_code)
        proc_admissions = procedures_icd[procedures_icd['icd_code'].apply(normalize_icd_code) == procedure_icd_code][['hadm_id']]
        admissions_with_proc = diagnoses_icd[diagnoses_icd['hadm_id'].isin(proc_admissions['hadm_id'])]
        diag_codes = admissions_with_proc['icd_code'].apply(normalize_icd_code).unique()

        conditions = set()
        for diag_code in diag_codes:
            if not diag_code:
                continue
            for icd_range, condition in category_labels.items():
                if '-' in icd_range:
                    start, end = icd_range.split('-')
                    start, end = normalize_icd_code(start), normalize_icd_code(end)
                    if start and end and start <= diag_code <= end:
                        conditions.add(condition)
                elif normalize_icd_code(icd_range) == diag_code:
                    conditions.add(condition)
        
        if not conditions:
            logger.debug(f"No conditions mapped for procedure {procedure_icd_code}")
        
        return list(conditions)
    except Exception as e:
        logger.error(f"Error mapping procedure {procedure_icd_code} to conditions: {e}")
        return []

def collaborative_filtering_recommendations(train_matrix, patient_idx, n_neighbors=5, n_recommendations=5, encoder=None, scaler=None):
    """
    Collaborative Filtering recommendations.
    """
    patient_features = train_matrix.reset_index()[['gender', 'anchor_age', 'admission_type', 'insurance']]
    
    if encoder is None:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_features = encoder.fit_transform(patient_features[['gender', 'admission_type', 'insurance']])
    else:
        encoded_features = encoder.transform(patient_features[['gender', 'admission_type', 'insurance']])
    
    if scaler is None:
        scaler = StandardScaler()
        scaled_age = scaler.fit_transform(patient_features[['anchor_age']])
    else:
        scaled_age = scaler.transform(patient_features[['anchor_age']])
    
    patient_features_combined = np.hstack([encoded_features, scaled_age])
    similarity_matrix = cosine_similarity(patient_features_combined)
    similarities = similarity_matrix[patient_idx]
    neighbor_indices = np.argsort(similarities)[-n_neighbors-1:-1][::-1]
    neighbor_procedures = train_matrix.iloc[neighbor_indices].mean(axis=0)
    patient_procedures = train_matrix.iloc[patient_idx]
    neighbor_procedures[patient_procedures > 0] = 0
    recommendations = neighbor_procedures.sort_values(ascending=False).head(n_recommendations).index.tolist()
    
    return recommendations, encoder, scaler

def svd_recommendations(train_matrix, n_components=10, n_recommendations=5, save_model=True):
    """
    SVD recommendations with fixed random seed.
    """
    np.random.seed(42)
    n_components = min(n_components, train_matrix.shape[1]-1, train_matrix.shape[0]-1)
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    matrix_sparse = csr_matrix(train_matrix.values)
    latent_factors = svd.fit_transform(matrix_sparse)
    reconstructed_matrix = np.dot(latent_factors, svd.components_)
    recommendations = []
    for patient_idx in range(train_matrix.shape[0]):
        predicted_scores = reconstructed_matrix[patient_idx]
        patient_procedures = train_matrix.iloc[patient_idx]
        predicted_scores[patient_procedures > 0] = 0
        top_procedures = np.argsort(predicted_scores)[-n_recommendations:][::-1]
        procedures = train_matrix.columns[top_procedures].tolist()
        recommendations.append(procedures)
    
    if save_model:
        joblib.dump(svd, 'svd_model.joblib')
    return recommendations, svd

def recommend_procedures_for_new_patient(new_patient, train_matrix, df_d_icd_procedures, diagnoses_icd, d_icd_diagnoses, procedures_icd, svd_model=None, encoder=None, scaler=None, n_recommendations=5, n_neighbors=5):
    """
    Recommend procedures for a new patient, filtering by condition labels.
    """
    if encoder is None:
        if os.path.exists('onehot_encoder.joblib'):
            encoder = joblib.load('onehot_encoder.joblib')
        else:
            raise FileNotFoundError("OneHotEncoder not provided and 'onehot_encoder.joblib' not found.")
    if scaler is None:
        if os.path.exists('standard_scaler.joblib'):
            scaler = joblib.load('standard_scaler.joblib')
        else:
            raise FileNotFoundError("StandardScaler not provided and 'standard_scaler.joblib' not found.")
    if svd_model is None:
        if os.path.exists('svd_model.joblib'):
            svd_model = joblib.load('svd_model.joblib')
        else:
            raise FileNotFoundError("SVD model not provided and 'svd_model.joblib' not found.")
    
    # Validate input
    if not isinstance(new_patient.get('hadm_id'), (int, str)) or not str(new_patient['hadm_id']).isdigit():
        logger.error("Invalid hadm_id: must be numeric")
        raise ValueError("hadm_id must be numeric")
    if new_patient.get('gender') not in ['M', 'F']:
        logger.error("Invalid gender: must be 'M' or 'F'")
        raise ValueError("gender must be 'M' or 'F'")
    valid_conditions = list(category_labels.values())
    if not all(cond in valid_conditions for cond in new_patient.get('condition_label', [])):
        logger.warning(f"Invalid condition labels: {new_patient.get('condition_label', [])}. Using unfiltered recommendations.")
        new_patient['condition_label'] = []

    patient_features = pd.DataFrame({
        'gender': [new_patient['gender']],
        'anchor_age': [new_patient['anchor_age']],
        'admission_type': [new_patient['admission_type']],
        'insurance': [new_patient['insurance']]
    })
    
    try:
        encoded_features = encoder.transform(patient_features[['gender', 'admission_type', 'insurance']])
        scaled_age = scaler.transform(patient_features[['anchor_age']])
    except ValueError as e:
        logger.error(f"Feature encoding/scaling error: {e}")
        raise ValueError("New patient data contains values not seen during training.")
    
    patient_features_combined = np.hstack([encoded_features, scaled_age])
    
    # Collaborative Filtering
    train_features = train_matrix.reset_index()[['gender', 'anchor_age', 'admission_type', 'insurance']]
    train_encoded = encoder.transform(train_features[['gender', 'admission_type', 'insurance']])
    train_scaled_age = scaler.transform(train_features[['anchor_age']])
    train_features_combined = np.hstack([train_encoded, train_scaled_age])
    
    similarities = cosine_similarity(patient_features_combined, train_features_combined)[0]
    neighbor_indices = np.argsort(similarities)[-n_neighbors:][::-1]
    neighbor_procedures = train_matrix.iloc[neighbor_indices].mean(axis=0)
    known_procs = new_patient.get('known_procedures', [])
    neighbor_procedures[known_procs] = 0
    cf_recommendations = neighbor_procedures.sort_values(ascending=False).head(n_recommendations*2).index.tolist()
    
    # SVD
    new_patient_vector = pd.Series(0.0, index=train_matrix.columns)
    for proc in known_procs:
        if proc in train_matrix.columns:
            new_patient_vector[proc] = 1.0
    matrix_sparse = csr_matrix(new_patient_vector.values.reshape(1, -1))
    latent_factors = svd_model.transform(matrix_sparse)
    predicted_scores = np.dot(latent_factors, svd_model.components_)[0]
    predicted_scores[new_patient_vector > 0] = 0
    svd_recommendations = train_matrix.columns[np.argsort(predicted_scores)[-n_recommendations*2:][::-1]].tolist()
    
    # Combine recommendations with scores
    combined_recommendations = list(set(cf_recommendations + svd_recommendations))
    cf_scores = neighbor_procedures[combined_recommendations] / (neighbor_procedures.max() or 1)
    svd_scores = pd.Series(predicted_scores, index=train_matrix.columns)[combined_recommendations] / (predicted_scores.max() or 1)
    combined_scores = (cf_scores + svd_scores).fillna(0)
    
    # Filter by condition labels
    patient_conditions = new_patient.get('condition_label', [])
    filtered_recommendations = []
    if patient_conditions:
        condition_weights = {'Heart condition': 0.7, 'Diabetes and related problems': 0.3}  # Weight heart higher due to emergency context
        for icd_code in combined_recommendations:
            proc_conditions = map_procedure_to_conditions(icd_code, diagnoses_icd, d_icd_diagnoses, procedures_icd, category_labels)
            if proc_conditions:
                for cond in proc_conditions:
                    if cond in patient_conditions:
                        score_boost = condition_weights.get(cond, 1.0)
                        filtered_recommendations.append((icd_code, combined_scores[icd_code] * score_boost))
        if filtered_recommendations:
            filtered_recommendations = [code for code, _ in sorted(filtered_recommendations, key=lambda x: x[1], reverse=True)[:n_recommendations]]
        else:
            logger.warning("No procedures matched patient conditions. Using top-scored recommendations.")
            filtered_recommendations = combined_scores.sort_values(ascending=False).head(n_recommendations).index.tolist()
    else:
        filtered_recommendations = combined_scores.sort_values(ascending=False).head(n_recommendations).index.tolist()
    
    # Map ICD codes to descriptions
    recommendations = []
    for icd_code in filtered_recommendations:
        norm_code = normalize_icd_code(icd_code)
        match = df_d_icd_procedures[df_d_icd_procedures['icd_code'].apply(normalize_icd_code) == norm_code]
        if match.empty:
            logger.warning(f"No description found for ICD code {icd_code} (normalized: {norm_code})")
            description = "Unknown"
        else:
            description = match['long_title'].iloc[0]
        recommendations.append({"icd_code": icd_code, "description": description})
    
    return recommendations


In [46]:

# Example execution
if __name__ == "__main__":
    # Initialize encoder and scaler
    first_recs, encoder, scaler = collaborative_filtering_recommendations(train_matrix, 0)
    
    # Generate recommendations
    cf_recommendations = [first_recs] + [
        collaborative_filtering_recommendations(train_matrix, i, encoder=encoder, scaler=scaler)[0]
        for i in range(1, len(train_matrix))
    ]
    cf_metrics = evaluate_recommendations(train_matrix, cf_recommendations, test_matrix)
    print('Collaborative Filtering Metrics:', cf_metrics)
    
    svd_recs, svd_model = svd_recommendations(train_matrix)
    svd_metrics = evaluate_recommendations(train_matrix, svd_recs, test_matrix)
    print('SVD Metrics:', svd_metrics)
    
    # Save models
    joblib.dump(encoder, 'onehot_encoder.joblib')
    joblib.dump(scaler, 'standard_scaler.joblib')
    
    # Sample new patient
    new_patient = {
        "subject_id": 999999,
        "hadm_id": "2999999",
        "gender": "M",
        "anchor_age": 65,
        "admission_type": "EW EMER.",
        "insurance": "Medicare",
        "condition_label": ["Heart condition", "Diabetes and related problems"],
        "known_procedures": []
    }
    
    # Recommend procedures
    try:
        recommendations = recommend_procedures_for_new_patient(
            new_patient, train_matrix, df_d_icd_procedures, diagnoses_icd, d_icd_diagnoses, procedures_icd,
            svd_model, encoder, scaler
        )
        print("\nRecommended Procedures for New Patient:")
        for rec in recommendations:
            print(f"ICD Code: {rec['icd_code']}, Description: {rec['description']}")
    except Exception as e:
        logger.error(f"Error recommending procedures: {e}")

INFO:__main__:Total evaluations: 107, Hit count: 6, Hit rate: 0.056074766355140186
INFO:__main__:Total evaluations: 107, Hit count: 12, Hit rate: 0.11214953271028037


Collaborative Filtering Metrics: {'RMSE': 0.1345558347893827, 'MAE': 0.018655070776233795, 'Hit Rate': 0.056074766355140186, 'Mean Satisfaction Lift': 0.0}
SVD Metrics: {'RMSE': 0.13332546625849678, 'MAE': 0.018341563109975943, 'Hit Rate': 0.11214953271028037, 'Mean Satisfaction Lift': 9.25185853854297e-18}

Recommended Procedures for New Patient:
ICD Code: 8856, Description: Coronary arteriography using two catheters
ICD Code: 3722, Description: Left heart cardiac catheterization
ICD Code: 8853, Description: Angiocardiography of left heart structures
ICD Code: 3607, Description: Insertion of drug-eluting coronary artery stent(s)
ICD Code: 45, Description: Insertion of one vascular stent


In [44]:
df_d_icd_procedures

Unnamed: 0,_id,icd_code,icd_version,long_title
0,6855581f36e4026fa6f982d6,1,9,Therapeutic ultrasound of vessels of head and ...
1,6855581f36e4026fa6f982d7,2,9,Therapeutic ultrasound of heart
2,6855581f36e4026fa6f982d8,3,9,Therapeutic ultrasound of peripheral vascular ...
3,6855581f36e4026fa6f982d9,9,9,Other therapeutic ultrasound
4,6855581f36e4026fa6f982da,1,10,"Central Nervous System and Cranial Nerves, Bypass"
...,...,...,...,...
85252,6855582036e4026fa6facfda,XW0DXV5,10,Introduction of Gilteritinib Antineoplastic in...
85253,6855582036e4026fa6facfdb,XXE,10,"New Technology, Physiological Systems, Measure..."
85254,6855582036e4026fa6facfdc,XXE5XM5,10,"Measurement of Infection, Whole Blood Nucleic ..."
85255,6855582036e4026fa6facfdd,XY0,10,"New Technology, Extracorporeal, Introduction"
