In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px

In [2]:
# Open a client for the MongoDB server at localhost:27017 and select the
# 'recommender_system' database.
client = MongoClient('mongodb://localhost:27017/')
db = client['recommender_system']
# One handle per collection. The documents are only read in the next cell,
# where each handle is queried with .find().
# NOTE(review): table names look like MIMIC-style hospital tables plus an
# 'nies' survey collection — confirm the data provenance.
admissions = db['admissions']
patients = db['patients']
diagnoses_icd = db['diagnoses_icd']
procedures_icd = db['procedures_icd']
d_icd_diagnoses = db['d_icd_diagnoses']
d_icd_procedures = db['d_icd_procedures']
nies = db['nies']

In [3]:
def _collection_to_frame(collection):
    """Read every document of a MongoDB collection into a DataFrame."""
    return pd.DataFrame(list(collection.find()))


# Materialise each collection in memory, one DataFrame per collection.
df_admissions = _collection_to_frame(admissions)
df_patients = _collection_to_frame(patients)
df_diagnoses_icd = _collection_to_frame(diagnoses_icd)
df_procedures_icd = _collection_to_frame(procedures_icd)
df_d_icd_diagnoses = _collection_to_frame(d_icd_diagnoses)
df_d_icd_procedures = _collection_to_frame(d_icd_procedures)
df_nies = _collection_to_frame(nies)

In [4]:
# Attach the patient record to every admission, then left-join the diagnosis
# rows on (hadm_id, subject_id). Left joins keep admissions without a match.
merged_data = df_admissions.merge(df_patients, how='left', on='subject_id')
diagnoses_merged = merged_data.merge(
    df_diagnoses_icd,
    how='left',
    on=['hadm_id', 'subject_id'],
)

In [5]:
# ICD code (single code or 'start-end' range) -> survey category id.
# classify_diagnosis walks this dict in insertion order and returns on the
# first match, so order matters where ranges overlap: '960-989' must stay
# ahead of '800-999' for poisoning codes to be labelled as such.
# NOTE(review): the numeric ranges look like ICD-9 chapters — confirm how
# ICD-10 codes are meant to be handled.
icd_to_category = {
    '250': 'Q45_QO9',  # Diabetes and related problems
    '390-459': 'Q45_QO2',  # Heart condition
    '460-519': 'Q45_QO3',  # Lung condition
    '320-389': 'Q45_QO4',  # Neurological condition
    '710-739': 'Q45_QO5',  # Orthopaedic condition
    'U07.1': 'Q45_QO6',  # COVID-19
    '001-139': 'Q45_QO7',  # Infection (other than COVID-19)
    '520-579': 'Q45_QO8',  # Digestive system condition
    '140-239': 'Q45_QO1',  # Tumour or cancer
    '960-989': 'Q45_QO10',  # Adverse reaction or poisoning
    '800-999': 'Q45_QO11',  # Injury and or accident
    '290-319': 'Q45_QO12',  # Mental health issue
    'V01-V91': 'Q45_QO13',  # Tests and or investigations
    'Unknown': 'Q45_QO14'  # Don't know or wasn't told
}
# Survey category id -> display label. These label strings become the
# 'condition_label' values that are later joined against the NIES scores, so
# their exact spelling is significant.
# 'Q45_QO15' has no ICD entry above; classify_diagnosis uses it as the
# fallback for missing or unmatched codes.
category_labels = {
    'Q45_QO1': 'Tumour or cancer',
    'Q45_QO2': 'Heart condition',
    'Q45_QO3': 'Lung condition',
    'Q45_QO4': 'Neurological condition',
    'Q45_QO5': 'Orthopaedic condition',
    'Q45_QO6': 'COVID 19',
    'Q45_QO7': 'Infection (other than COVID 19)',
    'Q45_QO8': 'Digestive system condition',
    'Q45_QO9': 'Diabetes and related problems',
    'Q45_QO10': 'Adverse reaction or poising',
    'Q45_QO11': 'Injury and or accident',
    'Q45_QO12': 'Mental health issue',
    'Q45_QO13': 'Tests and or investigations',
    'Q45_QO14': 'Dont know or wasnt told',
    'Q45_QO15': 'Other'
}

In [6]:
def classify_diagnosis(icd_code, code_map=None, labels=None):
    """Map an ICD diagnosis code to a condition label.

    The code is normalised (stringified, upper-cased, dots removed) and then
    matched against the entries of ``code_map`` in insertion order; the first
    match wins.

    * Range entries (``'390-459'``) match on the leading characters of the
      code, so sub-codes such as ``'4599'`` or ``'4019'`` fall inside the
      range. Comparing the full strings lexicographically would reject
      ``'4599'`` because it sorts after ``'459'``.
    * Single-code entries (``'250'``, ``'U07.1'``) match any code that starts
      with them, so ``'25000'`` and ``'U071'`` are recognised.

    Parameters
    ----------
    icd_code : str or number or NaN
        Raw ICD code.
    code_map : dict, optional
        ICD code/range -> category id. Defaults to the module-level
        ``icd_to_category``.
    labels : dict, optional
        Category id -> label; must contain ``'Q45_QO15'`` (the fallback).
        Defaults to the module-level ``category_labels``.

    Returns
    -------
    str
        The matching label, or ``labels['Q45_QO15']`` when the code is
        missing or matches nothing.

    NOTE(review): the ICD version is not consulted, so an ICD-10 code that
    happens to start with a prefix inside a range (e.g. ICD-10 ``V..`` codes
    against ``'V01-V91'``) is classified by that range — confirm whether the
    data mixes versions.
    """
    if code_map is None:
        code_map = icd_to_category
    if labels is None:
        labels = category_labels

    fallback = labels['Q45_QO15']
    if pd.isna(icd_code):
        return fallback

    code = str(icd_code).strip().upper().replace('.', '')
    for code_range, category in code_map.items():
        if '-' in code_range:
            start, end = (bound.strip().upper() for bound in code_range.split('-', 1))
            prefix = code[:len(start)]
            # Equal-length strings of the same kind (all digits, or letter +
            # digits) order lexicographically the same way they order
            # numerically, so a plain string comparison is safe here.
            same_shape = len(start) == len(end) == len(prefix)
            if same_shape and prefix.isdigit() == start.isdigit() and start <= prefix <= end:
                return labels[category]
        else:
            key = code_range.strip().upper().replace('.', '')
            if code.startswith(key):
                return labels[category]
    return fallback

In [7]:
# Label every diagnosis row, then collapse to one row per admission holding
# the distinct labels as a single ', '-joined string.
diagnoses_merged['condition_label'] = diagnoses_merged['icd_code'].apply(classify_diagnosis)
diagnoses_grouped = (
    diagnoses_merged
    .groupby(['hadm_id', 'subject_id'])['condition_label']
    .apply(lambda labels: ', '.join(set(labels)))
    .reset_index()
)
# Bring the joined labels back onto the admission/patient frame.
patient_data = merged_data.merge(
    diagnoses_grouped,
    how='left',
    on=['hadm_id', 'subject_id'],
)

In [8]:
# Recode the numeric NIES gender (1 / 2) to 'M' / 'F' so it can be joined
# against patient_data['gender'] later; any other value becomes NaN.
df_nies['gender'] = df_nies['Gender'].map({1: 'M', 2: 'F'})

# Only 'EW EMER.' is flagged 1.0. The other listed admission types map to
# 0.0, and so does anything unlisted or missing (via fillna below).
admission_type_map = {
    admission_type: float(admission_type == 'EW EMER.')
    for admission_type in (
        'EU OBSERVATION',
        'URGENT',
        'EW EMER.',
        'SURGICAL SAME DAY ADMISSION',
        'DIRECT EMER.',
    )
}
patient_data['AdmTypeBinary'] = (
    patient_data['admission_type']
    .map(admission_type_map)
    .fillna(0.0)
)

# Mean survey satisfaction per (admission flag, gender, condition).
nies_avg_scores = (
    df_nies
    .groupby(['AdmTypeBinary', 'gender', 'condition_label'])['satisfaction_score']
    .mean()
    .reset_index()
)

In [9]:
# Split the ', '-joined labels back apart: one row per (admission, label).
patient_data = (
    patient_data
    .assign(condition_label=patient_data['condition_label'].str.split(', '))
    .explode('condition_label')
)
# Look up the NIES average score for each row; rows without a matching
# (admission flag, gender, condition) combination get NaN.
patient_data = patient_data.merge(
    nies_avg_scores,
    on=['AdmTypeBinary', 'gender', 'condition_label'],
    how='left',
)
# Collapse back to one row per admission, averaging the score over its labels.
patient_data = (
    patient_data
    .groupby(['hadm_id', 'subject_id', 'admission_type', 'insurance', 'gender', 'anchor_age', 'AdmTypeBinary'])['satisfaction_score']
    .mean()
    .reset_index()
)

In [10]:
# Left-join the procedure rows onto each admission, then discard the
# admissions that ended up with no procedure code.
procedures_merged = (
    patient_data
    .merge(df_procedures_icd, how='left', on=['hadm_id', 'subject_id'])
    .dropna(subset=['icd_code'])
)

In [11]:
# Distinct procedure codes, stringified.
all_procedures = {str(code) for code in procedures_merged['icd_code'].dropna()}

# Admission x procedure matrix: each cell is the mean satisfaction score of
# that admission for that procedure code, 0 where the pair does not occur.
matrix = procedures_merged.pivot_table(
    index=['hadm_id', 'subject_id', 'gender', 'anchor_age', 'admission_type', 'insurance'],
    columns='icd_code',
    values='satisfaction_score',
    aggfunc='mean',
    fill_value=0,
)

In [12]:
# Stage 1: Collaborative Filtering (User-User)
def collaborative_filtering_recommendations(matrix, patient_idx, n_neighbors=5, n_recommendations=5):
    """Recommend procedures for one patient from their most similar patients.

    Similarity is the cosine similarity of demographic features taken from
    the index of ``matrix``: one-hot encoded gender / admission_type /
    insurance plus standardised anchor_age.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Patient x procedure score matrix whose index carries the levels
        'gender', 'anchor_age', 'admission_type' and 'insurance'.
    patient_idx : int
        Row position of the target patient in ``matrix``.
    n_neighbors : int
        Number of most similar other patients to average over.
    n_recommendations : int
        Maximum number of procedure codes to return.

    Returns
    -------
    list
        Column labels of ``matrix`` ordered by descending mean neighbour
        score, never including a procedure the patient already has a
        positive score for. Empty when there is no neighbour to learn from.
    """
    patient_features = matrix.reset_index()[['gender', 'anchor_age', 'admission_type', 'insurance']]
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_features = encoder.fit_transform(patient_features[['gender', 'admission_type', 'insurance']])
    scaled_age = StandardScaler().fit_transform(patient_features[['anchor_age']])
    patient_features_combined = np.hstack([encoded_features, scaled_age])

    # Only this patient's similarity row is needed; building the full n x n
    # matrix on every call is quadratic in the number of patients.
    similarities = cosine_similarity(
        patient_features_combined[[patient_idx]],
        patient_features_combined,
    )[0]

    # Remove the patient explicitly. Patients with identical features tie at
    # similarity 1.0, so "drop the last argsort entry" can drop the wrong row.
    others = np.delete(np.arange(len(similarities)), patient_idx)
    ranked_others = others[np.argsort(-similarities[others], kind='stable')]
    neighbor_indices = ranked_others[:max(n_neighbors, 0)]
    if len(neighbor_indices) == 0:
        return []

    neighbor_procedures = matrix.iloc[neighbor_indices].mean(axis=0)
    patient_procedures = matrix.iloc[patient_idx]
    # Drop already-performed procedures rather than zeroing them: a zeroed
    # entry can still be returned when it ties with other zero scores.
    candidates = neighbor_procedures[~(patient_procedures > 0)]
    recommendations = candidates.sort_values(ascending=False).head(n_recommendations)
    return recommendations.index.tolist()

In [13]:
# Stage 2: Matrix Factorization (SVD)
def svd_recommendations(matrix, n_components=10, n_recommendations=5):
    """Recommend procedures per patient from a truncated-SVD reconstruction.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Patient x procedure score matrix (0 = not performed).
    n_components : int
        Requested number of latent factors; clamped to the number of
        procedure columns, which TruncatedSVD cannot exceed.
    n_recommendations : int
        Maximum number of procedure codes per patient.

    Returns
    -------
    list of list
        One list per row of ``matrix`` (same order) holding column labels by
        descending reconstructed score. Procedures the patient already has a
        positive score for are never included, so a list can be shorter than
        ``n_recommendations``.
    """
    n_patients, n_procedures = matrix.shape
    if n_patients == 0 or n_procedures == 0:
        return [[] for _ in range(n_patients)]

    # Fixed seed: the default randomized solver otherwise gives run-to-run
    # differences (the NMF stage is seeded the same way).
    svd = TruncatedSVD(n_components=max(1, min(n_components, n_procedures)), random_state=42)
    latent_factors = svd.fit_transform(csr_matrix(matrix.values))
    reconstructed_matrix = np.dot(latent_factors, svd.components_)

    # Mask with -inf, not 0: SVD reconstructions can be negative, so a zeroed
    # "already performed" entry could outrank genuine candidates.
    reconstructed_matrix[matrix.to_numpy() > 0] = -np.inf

    recommendations = []
    for predicted_scores in reconstructed_matrix:
        # Slicing the front of a descending sort is safe for
        # n_recommendations == 0, unlike [-n:], which returns everything.
        top_procedures = np.argsort(-predicted_scores, kind='stable')[:max(n_recommendations, 0)]
        top_procedures = top_procedures[np.isfinite(predicted_scores[top_procedures])]
        recommendations.append(matrix.columns[top_procedures].tolist())
    return recommendations

In [14]:
# Stage 3: Content-Based Filtering
def content_based_recommendations(procedures_merged, df_d_icd_procedures, n_recommendations=5):
    """Recommend procedures whose titles resemble those a patient already had.

    Every catalogue procedure is represented by the TF-IDF vector of its
    'long_title'. A patient's profile is the mean vector of their procedures
    that appear in the catalogue; catalogue entries are ranked by cosine
    similarity to that profile.

    Parameters
    ----------
    procedures_merged : pandas.DataFrame
        Long-form rows with 'hadm_id', 'subject_id' and 'icd_code'.
    df_d_icd_procedures : pandas.DataFrame
        Procedure catalogue with 'icd_code' and 'long_title'.
    n_recommendations : int
        Maximum number of procedure codes per patient.

    Returns
    -------
    list of list
        One list per (hadm_id, subject_id) group, in groupby iteration order.
        The patient's own procedure codes are never returned, and a patient
        with no procedure in the catalogue gets an empty list.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    procedure_texts = df_d_icd_procedures['long_title'].fillna('')
    tfidf_matrix = tfidf.fit_transform(procedure_texts)
    catalogue_codes = df_d_icd_procedures['icd_code'].to_numpy()
    code_to_idx = {code: idx for idx, code in enumerate(catalogue_codes)}

    recommendations = []
    for _, patient in procedures_merged.groupby(['hadm_id', 'subject_id']):
        patient_procedures = patient['icd_code'].to_numpy()
        known_rows = [code_to_idx[proc] for proc in patient_procedures if proc in code_to_idx]
        if not known_rows:
            # An all-zero profile has similarity 0 to everything; ranking it
            # would just return arbitrary catalogue entries.
            recommendations.append([])
            continue

        # Mean TF-IDF vector over the patient's procedures (repeats included).
        profile = np.asarray(tfidf_matrix[known_rows].mean(axis=0)).reshape(1, -1)
        similarities = cosine_similarity(profile, tfidf_matrix)[0]

        # Exclude the patient's own procedures explicitly. Skipping only the
        # single best match neither guarantees it is one of theirs nor
        # removes the rest.
        similarities[np.isin(catalogue_codes, patient_procedures)] = -np.inf
        top_indices = np.argsort(-similarities, kind='stable')[:max(n_recommendations, 0)]
        top_indices = top_indices[np.isfinite(similarities[top_indices])]
        recommendations.append(df_d_icd_procedures.iloc[top_indices]['icd_code'].tolist())
    return recommendations

In [15]:
# Stage 4: NMF (Non-negative Matrix Factorization)
def nmf_recommendations(matrix, n_components=10, n_recommendations=5):
    """Recommend procedures per patient from an NMF reconstruction (W @ H).

    Parameters
    ----------
    matrix : pandas.DataFrame
        Non-negative patient x procedure score matrix (0 = not performed).
    n_components : int
        Number of latent factors.
    n_recommendations : int
        Maximum number of procedure codes per patient.

    Returns
    -------
    list of list
        One list per row of ``matrix`` (same order) holding column labels by
        descending reconstructed score. Procedures the patient already has a
        positive score for are never included, so a list can be shorter than
        ``n_recommendations``.
    """
    n_patients, n_procedures = matrix.shape
    if n_patients == 0 or n_procedures == 0:
        return [[] for _ in range(n_patients)]

    nmf = NMF(n_components=n_components, init='random', random_state=42)
    W = nmf.fit_transform(csr_matrix(matrix.values))
    H = nmf.components_
    reconstructed_matrix = np.dot(W, H)

    # Mask with -inf, not 0: NMF scores are >= 0, so a zeroed "already
    # performed" entry ties with every zero-scored candidate and could be
    # picked by argsort.
    reconstructed_matrix[matrix.to_numpy() > 0] = -np.inf

    recommendations = []
    for predicted_scores in reconstructed_matrix:
        # Slicing the front of a descending sort is safe for
        # n_recommendations == 0, unlike [-n:], which returns everything.
        top_procedures = np.argsort(-predicted_scores, kind='stable')[:max(n_recommendations, 0)]
        top_procedures = top_procedures[np.isfinite(predicted_scores[top_procedures])]
        recommendations.append(matrix.columns[top_procedures].tolist())
    return recommendations

In [16]:
# Evaluation Metrics
def evaluate_recommendations(matrix, recommendations, test_matrix):
    """Score per-patient recommendation lists against held-out scores.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Patient x procedure matrix the recommendations were produced from;
        ``recommendations[i]`` belongs to row ``i``.
    recommendations : list of list
        Recommended column labels per row of ``matrix``.
    test_matrix : pandas.DataFrame
        Held-out scores in the same layout. Only index labels present in both
        ``matrix`` and ``test_matrix`` are evaluated, and only patients with
        at least one positive held-out score.

    Returns
    -------
    dict
        'RMSE' / 'MAE': mean per-patient error between the held-out score
        vector and a 0/1 indicator of the recommended procedures.
        'Hit Rate': average number of recommended procedures per evaluated
        patient that have a positive held-out score (can exceed 1).
        'Mean Satisfaction Lift': mean over patients of (mean score in
        ``matrix`` of recommended procedures with a positive score) minus
        (the patient's mean positive score in ``matrix``).
        Each value is NaN when nothing could be evaluated.

    NOTE(review): the lift only counts recommendations the patient already
    has a positive score for in ``matrix``; recommenders that exclude known
    procedures therefore always yield NaN here — confirm the intended
    definition.
    """
    column_pos = {col: pos for pos, col in enumerate(matrix.columns)}
    train_pos_by_label = {label: pos for pos, label in enumerate(matrix.index)}
    common_indices = matrix.index.intersection(test_matrix.index)

    rmse_sum, mae_sum = 0.0, 0.0
    hit_count, total = 0, 0
    for test_idx in common_indices:
        train_pos = train_pos_by_label[test_idx]
        if train_pos >= len(recommendations):
            continue
        actual = test_matrix.loc[test_idx]
        actual_procedures = actual[actual > 0].index.tolist()
        if not actual_procedures:
            continue

        # Align the held-out scores to the training columns up front so both
        # vectors always have the same length and column order.
        actual_scores = actual.reindex(matrix.columns, fill_value=0).to_numpy(dtype=float)
        predicted = recommendations[train_pos]
        predicted_scores = np.zeros(len(matrix.columns))
        for proc in predicted:
            if proc in column_pos:
                predicted_scores[column_pos[proc]] = 1

        # Same values as sqrt(mean_squared_error) / mean_absolute_error.
        errors = actual_scores - predicted_scores
        rmse_sum += np.sqrt(np.mean(errors ** 2))
        mae_sum += np.mean(np.abs(errors))
        hit_count += len(set(predicted) & set(actual_procedures))
        total += 1

    rmse = rmse_sum / total if total > 0 else float('nan')
    mae = mae_sum / total if total > 0 else float('nan')
    hit_rate = hit_count / total if total > 0 else float('nan')

    common_labels = set(common_indices)
    satisfaction_lift = []
    # zip stops at the shorter input, so a recommendations list longer than
    # the matrix cannot index past the last row.
    for position, (row_label, recs) in enumerate(zip(matrix.index, recommendations)):
        if row_label not in common_labels:
            continue
        patient_scores = matrix.iloc[position]
        # .iloc keeps the lookup positional even when column labels are ints.
        rec_scores = [patient_scores.iloc[column_pos[proc]] for proc in recs if proc in column_pos]
        rec_scores = [score for score in rec_scores if score > 0]
        baseline = patient_scores[patient_scores > 0].mean()
        if rec_scores and baseline > 0:
            satisfaction_lift.append(np.mean(rec_scores) - baseline)
    mean_satisfaction_lift = np.mean(satisfaction_lift) if satisfaction_lift else float('nan')
    return {'RMSE': rmse, 'MAE': mae, 'Hit Rate': hit_rate, 'Mean Satisfaction Lift': mean_satisfaction_lift}

In [17]:
# Interactive Visualization: Procedure-to-Procedure Transition Heatmap
def plot_transition_heatmap(matrix):
    """Show a score-weighted co-occurrence heatmap of the top 20 procedures.

    The 20 columns of ``matrix`` with the largest column sums are kept. Cell
    (j, k), j != k, is the sum of procedure j's score over all rows in which
    both j and k have a positive score; the diagonal is 0. The figure is
    displayed with ``fig.show()`` and nothing is returned.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Patient x procedure score matrix (0 = not performed).
    """
    # Top 20 procedures by summed score (a sum of scores, not a row count).
    procedure_totals = matrix.sum().sort_values(ascending=False)
    top_procedures = procedure_totals.head(20).index
    scores = matrix[top_procedures].to_numpy(dtype=float)

    # One matrix product replaces the per-row / per-pair Python loops:
    # T[j, k] = sum_i score[i, j] * present[i, k].
    present = scores > 0
    weighted = np.where(present, scores, 0.0)
    transition_matrix = weighted.T @ present.astype(float)
    np.fill_diagonal(transition_matrix, 0.0)

    # String labels on categorical axes keep numeric-looking procedure codes
    # from being laid out on a linear numeric axis.
    axis_labels = [str(code) for code in top_procedures]

    # Create interactive heatmap with Plotly
    fig = px.imshow(transition_matrix,
                    labels=dict(x="Procedure", y="Procedure", color="Transition Frequency"),
                    x=axis_labels,
                    y=axis_labels,
                    color_continuous_scale='Blues',
                    title="Procedure-to-Procedure Transition Effectiveness")
    fig.update_xaxes(type='category')
    fig.update_yaxes(type='category')
    fig.update_layout(
        autosize=False,
        width=800,
        height=800,
        margin=dict(l=50, r=50, t=100, b=50)
    )
    fig.show()

In [18]:
# Split data for evaluation: hold out ~20% of the observed (admission,
# procedure) entries. Splitting by rows would give the train and test matrices
# disjoint indices, and evaluate_recommendations only scores index labels
# present in both, so every metric would come out NaN.
TEST_FRACTION = 0.2
SPLIT_SEED = 42  # fixed so Restart & Run All reproduces the same split
split_rng = np.random.default_rng(SPLIT_SEED)
observed = matrix.to_numpy() > 0
held_out = observed & (split_rng.random(matrix.shape) < TEST_FRACTION)
train_matrix = matrix.mask(held_out, 0)   # held-out entries hidden from the models
test_matrix = matrix.where(held_out, 0)   # only the held-out entries remain

# Run Collaborative Filtering
cf_recommendations = [collaborative_filtering_recommendations(train_matrix, i) for i in range(len(train_matrix))]
cf_metrics = evaluate_recommendations(train_matrix, cf_recommendations, test_matrix)
print('Collaborative Filtering Metrics:', cf_metrics)

# Run SVD
svd_recs = svd_recommendations(train_matrix)
svd_metrics = evaluate_recommendations(train_matrix, svd_recs, test_matrix)
print('SVD Metrics:', svd_metrics)

# Run Content-Based Filtering
# The content-based model works on the long-form procedure rows, so remove the
# held-out (admission, procedure) pairs from them as well.
hadm_ids = train_matrix.index.get_level_values('hadm_id')
subject_ids = train_matrix.index.get_level_values('subject_id')
held_rows, held_cols = np.nonzero(held_out)
held_pairs = pd.MultiIndex.from_arrays([hadm_ids[held_rows], train_matrix.columns[held_cols]])
procedure_pairs = pd.MultiIndex.from_frame(procedures_merged[['hadm_id', 'icd_code']])
train_procedures = procedures_merged[~procedure_pairs.isin(held_pairs)]
# content_based_recommendations returns one list per (hadm_id, subject_id)
# group in groupby order. Key the lists by admission and look them up per
# matrix row instead of assuming the two orders line up positionally.
cb_keys = [key for key, _ in train_procedures.groupby(['hadm_id', 'subject_id'])]
cb_by_admission = dict(zip(cb_keys, content_based_recommendations(train_procedures, df_d_icd_procedures)))
cb_recs = [cb_by_admission.get(key, []) for key in zip(hadm_ids, subject_ids)]
cb_metrics = evaluate_recommendations(train_matrix, cb_recs, test_matrix)
print('Content-Based Metrics:', cb_metrics)

# Run NMF
nmf_recs = nmf_recommendations(train_matrix)
nmf_metrics = evaluate_recommendations(train_matrix, nmf_recs, test_matrix)
print('NMF Metrics:', nmf_metrics)

# Plot interactive heatmap
plot_transition_heatmap(train_matrix)