In [75]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import subprocess
import sys
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import ParameterSampler

# Install pgeocode for geographic distance calculation
try:
    import pgeocode
except ImportError:
    print("Installing pgeocode...")
    subprocess.run([sys.executable, "-m", "pip", "install", "pgeocode"], check=True)
    import pgeocode

try:
    from ethnicolr import census_ln
except ImportError:
    print("Installing ethnicolr...")
    # Note: ethnicolr requires TensorFlow. This installation might take a moment.
    subprocess.run([sys.executable, "-m", "pip", "install", "ethnicolr"], check=True)
    from ethnicolr import census_ln

import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

import os
from sklearn.model_selection import ParameterGrid, cross_validate
from sklearn.model_selection import ParameterSampler
import joblib


In [56]:
#file locations
parquet_file_paths={
    "patient": r"Client_Data_files\Parquets\synthetic_patients.parquet",
    "encounter": r"Client_Data_files\Parquets\synthetic_encounters.parquet",
    "hospitals": r"Client_Data_files\Parquets\synthetic_hospitals.parquet",
    "provider": r"Client_Data_files\Parquets\synthetic_providers.parquet",    
}

# Reading the parquet files
patient_df = pd.read_parquet(parquet_file_paths['patient'])
encounter_df = pd.read_parquet(parquet_file_paths['encounter'])
hospital_df = pd.read_parquet(parquet_file_paths['hospitals'])
provider_df = pd.read_parquet(parquet_file_paths['provider'])

print("Dataframes loaded successfully.")
print(f"Patient DF shape: {patient_df.shape}")
print(f"Encounter DF shape: {encounter_df.shape}")
print(f"Provider DF shape: {provider_df.shape}")
print(f"Hospital DF shape: {hospital_df.shape}")

Dataframes loaded successfully.
Patient DF shape: (100000, 16)
Encounter DF shape: (200000, 17)
Provider DF shape: (5000, 19)
Hospital DF shape: (200, 14)


In [57]:
# Creating a data map for easy access
data_map={
    "patient": patient_df,
    "encounter": encounter_df,
    "hospitals": hospital_df,
    "provider": provider_df
}

for key, df in data_map.items():
    print(f"Dataframe: {key}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")    
    for col in df.columns:
        if df[col].isna().sum() > 0:
            print(f"Column '{col}' has {df[col].isna().sum()} missing values.")
    print("")

Dataframe: patient
Shape: (100000, 16)
Columns: ['patient_id', 'first_name', 'last_name', 'date_of_birth', 'gender', 'race', 'ethnicity', 'primary_language', 'zip_code', 'insurance_type', 'household_income', 'education_level', 'age', 'cultural_background', 'preferred_provider_language', 'cultural_preferences']

Dataframe: encounter
Shape: (200000, 17)
Columns: ['encounter_id', 'patient_id', 'provider_id', 'encounter_date', 'encounter_type', 'primary_diagnosis', 'length_of_stay', 'total_cost', 'cultural_background', 'primary_language', 'languages_spoken', 'cultural_competency_rating', 'cultural_match_score', 'language_match', 'patient_satisfaction', 'treatment_adherence', 'return_visit_30_days']

Dataframe: hospitals
Shape: (200, 14)
Columns: ['hospital_id', 'hospital_name', 'hospital_type', 'zip_code', 'bed_count', 'teaching_hospital', 'trauma_center', 'language_services_available', 'cultural_competency_program', 'interpreter_services_24_7', 'community_health_programs', 'overall_rating

In [58]:
patient_df_temp=pd.DataFrame()
patient_df_temp=patient_df[patient_df['race']=='Hispanic or Latino'][['patient_id','first_name','last_name','race','ethnicity']].copy()
patient_race_pred=census_ln(patient_df_temp, 'last_name')

2025-09-29 10:16:14,035 - INFO - Preserving 12965 duplicate rows based on column 'last_name'
2025-09-29 10:16:14,051 - INFO - Data filtering summary: 12997 → 12997 rows (kept 100.0%)
2025-09-29 10:16:14,053 - INFO - Merging demographic data for 12997 records...
2025-09-29 10:16:14,214 - INFO - Matched 12997 of 12997 rows (100.0%)
2025-09-29 10:16:14,214 - INFO - Added columns: pct2prace, pctaian, pctapi, pctblack, pcthispanic, pctwhite


In [59]:
race_mapping={
    'white': 'White',
    'black': 'Black or African American',
    'api': 'Asian',    
    'aian': 'Native American',
    '2prace': 'Other'
}

In [60]:
race_cols=['pctwhite','pctblack','pctapi','pctaian','pct2prace']
patient_race_pred['derived_race'] = patient_race_pred[race_cols].idxmax(axis=1).str.replace('pct', '').map(race_mapping)

# Create a mapping from patient_id to derived_race
id_to_derived_race = dict(zip(patient_race_pred['patient_id'], patient_race_pred['derived_race']))

# Update the race column only for Hispanic or Latino patients
patient_df.loc[patient_df['race'] == 'Hispanic or Latino', 'race'] = \
    patient_df.loc[patient_df['race'] == 'Hispanic or Latino', 'patient_id'].map(id_to_derived_race)

In [61]:
provider_race_predictions = census_ln(provider_df, 'last_name')

# Derive race for the provider_df as it is missing from the source data
print("Deriving race for providers from last names...")
race_cols = ['pctwhite','pctblack','pctapi','pctaian','pct2prace']
provider_df['provider_race'] = provider_race_predictions[race_cols].idxmax(axis=1).str.replace('pct', '').map(race_mapping)
print("Provider race derivation complete.")

# Deriving provider ethnicity from the race_predictions
print("Deriving ethnicity for providers from race predictions...")
provider_df['provider_ethnicity'] = provider_race_predictions['pcthispanic'].apply(lambda x: 'Hispanic or Latino' if float(x) >= 50 else 'Not Hispanic or Latino')
print("Provider ethnicity derivation complete.")


2025-09-29 10:16:14,301 - INFO - Preserving 4968 duplicate rows based on column 'last_name'
2025-09-29 10:16:14,301 - INFO - Data filtering summary: 5000 → 5000 rows (kept 100.0%)
2025-09-29 10:16:14,301 - INFO - Merging demographic data for 5000 records...
2025-09-29 10:16:14,362 - INFO - Matched 5000 of 5000 rows (100.0%)
2025-09-29 10:16:14,363 - INFO - Added columns: pct2prace, pctaian, pctapi, pctblack, pcthispanic, pctwhite


Deriving race for providers from last names...
Provider race derivation complete.
Deriving ethnicity for providers from race predictions...
Provider ethnicity derivation complete.


In [62]:
for key, df in data_map.items():
    print(f"Dataframe: {key}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")    
    for col in df.columns:
        if df[col].isna().sum() > 0:
            print(f"Column '{col}' has {df[col].isna().sum()} missing values.")
    print("")

Dataframe: patient
Shape: (100000, 16)
Columns: ['patient_id', 'first_name', 'last_name', 'date_of_birth', 'gender', 'race', 'ethnicity', 'primary_language', 'zip_code', 'insurance_type', 'household_income', 'education_level', 'age', 'cultural_background', 'preferred_provider_language', 'cultural_preferences']

Dataframe: encounter
Shape: (200000, 17)
Columns: ['encounter_id', 'patient_id', 'provider_id', 'encounter_date', 'encounter_type', 'primary_diagnosis', 'length_of_stay', 'total_cost', 'cultural_background', 'primary_language', 'languages_spoken', 'cultural_competency_rating', 'cultural_match_score', 'language_match', 'patient_satisfaction', 'treatment_adherence', 'return_visit_30_days']

Dataframe: hospitals
Shape: (200, 14)
Columns: ['hospital_id', 'hospital_name', 'hospital_type', 'zip_code', 'bed_count', 'teaching_hospital', 'trauma_center', 'language_services_available', 'cultural_competency_program', 'interpreter_services_24_7', 'community_health_programs', 'overall_rating

### Step 2: Feature Engineering

Here, we merge the datasets and create the features our model will learn from. This includes cultural matches, language matches, and geographic distance.

In [63]:
# Merge all data into a single master DataFrame for training
master_df = pd.merge(encounter_df, patient_df, on='patient_id',suffixes=('', '_pat'))
master_df = pd.merge(master_df, provider_df, on='provider_id',suffixes=('', '_prov'))
master_df = pd.merge(master_df, hospital_df, left_on='hospital_affiliation', right_on='hospital_id',suffixes=('', '_hosp'))

print("Master DataFrame created with shape:", master_df.shape)

Master DataFrame created with shape: (200000, 66)


In [64]:
master_df.columns

Index(['encounter_id', 'patient_id', 'provider_id', 'encounter_date',
       'encounter_type', 'primary_diagnosis', 'length_of_stay', 'total_cost',
       'cultural_background', 'primary_language', 'languages_spoken',
       'cultural_competency_rating', 'cultural_match_score', 'language_match',
       'patient_satisfaction', 'treatment_adherence', 'return_visit_30_days',
       'first_name', 'last_name', 'date_of_birth', 'gender', 'race',
       'ethnicity', 'primary_language_pat', 'zip_code', 'insurance_type',
       'household_income', 'education_level', 'age', 'cultural_background_pat',
       'preferred_provider_language', 'cultural_preferences', 'npi_number',
       'first_name_prov', 'last_name_prov', 'specialty', 'practice_zip_code',
       'years_experience', 'medical_school_country', 'board_certified',
       'languages_spoken_prov', 'interpreter_services',
       'cultural_certifications', 'minority_health_experience',
       'community_involvement', 'patient_satisfaction_sc

In [65]:

# --- Engineer the Match Features ---

# Stateless Cultural Features
master_df['race_match'] = (master_df['race'] == master_df['provider_race']).astype(int)
master_df['ethnicity_match'] = (master_df['ethnicity'] == master_df['provider_ethnicity']).astype(int)
master_df['language_match'] = (master_df['language_match'] == True).astype(int)


master_df['encounter_date'] = pd.to_datetime(master_df['encounter_date'])

# Geographic Feature
dist = pgeocode.GeoDistance('US') # Assuming US zip codes
# Calculate distance between patient and provider zip codes
master_df['distance_km'] = dist.query_postal_code(
    master_df['zip_code'].astype(str).tolist(), 
    master_df['zip_code_hosp'].astype(str).tolist()
)
# Calculate the mean distance for each provider specialty
# The .transform('mean') creates a Series with the same index as master_df,
mean_dist_by_specialty = master_df.groupby('specialty')['distance_km'].transform('mean')

# Now, fill the missing distances using these specialty-specific averages
master_df['distance_km'].fillna(mean_dist_by_specialty, inplace=True)

# If any specialties had NO valid distances, there might still be NaNs.
# Fill any remaining with the overall mean as a final fallback.
master_df['distance_km'].fillna(master_df['distance_km'].mean(), inplace=True)



# Create a function to apply the transformations consistently
def create_features(df,  min_dist, max_dist):
    df_eng = df.copy() # Work on a copy to avoid SettingWithCopyWarning
    
    
    # Proximity Score
    df_eng['proximity_score'] = 1 - ((df_eng['distance_km'] - min_dist) / (max_dist - min_dist))
    df_eng['proximity_score'] = df_eng['proximity_score'].clip(0, 1)
    
    return df_eng



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  master_df['distance_km'].fillna(mean_dist_by_specialty, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  master_df['distance_km'].fillna(master_df['distance_km'].mean(), inplace=True)


In [66]:
# # # 1. First, calculate the min and max distances from data
# # min_distance = master_df['distance_km'].min()
# # max_distance = master_df['distance_km'].max()

# # # 2. Apply the scaling formula to create a score from 0 to 1
# # # The formula is: 1 - ( (x - min) / (max - min) )

# # master_df['proximity_score'] = 1 - (
# #     (master_df['distance_km'] - min_distance) / (max_distance - min_distance)
# # )

# # master_df['encounter_date'] = pd.to_datetime(master_df['encounter_date'])
# # df_sorted = master_df.sort_values(by=['patient_id', 'encounter_date'])

# # average_adherence = df_sorted['treatment_adherence'].mean()

# # df_sorted['shifted_adherence'] = df_sorted.groupby('patient_id')['treatment_adherence'].shift(1)

# # # 2. Now, calculate the cumulative sum on this correctly shifted column.
# # df_sorted['treatment_adherence_sum'] = df_sorted.groupby('patient_id')['shifted_adherence'].cumsum()
# # df_sorted['treatment_adherence_sum'].fillna(0, inplace=True)

# # df_sorted['treatment_adherence_count'] = df_sorted.groupby('patient_id').cumcount()

# # df_sorted['historical_avg_adherence'] = np.where(
# #     df_sorted['treatment_adherence_count'] > 0,                                        # The condition to check
# #     df_sorted['treatment_adherence_sum'] / df_sorted['treatment_adherence_count'],     # The value if the condition is True
# #     average_adherence                                                                # The value if the condition is False
# # )


# # # Add the new feature back to the original master_df
# # master_df['historical_avg_adherence'] = df_sorted['historical_avg_adherence']



# # print("Feature engineering complete.")
# # master_df[['race_match', 'ethnicity_match', 'language_match', 'distance_km','proximity_score']].head()
# print(master_df['proximity_score'].isna().sum())
# master_df['proximity_score'].describe()

In [67]:
#master_df.to_csv('master_df_v1_5.csv', index=False)

### Step 3: Training and Testing the Model

This is the core machine learning section. We split our data, train the model, and then test it on unseen data to validate its performance. The feature importances are the **learned weights**.

In [81]:
Model_Name="RandomForestRegressor_CompositeSuccessScore"
Model_code='RFR_CompScore'
Model_Summary= \
"""This report details a Segmented Learning to Rank (LTR) system built to generate personalized doctor recommendations. 
The system utilizes machine learning to predict a composite success_score, which holistically combines patient satisfaction and treatment adherence.
The core of the approach is its segmented architecture: instead of a single general model, a separate "expert" Random Forest model is trained 
for each patient's cultural_preference group. 
Each model is optimized through hyperparameter tuning to learn the unique importance of various features—including 
cultural fit, language match, and geographic proximity—for its specific audience. 
All models were rigorously validated on a held-out test set to ensure reliable and unbiased performance. 
"""

# --- Training Section ---

# 1. Normalize the columns to a 0-1 scale
scaler = MinMaxScaler()
master_df[['satisfaction_norm', 'adherence_norm']] = scaler.fit_transform(
    master_df[['patient_satisfaction', 'treatment_adherence']]
)

# 2. Define weights and create the composite score
adherence_weight = 0
satisfaction_weight = 1
master_df['success_score'] = (
    master_df['adherence_norm'] * adherence_weight +
    master_df['satisfaction_norm'] * satisfaction_weight
)

# Rename provider competency column to avoid conflict
master_df.rename(columns={'cultural_competency_rating_y': 'cultural_competency_rating_prov'}, inplace=True)

# Define the features to be used by the models
features_stateless = [
    'years_experience',
    'cultural_competency_rating_prov',
    'communication_rating',
    'race_match',
    'ethnicity_match',
    'language_match',    
    'interpreter_services_24_7',    
]
features_stateful = [
    
    'proximity_score',  
]

features=features_stateless + features_stateful

target = 'success_score'

# Get the unique preference categories to loop through
unique_preferences = master_df['cultural_preferences'].unique()
print("Unique cultural preferences found: ", unique_preferences)



Unique cultural preferences found:  ['No Specific Preference' 'Culturally Similar Provider'
 'Culturally Similar Provider; Same Language Provider'
 'Same Language Provider']


In [None]:
# A dictionary to store a trained model for each preference type



trained_models = {}
learned_weights_dict = {}
best_hyperparameters = {} 
test_metrics_dict = {}
feature_engineering_params = {} # To store scaling values like min/max dist
all_run_logs_dict = {}

for i, preference in enumerate(unique_preferences):
    print(f"Training model for cultural preference: {preference} ({i+1}/{len(unique_preferences)})--------------------------------------")
    
    # Filter the DataFrame for the current preference
    segment_df= master_df[master_df['cultural_preferences'] == preference].copy()

    # Check if the segment is large enough to train a model
    if len(segment_df) < 100: # You can adjust this threshold
        print(f"Segment is too small to train a reliable model. Skipping.\n")
        continue

    # Split the data into training and testing sets
    X_segment = segment_df.drop(columns=[target])
    y_segment = segment_df[target]

    # 1. Split data into a training+validation set (80%) and a final test set (20%)
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X_segment, y_segment, test_size=0.2, random_state=42
    )

    # --- 2. stateful FEATURE ENGINEERING (Applied Separately to Train and Test) ---
    
    # Learn parameters ONLY from the training set
    print("Learning feature engineering parameters from the training set...")
    # avg_adherence_train = X_train_val['treatment_adherence'].mean()
    min_dist_train = X_train_val['distance_km'].min()
    max_dist_train = X_train_val['distance_km'].max()

    # Store these learned parameters for later use in production/inference
    feature_engineering_params[preference] = {
        # 'avg_adherence': avg_adherence_train,
        'min_dist': min_dist_train,
        'max_dist': max_dist_train
    }

    # Apply the function to both train and test sets
    X_train_val = create_features(X_train_val,  min_dist_train, max_dist_train)
    X_test = create_features(X_test,  min_dist_train, max_dist_train)

    X_train_val=X_train_val[features]
    X_test=X_test[features]

    print(f"training on features: {X_train_val.columns.tolist()}")
    print(f"Training+Validation set size: {X_train_val.shape[0]} samples")
    print(f"Test set size: {X_test.shape[0]} samples")

    # --- Hyperparameter Tuning Section (using the validation data implicitly via CV) ---

    print("\nStarting random parameter search ...")

    # 1. Define the hyperparameter grid (as before)
    param_grid = {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [10, 20, 30, None],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', 1.0]
    }

    # 2. Create the ParameterSampler iterator
    # This will generate 50 random, unique combinations to test.
    n_iterations = 50 
    param_sampler = ParameterSampler(
        param_grid, 
        n_iter=n_iterations, 
        random_state=42)

    total_combinations = n_iterations # Calculate total runs

    # 3. List to store the results of every single run
    all_run_logs = []

    # 4. Loop through each parameter combination
    for run_num, params in enumerate(param_sampler):
        print(f"  > Running trial {run_num + 1}/{total_combinations}...", end='\r')
        model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)
        
        # Define the metrics to calculate during cross-validation
        scoring_metrics = {
            'neg_mse': 'neg_mean_squared_error',
            'mae': 'neg_mean_absolute_error',
            'r2': 'r2'
        }
        
        # Perform 5-fold cross-validation, returning the trained estimators
        cv_results = cross_validate(
            model, X_train_val, y_train_val, cv=5,
            scoring=scoring_metrics,
            return_estimator=True # CRITICAL: This gives us access to the models
        )
        
        # Calculate mean feature importances across the 5 folds
        fold_importances = [est.feature_importances_ for est in cv_results['estimator']]
        mean_importances = np.mean(fold_importances, axis=0)
        
        # Store all the results in our log list
        all_run_logs.append({
            'params': params,
            'mean_rmse': np.sqrt(-np.mean(cv_results['test_neg_mse'])),
            'mean_mae': -np.mean(cv_results['test_mae']),
            'mean_r2': np.mean(cv_results['test_r2']),
            'feature_importances': dict(zip(features, mean_importances))
        })
    print(f"\nSearch complete after {total_combinations} trials. Analyzing results...")

    # 5. Convert logs to a DataFrame for easy analysis
    results_df = pd.DataFrame(all_run_logs)
    results_df = results_df.sort_values(by='mean_rmse', ascending=True)

    print("\n--- Top 5 Hyperparameter Runs (based on lowest validation RMSE) ---")
    print(results_df[['params', 'mean_rmse', 'mean_mae', 'mean_r2']].head())
    
    # 6. Get the best parameters and train the final model on all training data
    best_params = results_df.iloc[0]['params']
    print(f"\nBest parameters found: {best_params}")
    best_hyperparameters[preference] = best_params
    
    best_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
    best_model.fit(X_train_val, y_train_val)
    all_run_logs_dict[preference] = results_df

    # --- Final Testing Section ---

    print("\n--- Final Evaluation on the Held-Out Test Set ---")
    print(f"Test set size: {X_test.shape[0]} encounters")
    
    # 6. Make predictions on the unseen test data
    final_predictions = best_model.predict(X_test)

    # 7. Evaluate the final model's performance
    rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
    mae = mean_absolute_error(y_test, final_predictions)
    r2 = r2_score(y_test, final_predictions)

    print("\n--- Final Model Validation Metrics ---")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE):     {mae:.4f}")
    print(f"R-squared (R²):                {r2:.4f}")

    # store the test run results 
    # You can store these metrics in a dictionary or DataFrame if needed
    test_metrics = {
        'preference': preference,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'test_set_size': X_test.shape[0]
    }

    # 8. Inspect and store the results from the best model
    learned_weights = pd.Series(best_model.feature_importances_, index=features).sort_values(ascending=False)
    print("\n--- Learned Feature Weights for this Segment ---")
    print(learned_weights)

    test_metrics_dict[preference] = test_metrics
    trained_models[preference] = best_model
    learned_weights_dict[preference] = learned_weights
    print(f"\n--- Best model for '{preference}' trained and stored ----------------------------------------------------------------")


Training model for cultural preference: No Specific Preference (1/4)--------------------------------------
Learning feature engineering parameters from the training set...
training on features: ['years_experience', 'cultural_competency_rating_prov', 'communication_rating', 'race_match', 'ethnicity_match', 'language_match', 'interpreter_services_24_7', 'proximity_score']
Training+Validation set size: 69151 samples
Test set size: 17288 samples

Starting manual GRID search (will test all combinations)...
  > Running trial 50/50...
Search complete after 50 trials. Analyzing results...

--- Top 5 Hyperparameter Runs (based on lowest validation RMSE) ---
                                               params  mean_rmse  mean_mae  \
49  {'n_estimators': 200, 'min_samples_leaf': 4, '...   0.166953  0.148241   
9   {'n_estimators': 150, 'min_samples_leaf': 2, '...   0.166959  0.148238   
1   {'n_estimators': 200, 'min_samples_leaf': 2, '...   0.166967  0.148249   
20  {'n_estimators': 200, 'min_

In [70]:
results_df.head()

Unnamed: 0,params,mean_rmse,mean_mae,mean_r2,feature_importances
23,"{'max_depth': 10, 'max_features': 'log2', 'min...",0.204832,0.177088,0.212688,"{'years_experience': 0.03661728201525532, 'cul..."
22,"{'max_depth': 10, 'max_features': 'log2', 'min...",0.204851,0.177114,0.212551,"{'years_experience': 0.036699380918818414, 'cu..."
21,"{'max_depth': 10, 'max_features': 'log2', 'min...",0.204867,0.177109,0.212431,"{'years_experience': 0.0367017636575055, 'cult..."
20,"{'max_depth': 10, 'max_features': 'log2', 'min...",0.204902,0.177101,0.212162,"{'years_experience': 0.037324154652589606, 'cu..."
19,"{'max_depth': 10, 'max_features': 'log2', 'min...",0.204925,0.177138,0.211981,"{'years_experience': 0.039459263437687755, 'cu..."


In [82]:
import pandas as pd
from openpyxl.styles import Font # Import Font for styling

# --- Report Generation Section ---
# This code should be run AFTER your training loop is complete.

print("\\n--- Generating Final Model Summary Report ---")

try:
    # 1. Consolidate the dictionaries into clean DataFrames
    metrics_df = pd.DataFrame(test_metrics_dict).T
    weights_df = pd.DataFrame(learned_weights_dict).reset_index().rename(columns={'index': 'feature'})
    hyperparams_df = pd.DataFrame(best_hyperparameters).reset_index().rename(columns={'index': 'hyperparameter'})

    # create a folder 'Summary_reports' if it doesn't exist
    if not os.path.exists('Summary_reports'):
        os.makedirs('Summary_reports')
    

    # 2. Use pandas ExcelWriter to create the report
    timestamp = pd.Timestamp.now().strftime("%m%d_%H%M")
    with pd.ExcelWriter(f'Summary_reports/{Model_code}_report_{timestamp}.xlsx', engine='openpyxl') as writer:
        sheet_name = 'Model_Summary'
        
        # --- NEW: Add Title and Summary at the top of the sheet ---
        
        # Get the workbook and worksheet objects
        workbook  = writer.book
        # Create the sheet if it doesn't exist, otherwise get it
        if sheet_name not in workbook.sheetnames:
            worksheet = workbook.create_sheet(sheet_name)
        else:
            worksheet = workbook[sheet_name]
        writer.sheets[sheet_name] = worksheet

        # a) Add Model Name (Title)
        
        worksheet['A1'] = Model_Name
        worksheet['A1'].font = Font(bold=True, size=14)

        # b) Add Model Summary
        model_summary = Model_Summary
        worksheet.merge_cells('A3:J8') # Merge cells for a nice text block
        worksheet['A3'] = model_summary
        worksheet['A3'].alignment = worksheet['A3'].alignment.copy(wrap_text=True, vertical='top')

        # --- Write DataFrames (with updated start rows) ---
        
        # Define the starting row for the first table, leaving space for the title and summary
        current_row = 11 

        # a) Write Metrics Table
        worksheet.cell(row=current_row, column=1).value = "Performance Metrics"
        worksheet.cell(row=current_row, column=1).font = Font(bold=True)
        metrics_df.to_excel(writer, sheet_name=sheet_name, startrow=current_row, index=True)
        current_row += metrics_df.shape[0] + 4 # Update current_row for the next table

        # b) Write Weights Table
        worksheet.cell(row=current_row, column=1).value = "Learned Feature Weights"
        worksheet.cell(row=current_row, column=1).font = Font(bold=True)
        weights_df.to_excel(writer, sheet_name=sheet_name, startrow=current_row, index=False)
        current_row += weights_df.shape[0] + 4

        # c) Write Hyperparameters Table
        worksheet.cell(row=current_row, column=1).value = "Best Hyperparameters"
        worksheet.cell(row=current_row, column=1).font = Font(bold=True)
        hyperparams_df.to_excel(writer, sheet_name=sheet_name, startrow=current_row, index=False)

        sheet_name = 'All_run_metrics'
        # Create the sheet if it doesn't exist, otherwise get it
        if sheet_name not in workbook.sheetnames:
            worksheet = workbook.create_sheet(sheet_name)
        else:
            worksheet = workbook[sheet_name]
        writer.sheets[sheet_name] = worksheet

        # Write all run metrics
        for i, (preference, df) in enumerate(all_run_logs_dict.items()):
            start_row = i * (df.shape[0] + 4) + 1 # Leave 4 rows between tables
            worksheet.cell(row=start_row, column=1).value = f"All Run Metrics for Preference: {preference}"
            worksheet.cell(row=start_row, column=1).font = Font(bold=True)
            df.to_excel(writer, sheet_name=sheet_name, startrow=start_row + 1, index=False)
        

    print(f"\\nSuccessfully created 'model_summary_report_final.xlsx' with title, summary, and all results on the '{sheet_name}' sheet.")

except Exception as e:
    print(f"\\nAn error occurred during report generation: {e}")
    print("Please ensure the training loop completed successfully and populated the result dictionaries.")

\n--- Generating Final Model Summary Report ---


  worksheet['A3'].alignment = worksheet['A3'].alignment.copy(wrap_text=True, vertical='top')


\nSuccessfully created 'model_summary_report_final.xlsx' with title, summary, and all results on the 'All_run_metrics' sheet.


In [80]:
import datetime
import joblib


# --- This code runs AFTER your training loop is complete ---

# 1. First, get the min and max distance from your full training data
min_dist_training = master_df['distance_km'].min()
max_dist_training = master_df['distance_km'].max()

# 2. Create a dictionary to hold all the objects you need to save
artifacts_to_save = {
    'models': trained_models,
    'feature_engineering_params': feature_engineering_params,
    
}

# 3. Save the dictionary to a single file using joblib
file_path = f'{Model_code}_artifacts.joblib'

# Create folder 'Model_Artifacts' if it doesn't exist
if not os.path.exists('Model_Artifacts'):
    os.makedirs('Model_Artifacts')
file_path = os.path.join('Model_Artifacts', file_path)

# Check if a file already exists at that path
if os.path.exists(file_path):
    print(f"Existing file found at '{file_path}'. Archiving it.")
    
    # Create a timestamp string (e.g., "20250929_091303")
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Split the original file path into its base name and extension
    base_name, extension = os.path.splitext(file_path)
    
    # Create the new name for the *old* file by inserting the timestamp
    archive_file_path = f"{base_name}_{timestamp}{extension}"
    
    # Rename the old file
    os.rename(file_path, archive_file_path)
    print(f"Renamed existing file to '{archive_file_path}'")



joblib.dump(artifacts_to_save, file_path)

print(f"\\n--- Training artifacts saved successfully to '{file_path}' ---")
print("This file contains:")
print(f"- {len(artifacts_to_save['models'])} trained model(s)")
# print(f"- Training Min Distance: {artifacts_to_save['min_dist']:.2f}")
# print(f"- Training Max Distance: {artifacts_to_save['max_dist']:.2f}")

Existing file found at 'Model_Artifacts\RFR_CompScore_artifacts.joblib'. Archiving it.
Renamed existing file to 'Model_Artifacts\RFR_CompScore_artifacts_20250929_124016.joblib'
\n--- Training artifacts saved successfully to 'Model_Artifacts\RFR_CompScore_artifacts.joblib' ---
This file contains:
- 4 trained model(s)


In [42]:
weights_df = pd.DataFrame(learned_weights_dict).reset_index(drop=False).rename(columns={'index': 'feature'})
hyperparams_df = pd.DataFrame(best_hyperparameters)


In [43]:
print(f"Total models trained and stored: {len(trained_models)}")
print(f'learned_weights_dict keys: {learned_weights_dict.keys()}')

Total models trained and stored: 4
learned_weights_dict keys: dict_keys(['No Specific Preference', 'Culturally Similar Provider', 'Culturally Similar Provider; Same Language Provider', 'Same Language Provider'])


In [44]:
# # --- Training Section ---

# # 1. Normalize the columns to a 0-1 scale
# scaler = MinMaxScaler()
# master_df[['satisfaction_norm', 'adherence_norm']] = scaler.fit_transform(
#     master_df[['patient_satisfaction', 'treatment_adherence']]
# )

# # 2. Define weights and create the composite score
# adherence_weight = 0.5
# satisfaction_weight = 0.5
# master_df['success_score'] = (
#     master_df['adherence_norm'] * adherence_weight +
#     master_df['satisfaction_norm'] * satisfaction_weight
# )

# # Rename provider competency column to avoid conflict
# master_df.rename(columns={'cultural_competency_rating_y': 'cultural_competency_rating_prov'}, inplace=True)

# # Define the features to be used by the models
# features = [
#     'years_experience',
#     'cultural_competency_rating_prov',
#     'communication_rating',
#     'race_match',
#     'ethnicity_match',
#     'language_match',
#     'proximity_score',
#     'interpreter_services_24_7'
# ]
# target = 'success_score'

# # --- Segmented Training and Testing ---

# # Get the unique preference categories to loop through
# unique_preferences = master_df['cultural_preferences'].unique()
# print("Unique cultural preferences found: ", unique_preferences)

# # A dictionary to store a trained model for each preference type
# trained_models = {}
# learned_weights_dict = {}

# i=1
 
# for preference in unique_preferences:
#     print(f'preference segment {i} of {len(unique_preferences)}')
#     print(f"Processing Segment: '{preference}' ---")
    
#     # Create a subset of the data for the current preference type
#     segment_df = master_df[master_df['cultural_preferences'] == preference].copy()
    
#     # Check if the segment is large enough to train a model
#     if len(segment_df) < 50: # You can adjust this threshold
#         print(f"Segment is too small to train a reliable model. Skipping.\n")
#         continue

#     # --- Training Section for the Segment ---
    
#     X_segment = segment_df[features]
#     y_segment = segment_df[target]

#     # 1. Split the segment's data into a training set (80%) and a testing set (20%)
#     X_train, X_test, y_train, y_test = train_test_split(X_segment, y_segment, test_size=0.2, random_state=42)

#     print(f"Training set size: {X_train.shape[0]} encounters")
#     print(f"Testing set size: {X_test.shape[0]} encounters")

#     # 2. Initialize and train a new Random Forest model for this segment
#     model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
#     model.fit(X_train, y_train)

#     # --- Testing Section for the Segment ---

#     # 3. Make predictions on the unseen test data for this segment
#     predictions = model.predict(X_test)

#     # 4. Evaluate the model's performance
#     rmse = np.sqrt(mean_squared_error(y_test, predictions))
#     mae = mean_absolute_error(y_test, predictions)
#     r2 = r2_score(y_test, predictions)

#     print("\n--- Model Validation Metrics ---")
#     print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
#     print(f"Mean Absolute Error (MAE):     {mae:.4f}")
#     print(f"R-squared (R²):                {r2:.4f}")

#     # 5. Inspect the 'Learned Weights' (Feature Importances) for this specific model
#     learned_weights = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
#     print("\n--- Learned Feature Weights for this Segment ---")
#     print(learned_weights)
    
#     # 6. Store the trained model
#     trained_models[preference] = model
#     learned_weights_dict[preference] = learned_weights
#     print(f"--- Model for '{preference}' trained and stored ---\n")
#     i=i+1

In [45]:
# convert dictionary to dataframe
learned_weights_df = pd.DataFrame(learned_weights)
learned_weights_df.index.name = 'cultural_preferences'

In [46]:
for key in learned_weights_dict.keys():
    print(f"Preference: {key}")
    print(learned_weights_dict[key].tolist())


Preference: No Specific Preference
[0.30228134356710645, 0.2587297265880648, 0.22706208377863857, 0.15078194310374118, 0.023369602943468116, 0.019298029084975906, 0.018477270934005034, 0.0]
Preference: Culturally Similar Provider
[0.27944612196356666, 0.24964349079718437, 0.23633818785769584, 0.1643383978767721, 0.024942262879027864, 0.023974037786980082, 0.021317500838773054, 0.0]
Preference: Culturally Similar Provider; Same Language Provider
[0.8291991791822869, 0.0616554138482501, 0.03980004991148326, 0.03006834916789543, 0.02568487766919535, 0.00490442265660199, 0.0045826925833540415, 0.0041050149809329]
Preference: Same Language Provider
[0.8799899038622955, 0.04331520487270709, 0.02771284454615456, 0.024068233884902663, 0.017199764006714646, 0.0028091691752090285, 0.002541019572876003, 0.002363860079140613]


# Convert learned_weights dictionary to a DataFrame
learned_weights_df = pd.DataFrame(learned_weights)
learned_weights_df.index.name = 'feature'
display(learned_weights_df)mendations for a new patient.

In [47]:
def get_recommendations(patient_id, required_specialty, 
                        # DataFrames
                        all_providers_df, 
                        all_patients_df, 
                        all_hospitals_df,
                        # Trained Models & Scaling Data
                        models_dict,
                        training_min_dist, 
                        training_max_dist):
    """
    Generates ranked doctor recommendations using the segmented model strategy.
    
    Args:
        patient_id (int): The ID of the patient seeking a recommendation.
        required_specialty (str): The medical specialty required.
        all_providers_df (pd.DataFrame): The full provider dataframe.
        all_patients_df (pd.DataFrame): The full patient dataframe.
        all_hospitals_df (pd.DataFrame): The full hospital dataframe.
        models_dict (dict): The dictionary of trained models for each preference segment.
        training_min_dist (float): The minimum distance calculated from the full training set.
        training_max_dist (float): The maximum distance calculated from the full training set.
        
    Returns:
        pd.DataFrame: A ranked dataframe of recommended providers.
    """
    print(f"\n--- Starting Recommendation Phase for Patient ID: {patient_id} ---")
    
    # --- Step 1: Patient Lookup & Model Routing ---
    patient_info = all_patients_df[all_patients_df['patient_id'] == patient_id]
    if patient_info.empty:
        return "Error: Patient ID not found."
        
    preference = patient_info['cultural_preferences'].iloc[0]
    model_to_use = None
    model_name = ""

    # Determine which trained model to use based on the patient's preference
    if preference == 'Culturally Similar Provider; Same Language Provider':
        model_name = 'Culturally Similar Provider; Same Language Provider' # Match the exact key from training
    elif preference == 'Culturally Similar Provider':
        model_name = 'Culturally Similar Provider'
    elif preference == 'Same Language Provider':
        model_name = 'Same Language Provider'
    else: # Fallback for any other preferences, like just "Same Language Provider"
        model_name = 'No Specific Preference'

    model_to_use = models_dict.get(model_name)
    print(f"Patient preference: '{preference}'. Routing to '{model_name}' model.")
        
    if not model_to_use:
        return f"Error: Model for preference group '{model_name}' was not trained (likely due to small size)."


    # --- Step 2: Candidate Generation (Hard Filters) ---
    candidate_providers_accepting_new = all_providers_df[all_providers_df['accepts_new_patients'] == True]
    previous_provider=encounter_df[encounter_df['patient_id']==patient_id]['provider_id'].unique()
    candidate_providers_not_accepting_new = all_providers_df[
        (all_providers_df['accepts_new_patients'] == False) &
        (all_providers_df['provider_id'].isin(previous_provider))
    ]
    
    candidate_providers_all=pd.concat([candidate_providers_accepting_new,candidate_providers_not_accepting_new])

    candidate_providers = candidate_providers_all[
        (candidate_providers_all['specialty'] == required_specialty)         
    ].copy()
    if candidate_providers.empty:
        print(f"No providers found for specialty '{required_specialty}' ")
        candidate_providers = candidate_providers_all

    # --- Step 3: Feature Engineering for Inference ---
    # Merge all necessary info for the patient and candidate providers
    inference_df = candidate_providers.assign(key=1).merge(patient_info.assign(key=1), on='key').drop('key', axis=1)
    inference_df = pd.merge(inference_df, all_hospitals_df, left_on='hospital_affiliation', right_on='hospital_id', how='left')

    print(f'inference_df columns: {inference_df.columns.tolist()}')

    # Re-create the exact same features used in training
    inference_df.rename(columns={'cultural_competency_rating_y': 'cultural_competency_rating_prov'}, inplace=True)
    inference_df['race_match'] = (inference_df['race'] == inference_df['provider_race']).astype(int)
    inference_df['ethnicity_match'] = (inference_df['ethnicity'] == inference_df['provider_ethnicity']).astype(int)

    # create language match if preferred_provider_language in languages_spoken (semicolon-separated string)
    def language_match_func(row):
        if pd.isna(row['languages_spoken']) or pd.isna(row['preferred_provider_language']):
            return 0
        spoken = [lang.strip() for lang in str(row['languages_spoken']).split(';')]
        return 1 if row['preferred_provider_language'] in spoken else 0

    inference_df['language_match'] = inference_df.apply(language_match_func, axis=1)

    # Geographic Feature - CRITICAL: Use the same scaling as the training data
    dist = pgeocode.GeoDistance('US')
    inference_df['distance_km'] = dist.query_postal_code(
        inference_df['zip_code_x'].astype(str).tolist(), 
        inference_df['zip_code_y'].astype(str).tolist()
    )
    # Impute missing distances using the same logic (specialty mean, then global mean)
    mean_dist_by_specialty = inference_df.groupby('specialty')['distance_km'].transform('mean')
    inference_df['distance_km'].fillna(mean_dist_by_specialty, inplace=True)
    inference_df['distance_km'].fillna(training_max_dist / 2, inplace=True) # Fallback with a reasonable value

    inference_df['proximity_score'] = 1 - (
        (inference_df['distance_km'] - training_min_dist) / (training_max_dist - training_min_dist)
    )
    # Clip scores to be between 0 and 1, in case a new distance is outside the training range
    inference_df['proximity_score'] = inference_df['proximity_score'].clip(0, 1)

    inference_df['cultural_competency_rating_prov'] = inference_df['cultural_competency_rating']
    # --- Step 4: Predict Scores ---
    # Ensure the feature list matches the one used for training
    features = [
        'years_experience', 'cultural_competency_rating_prov', 'communication_rating',
        'race_match', 'ethnicity_match', 'language_match', 'proximity_score', 
        'interpreter_services_24_7'
    ]
    X_inference = inference_df[features]
    predicted_scores = model_to_use.predict(X_inference)
    inference_df['predicted_success_score'] = predicted_scores

    # --- Step 5: Rank and Return ---
    recommendations = inference_df.sort_values(by='predicted_success_score', ascending=False)

    print("--- Recommendations Generated ---")
   # print("--- Recommendations Generated ---")
    return recommendations[['provider_id', 'first_name_x', 'last_name_x', 'specialty', 'hospital_name','distance_km', 'predicted_success_score']]



In [1]:
# --- Example Usage ---
# First, get the min and max distance from your *full* training data to pass to the function
min_dist_training = master_df['distance_km'].min()
max_dist_training = master_df['distance_km'].max()

# Replace with a real patient_id and specialty from your data
example_patient_id = 'PAT_046599' # Replace with a valid ID
example_specialty = 'Cardiology' # Replace with a valid specialty

# Ensure that there are models trained before running this
if trained_models:
    final_recommendations = get_recommendations(
        patient_id=example_patient_id,
        required_specialty=example_specialty,
        all_providers_df=provider_df,
        all_patients_df=patient_df,
        all_hospitals_df=hospital_df,
        models_dict=trained_models,
        training_min_dist=min_dist_training,
        training_max_dist=max_dist_training
    )
    display(final_recommendations.head(5))
    
else:
    print("No models were trained, cannot generate recommendations.")




NameError: name 'master_df' is not defined