In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import subprocess
import sys
from sklearn.preprocessing import MinMaxScaler

# Install pgeocode for geographic distance calculation
try:
    import pgeocode
except ImportError:
    print("Installing pgeocode...")
    subprocess.run([sys.executable, "-m", "pip", "install", "pgeocode"], check=True)
    import pgeocode

try:
    from ethnicolr import census_ln
except ImportError:
    print("Installing ethnicolr...")
    # Note: ethnicolr requires TensorFlow. This installation might take a moment.
    subprocess.run([sys.executable, "-m", "pip", "install", "ethnicolr"], check=True)
    from ethnicolr import census_ln

import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
#file locations
parquet_file_paths={
    "patient": r"Client_Data_files\Parquets\synthetic_patients.parquet",
    "encounter": r"Client_Data_files\Parquets\synthetic_encounters.parquet",
    "hospitals": r"Client_Data_files\Parquets\synthetic_hospitals.parquet",
    "provider": r"Client_Data_files\Parquets\synthetic_providers.parquet",    
}

# Reading the parquet files
patient_df = pd.read_parquet(parquet_file_paths['patient'])
encounter_df = pd.read_parquet(parquet_file_paths['encounter'])
hospital_df = pd.read_parquet(parquet_file_paths['hospitals'])
provider_df = pd.read_parquet(parquet_file_paths['provider'])

print("Dataframes loaded successfully.")
print(f"Patient DF shape: {patient_df.shape}")
print(f"Encounter DF shape: {encounter_df.shape}")
print(f"Provider DF shape: {provider_df.shape}")
print(f"Hospital DF shape: {hospital_df.shape}")

Dataframes loaded successfully.
Patient DF shape: (100000, 16)
Encounter DF shape: (200000, 17)
Provider DF shape: (5000, 19)
Hospital DF shape: (200, 14)


In [3]:
# Creating a data map for easy access
data_map={
    "patient": patient_df,
    "encounter": encounter_df,
    "hospitals": hospital_df,
    "provider": provider_df
}

for key, df in data_map.items():
    print(f"Dataframe: {key}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")    
    for col in df.columns:
        if df[col].isna().sum() > 0:
            print(f"Column '{col}' has {df[col].isna().sum()} missing values.")
    print("")

Dataframe: patient
Shape: (100000, 16)
Columns: ['patient_id', 'first_name', 'last_name', 'date_of_birth', 'gender', 'race', 'ethnicity', 'primary_language', 'zip_code', 'insurance_type', 'household_income', 'education_level', 'age', 'cultural_background', 'preferred_provider_language', 'cultural_preferences']

Dataframe: encounter
Shape: (200000, 17)
Columns: ['encounter_id', 'patient_id', 'provider_id', 'encounter_date', 'encounter_type', 'primary_diagnosis', 'length_of_stay', 'total_cost', 'cultural_background', 'primary_language', 'languages_spoken', 'cultural_competency_rating', 'cultural_match_score', 'language_match', 'patient_satisfaction', 'treatment_adherence', 'return_visit_30_days']

Dataframe: hospitals
Shape: (200, 14)
Columns: ['hospital_id', 'hospital_name', 'hospital_type', 'zip_code', 'bed_count', 'teaching_hospital', 'trauma_center', 'language_services_available', 'cultural_competency_program', 'interpreter_services_24_7', 'community_health_programs', 'overall_rating

In [None]:
print("test print...")

test print...


In [18]:
patient_df_temp=pd.DataFrame()
patient_df_temp=patient_df[patient_df['race']=='Hispanic or Latino'][['patient_id','first_name','last_name','race','ethnicity']].copy()
patient_race_pred=census_ln(patient_df_temp, 'last_name')

2025-09-28 17:11:11,774 - INFO - Preserving 12965 duplicate rows based on column 'last_name'
2025-09-28 17:11:11,774 - INFO - Data filtering summary: 12997 → 12997 rows (kept 100.0%)


2025-09-28 17:11:11,783 - INFO - Merging demographic data for 12997 records...
2025-09-28 17:11:11,847 - INFO - Matched 12997 of 12997 rows (100.0%)
2025-09-28 17:11:11,848 - INFO - Added columns: pct2prace, pctaian, pctapi, pctblack, pcthispanic, pctwhite


In [19]:
race_mapping={
    'white': 'White',
    'black': 'Black or African American',
    'api': 'Asian',    
    'aian': 'Native American',
    '2prace': 'Other'
}

In [20]:
race_cols=['pctwhite','pctblack','pctapi','pctaian','pct2prace']
patient_race_pred['derived_race'] = patient_race_pred[race_cols].idxmax(axis=1).str.replace('pct', '').map(race_mapping)

# Create a mapping from patient_id to derived_race
id_to_derived_race = dict(zip(patient_race_pred['patient_id'], patient_race_pred['derived_race']))

# Update the race column only for Hispanic or Latino patients
patient_df.loc[patient_df['race'] == 'Hispanic or Latino', 'race'] = \
    patient_df.loc[patient_df['race'] == 'Hispanic or Latino', 'patient_id'].map(id_to_derived_race)

In [21]:
provider_race_predictions = census_ln(provider_df, 'last_name')

# Derive race for the provider_df as it is missing from the source data
print("Deriving race for providers from last names...")
race_cols = ['pctwhite','pctblack','pctapi','pctaian','pct2prace']
provider_df['provider_race'] = provider_race_predictions[race_cols].idxmax(axis=1).str.replace('pct', '').map(race_mapping)
print("Provider race derivation complete.")

# Deriving provider ethnicity from the race_predictions
print("Deriving ethnicity for providers from race predictions...")
provider_df['provider_ethnicity'] = provider_race_predictions['pcthispanic'].apply(lambda x: 'Hispanic or Latino' if float(x) >= 50 else 'Not Hispanic or Latino')
print("Provider ethnicity derivation complete.")


2025-09-28 17:11:56,058 - INFO - Preserving 4968 duplicate rows based on column 'last_name'
2025-09-28 17:11:56,059 - INFO - Data filtering summary: 5000 → 5000 rows (kept 100.0%)
2025-09-28 17:11:56,061 - INFO - Merging demographic data for 5000 records...
2025-09-28 17:11:56,132 - INFO - Matched 5000 of 5000 rows (100.0%)
2025-09-28 17:11:56,132 - INFO - Added columns: pct2prace, pctaian, pctapi, pctblack, pcthispanic, pctwhite


Deriving race for providers from last names...
Provider race derivation complete.
Deriving ethnicity for providers from race predictions...
Provider ethnicity derivation complete.


In [22]:
for key, df in data_map.items():
    print(f"Dataframe: {key}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")    
    for col in df.columns:
        if df[col].isna().sum() > 0:
            print(f"Column '{col}' has {df[col].isna().sum()} missing values.")
    print("")

Dataframe: patient
Shape: (100000, 16)
Columns: ['patient_id', 'first_name', 'last_name', 'date_of_birth', 'gender', 'race', 'ethnicity', 'primary_language', 'zip_code', 'insurance_type', 'household_income', 'education_level', 'age', 'cultural_background', 'preferred_provider_language', 'cultural_preferences']

Dataframe: encounter
Shape: (200000, 17)
Columns: ['encounter_id', 'patient_id', 'provider_id', 'encounter_date', 'encounter_type', 'primary_diagnosis', 'length_of_stay', 'total_cost', 'cultural_background', 'primary_language', 'languages_spoken', 'cultural_competency_rating', 'cultural_match_score', 'language_match', 'patient_satisfaction', 'treatment_adherence', 'return_visit_30_days']

Dataframe: hospitals
Shape: (200, 14)
Columns: ['hospital_id', 'hospital_name', 'hospital_type', 'zip_code', 'bed_count', 'teaching_hospital', 'trauma_center', 'language_services_available', 'cultural_competency_program', 'interpreter_services_24_7', 'community_health_programs', 'overall_rating

### Step 2: Feature Engineering

Here, we merge the datasets and create the features our model will learn from. This includes cultural matches, language matches, and geographic distance.

In [23]:
# Merge all data into a single master DataFrame for training
master_df = pd.merge(encounter_df, patient_df, on='patient_id',suffixes=('', '_pat'))
master_df = pd.merge(master_df, provider_df, on='provider_id',suffixes=('', '_prov'))
master_df = pd.merge(master_df, hospital_df, left_on='hospital_affiliation', right_on='hospital_id',suffixes=('', '_hosp'))

print("Master DataFrame created with shape:", master_df.shape)

Master DataFrame created with shape: (200000, 66)


In [24]:
master_df.columns

Index(['encounter_id', 'patient_id', 'provider_id', 'encounter_date',
       'encounter_type', 'primary_diagnosis', 'length_of_stay', 'total_cost',
       'cultural_background', 'primary_language', 'languages_spoken',
       'cultural_competency_rating', 'cultural_match_score', 'language_match',
       'patient_satisfaction', 'treatment_adherence', 'return_visit_30_days',
       'first_name', 'last_name', 'date_of_birth', 'gender', 'race',
       'ethnicity', 'primary_language_pat', 'zip_code', 'insurance_type',
       'household_income', 'education_level', 'age', 'cultural_background_pat',
       'preferred_provider_language', 'cultural_preferences', 'npi_number',
       'first_name_prov', 'last_name_prov', 'specialty', 'practice_zip_code',
       'years_experience', 'medical_school_country', 'board_certified',
       'languages_spoken_prov', 'interpreter_services',
       'cultural_certifications', 'minority_health_experience',
       'community_involvement', 'patient_satisfaction_sc

In [None]:
master_df['language_match']

0          True
1          True
2          True
3          True
4          True
          ...  
199995     True
199996     True
199997    False
199998     True
199999     True
Name: language_match, Length: 200000, dtype: bool

In [25]:

# --- Engineer the Match Features ---

# Cultural Features
master_df['race_match'] = (master_df['race'] == master_df['provider_race']).astype(int)
master_df['ethnicity_match'] = (master_df['ethnicity'] == master_df['provider_ethnicity']).astype(int)

# Language Feature
# # The 'languages_spoken' column is a string like '["English", "Spanish"]'. We need to parse it.
# import ast
# master_df['languages_spoken_list'] = master_df['languages_spoken'].apply(ast.literal_eval)
master_df['language_match'] = (master_df['language_match'] == True).astype(int)

# Geographic Feature
dist = pgeocode.GeoDistance('US') # Assuming US zip codes
# Calculate distance between patient and provider zip codes
master_df['distance_km'] = dist.query_postal_code(
    master_df['zip_code'].astype(str).tolist(), 
    master_df['zip_code_hosp'].astype(str).tolist()
)
# Calculate the mean distance for each provider specialty
# The .transform('mean') creates a Series with the same index as master_df,
mean_dist_by_specialty = master_df.groupby('specialty')['distance_km'].transform('mean')

# Now, fill the missing distances using these specialty-specific averages
master_df['distance_km'].fillna(mean_dist_by_specialty, inplace=True)

# If any specialties had NO valid distances, there might still be NaNs.
# Fill any remaining with the overall mean as a final fallback.
master_df['distance_km'].fillna(master_df['distance_km'].mean(), inplace=True)

# 1. First, calculate the min and max distances from data
min_distance = master_df['distance_km'].min()
max_distance = master_df['distance_km'].max()

# 2. Apply the scaling formula to create a score from 0 to 1
# The formula is: 1 - ( (x - min) / (max - min) )

master_df['proximity_score'] = 1 - (
    (master_df['distance_km'] - min_distance) / (max_distance - min_distance)
)



print("Feature engineering complete.")
master_df[['race_match', 'ethnicity_match', 'language_match', 'distance_km','proximity_score']].head()

Feature engineering complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  master_df['distance_km'].fillna(mean_dist_by_specialty, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  master_df['distance_km'].fillna(master_df['distance_km'].mean(), inplace=True)


Unnamed: 0,race_match,ethnicity_match,language_match,distance_km,proximity_score
0,1,1,1,653.678558,0.959939
1,0,0,1,1869.879967,0.885402
2,0,1,1,1792.544564,0.890142
3,1,1,1,983.320368,0.939736
4,1,1,1,444.378187,0.972766


In [26]:
print(master_df['proximity_score'].isna().sum())
master_df['proximity_score'].describe()

0


count    200000.000000
mean          0.892787
std           0.034057
min           0.000000
25%           0.890142
50%           0.893621
75%           0.894823
max           1.000000
Name: proximity_score, dtype: float64

In [None]:
#master_df.to_csv('master_df_v1_5.csv', index=False)

### Step 3: Training and Testing the Model

This is the core machine learning section. We split our data, train the model, and then test it on unseen data to validate its performance. The feature importances are the **learned weights**.

In [27]:
# --- Training Section ---

# 1. Normalize the columns to a 0-1 scale
scaler = MinMaxScaler()
master_df[['satisfaction_norm', 'adherence_norm']] = scaler.fit_transform(
    master_df[['patient_satisfaction', 'treatment_adherence']]
)

# 2. Define weights and create the composite score
adherence_weight = 0.5
satisfaction_weight = 0.5
master_df['success_score'] = (
    master_df['adherence_norm'] * adherence_weight +
    master_df['satisfaction_norm'] * satisfaction_weight
)

# Rename provider competency column to avoid conflict
master_df.rename(columns={'cultural_competency_rating_y': 'cultural_competency_rating_prov'}, inplace=True)

# Define the features to be used by the models
features = [
    'years_experience',
    'cultural_competency_rating_prov',
    'communication_rating',
    'race_match',
    'ethnicity_match',
    'language_match',
    'proximity_score',
    'interpreter_services_24_7'
]
target = 'success_score'

# Get the unique preference categories to loop through
unique_preferences = master_df['cultural_preferences'].unique()
print("Unique cultural preferences found: ", unique_preferences)



Unique cultural preferences found:  ['No Specific Preference' 'Culturally Similar Provider'
 'Culturally Similar Provider; Same Language Provider'
 'Same Language Provider']


In [None]:
# A dictionary to store a trained model for each preference type
trained_models = {}
learned_weights_dict = {}
best_hyperparameters = {} 
test_metrics_dict = {}

for i, preference in enumerate(unique_preferences):
    print(f"Training model for cultural preference: {preference} ({i+1}/{len(unique_preferences)})--------------------------------------")
    
    # Filter the DataFrame for the current preference
    segment_df= master_df[master_df['cultural_preferences'] == preference].copy()

    # Check if the segment is large enough to train a model
    if len(segment_df) < 100: # You can adjust this threshold
        print(f"Segment is too small to train a reliable model. Skipping.\n")
        continue

    # Split the data into training and testing sets
    X_segment = segment_df[features]
    y_segment = segment_df[target]

    # 1. Split data into a training+validation set (80%) and a final test set (20%)
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X_segment, y_segment, test_size=0.2, random_state=42
    )

    # --- Hyperparameter Tuning Section (using the validation data implicitly via CV) ---

    # 2. Define the grid of hyperparameters to search
    param_grid = {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [10, 20, 30, None],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', 1.0]
    }

    # 3. Set up Randomized Search with 5-fold cross-validation
    rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    random_search = RandomizedSearchCV(
        estimator=rf, 
        param_distributions=param_grid, 
        n_iter=50,  # Number of parameter settings that are sampled
        cv=5,       # 5-fold cross-validation
        verbose=0,  # Set to 1 or 2 for more detailed output
        random_state=42, 
        scoring='neg_mean_squared_error' # We use negative MSE for optimization
    )

    print(f'Starting hyperparameter tuning on {len(X_train_val)} samples...')
    # 4. Fit the random search to the training+validation data
    random_search.fit(X_train_val, y_train_val)

    print(f'Tuning complete. Best parameters found:')
    print(random_search.best_params_)

    # 5. Get the best model found during the search
    best_hyperparameters[preference] = random_search.best_params_ # <-- NEW LINE: Store the best params
    best_model = random_search.best_estimator_

    # --- Final Testing Section ---

    print("\n--- Final Evaluation on the Held-Out Test Set ---")
    print(f"Test set size: {X_test.shape[0]} encounters")
    
    # 6. Make predictions on the unseen test data
    final_predictions = best_model.predict(X_test)

    # 7. Evaluate the final model's performance
    rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
    mae = mean_absolute_error(y_test, final_predictions)
    r2 = r2_score(y_test, final_predictions)

    print("\n--- Final Model Validation Metrics ---")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE):     {mae:.4f}")
    print(f"R-squared (R²):                {r2:.4f}")

    # store the test run results 
    # You can store these metrics in a dictionary or DataFrame if needed
    test_metrics = {
        'preference': preference,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'test_set_size': X_test.shape[0]
    }

    # 8. Inspect and store the results from the best model
    learned_weights = pd.Series(best_model.feature_importances_, index=features).sort_values(ascending=False)
    print("\n--- Learned Feature Weights for this Segment ---")
    print(learned_weights)

    test_metrics_dict[preference] = test_metrics
    trained_models[preference] = best_model
    learned_weights_dict[preference] = learned_weights
    print(f"\n--- Best model for '{preference}' trained and stored ----------------------------------------------------------------")


Training model for cultural preference: No Specific Preference (1/4)--------------------------------------
Starting hyperparameter tuning on 69151 samples...
Tuning complete. Best parameters found:
{'n_estimators': 200, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10}

--- Final Evaluation on the Held-Out Test Set ---
Test set size: 17288 encounters

--- Final Model Validation Metrics ---
Root Mean Squared Error (RMSE): 0.1095
Mean Absolute Error (MAE):     0.0906
R-squared (R²):                -0.0002

--- Learned Feature Weights for this Segment ---
cultural_competency_rating_prov    0.302281
proximity_score                    0.258730
communication_rating               0.227062
years_experience                   0.150782
interpreter_services_24_7          0.023370
race_match                         0.019298
ethnicity_match                    0.018477
language_match                     0.000000
dtype: float64

--- Best model for 'No Specific Preference' trained and sto

In [None]:
weights_df = pd.DataFrame(learned_weights_dict).reset_index(drop=False).rename(columns={'index': 'feature'})
hyperparams_df = pd.DataFrame(best_hyperparameters)


Unnamed: 0,No Specific Preference,Culturally Similar Provider,Culturally Similar Provider; Same Language Provider,Same Language Provider
n_estimators,200,150,200,200
min_samples_leaf,4,4,4,4
max_features,log2,sqrt,log2,log2
max_depth,10,10,10,10


In [None]:
print(f"Total models trained and stored: {len(trained_models)}")
print(f'learned_weights_dict keys: {learned_weights_dict.keys()}')

Total models trained and stored: 4
learned_weights_dict keys: dict_keys(['No Specific Preference', 'Culturally Similar Provider', 'Culturally Similar Provider; Same Language Provider', 'Same Language Provider'])


In [None]:
# # --- Training Section ---

# # 1. Normalize the columns to a 0-1 scale
# scaler = MinMaxScaler()
# master_df[['satisfaction_norm', 'adherence_norm']] = scaler.fit_transform(
#     master_df[['patient_satisfaction', 'treatment_adherence']]
# )

# # 2. Define weights and create the composite score
# adherence_weight = 0.5
# satisfaction_weight = 0.5
# master_df['success_score'] = (
#     master_df['adherence_norm'] * adherence_weight +
#     master_df['satisfaction_norm'] * satisfaction_weight
# )

# # Rename provider competency column to avoid conflict
# master_df.rename(columns={'cultural_competency_rating_y': 'cultural_competency_rating_prov'}, inplace=True)

# # Define the features to be used by the models
# features = [
#     'years_experience',
#     'cultural_competency_rating_prov',
#     'communication_rating',
#     'race_match',
#     'ethnicity_match',
#     'language_match',
#     'proximity_score',
#     'interpreter_services_24_7'
# ]
# target = 'success_score'

# # --- Segmented Training and Testing ---

# # Get the unique preference categories to loop through
# unique_preferences = master_df['cultural_preferences'].unique()
# print("Unique cultural preferences found: ", unique_preferences)

# # A dictionary to store a trained model for each preference type
# trained_models = {}
# learned_weights_dict = {}

# i=1
 
# for preference in unique_preferences:
#     print(f'preference segment {i} of {len(unique_preferences)}')
#     print(f"Processing Segment: '{preference}' ---")
    
#     # Create a subset of the data for the current preference type
#     segment_df = master_df[master_df['cultural_preferences'] == preference].copy()
    
#     # Check if the segment is large enough to train a model
#     if len(segment_df) < 50: # You can adjust this threshold
#         print(f"Segment is too small to train a reliable model. Skipping.\n")
#         continue

#     # --- Training Section for the Segment ---
    
#     X_segment = segment_df[features]
#     y_segment = segment_df[target]

#     # 1. Split the segment's data into a training set (80%) and a testing set (20%)
#     X_train, X_test, y_train, y_test = train_test_split(X_segment, y_segment, test_size=0.2, random_state=42)

#     print(f"Training set size: {X_train.shape[0]} encounters")
#     print(f"Testing set size: {X_test.shape[0]} encounters")

#     # 2. Initialize and train a new Random Forest model for this segment
#     model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
#     model.fit(X_train, y_train)

#     # --- Testing Section for the Segment ---

#     # 3. Make predictions on the unseen test data for this segment
#     predictions = model.predict(X_test)

#     # 4. Evaluate the model's performance
#     rmse = np.sqrt(mean_squared_error(y_test, predictions))
#     mae = mean_absolute_error(y_test, predictions)
#     r2 = r2_score(y_test, predictions)

#     print("\n--- Model Validation Metrics ---")
#     print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
#     print(f"Mean Absolute Error (MAE):     {mae:.4f}")
#     print(f"R-squared (R²):                {r2:.4f}")

#     # 5. Inspect the 'Learned Weights' (Feature Importances) for this specific model
#     learned_weights = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
#     print("\n--- Learned Feature Weights for this Segment ---")
#     print(learned_weights)
    
#     # 6. Store the trained model
#     trained_models[preference] = model
#     learned_weights_dict[preference] = learned_weights
#     print(f"--- Model for '{preference}' trained and stored ---\n")
#     i=i+1

Unique cultural preferences found:  ['No Specific Preference' 'Culturally Similar Provider'
 'Culturally Similar Provider; Same Language Provider'
 'Same Language Provider']
preference segment 1 of 4
Processing Segment: 'No Specific Preference' ---
Training set size: 69151 encounters
Testing set size: 17288 encounters
Training set size: 69151 encounters
Testing set size: 17288 encounters

--- Model Validation Metrics ---
Root Mean Squared Error (RMSE): 0.1151
Mean Absolute Error (MAE):     0.0946
R-squared (R²):                -0.1067

--- Learned Feature Weights for this Segment ---
proximity_score                    0.386974
communication_rating               0.230190
cultural_competency_rating_prov    0.164537
years_experience                   0.143111
interpreter_services_24_7          0.027562
race_match                         0.025218
ethnicity_match                    0.022407
language_match                     0.000000
dtype: float64
--- Model for 'No Specific Preference' tra

In [None]:
# convert dictionary to dataframe
learned_weights_df = pd.DataFrame(learned_weights)
learned_weights_df.index.name = 'cultural_preferences'

In [None]:
for key in learned_weights_dict.keys():
    print(f"Preference: {key}")
    print(learned_weights_dict[key].tolist())


Preference: No Specific Preference
[0.386974489821484, 0.23019002327162547, 0.16453684159692655, 0.14311135388525828, 0.02756165400768089, 0.025218383833930574, 0.022407253583094255, 0.0]
Preference: Culturally Similar Provider
[0.23638293508171723, 0.22989727272492963, 0.16519773576037672, 0.15182700676172423, 0.09477007405746572, 0.09317003986773503, 0.028754935746051466, 0.0]
Preference: Culturally Similar Provider; Same Language Provider
[0.4051664254176326, 0.15620424810482433, 0.11830900818510516, 0.11493618900534715, 0.09795053882928924, 0.04520594833335658, 0.043516902229710705, 0.01871073989473434]
Preference: Same Language Provider
[0.5208611621416773, 0.13212231087597104, 0.1289808596215443, 0.09722874995700181, 0.0820571080118914, 0.014511249376409916, 0.012325471480068885, 0.011913088535435318]


# Convert learned_weights dictionary to a DataFrame
learned_weights_df = pd.DataFrame(learned_weights)
learned_weights_df.index.name = 'feature'
display(learned_weights_df)mendations for a new patient.

In [None]:
def get_recommendations(patient_id, required_specialty, 
                        # DataFrames
                        all_providers_df, 
                        all_patients_df, 
                        all_hospitals_df,
                        # Trained Models & Scaling Data
                        models_dict,
                        training_min_dist, 
                        training_max_dist):
    """
    Generates ranked doctor recommendations using the segmented model strategy.
    
    Args:
        patient_id (int): The ID of the patient seeking a recommendation.
        required_specialty (str): The medical specialty required.
        all_providers_df (pd.DataFrame): The full provider dataframe.
        all_patients_df (pd.DataFrame): The full patient dataframe.
        all_hospitals_df (pd.DataFrame): The full hospital dataframe.
        models_dict (dict): The dictionary of trained models for each preference segment.
        training_min_dist (float): The minimum distance calculated from the full training set.
        training_max_dist (float): The maximum distance calculated from the full training set.
        
    Returns:
        pd.DataFrame: A ranked dataframe of recommended providers.
    """
    print(f"\n--- Starting Recommendation Phase for Patient ID: {patient_id} ---")
    
    # --- Step 1: Patient Lookup & Model Routing ---
    patient_info = all_patients_df[all_patients_df['patient_id'] == patient_id]
    if patient_info.empty:
        return "Error: Patient ID not found."
        
    preference = patient_info['cultural_preferences'].iloc[0]
    model_to_use = None
    model_name = ""

    # Determine which trained model to use based on the patient's preference
    if preference == 'Culturally Similar Provider; Same Language Provider':
        model_name = 'Culturally Similar Provider; Same Language Provider' # Match the exact key from training
    elif preference == 'Culturally Similar Provider':
        model_name = 'Culturally Similar Provider'
    elif preference == 'Same Language Provider':
        model_name = 'Same Language Provider'
    else: # Fallback for any other preferences, like just "Same Language Provider"
        model_name = 'No Specific Preference'

    model_to_use = models_dict.get(model_name)
    print(f"Patient preference: '{preference}'. Routing to '{model_name}' model.")
        
    if not model_to_use:
        return f"Error: Model for preference group '{model_name}' was not trained (likely due to small size)."


    # --- Step 2: Candidate Generation (Hard Filters) ---
    candidate_providers_accepting_new = all_providers_df[all_providers_df['accepts_new_patients'] == True]
    previous_provider=encounter_df[encounter_df['patient_id']==patient_id]['provider_id'].unique()
    candidate_providers_not_accepting_new = all_providers_df[
        (all_providers_df['accepts_new_patients'] == False) &
        (all_providers_df['provider_id'].isin(previous_provider))
    ]
    
    candidate_providers_all=pd.concat([candidate_providers_accepting_new,candidate_providers_not_accepting_new])

    candidate_providers = candidate_providers_all[
        (candidate_providers_all['specialty'] == required_specialty)         
    ].copy()
    if candidate_providers.empty:
        print(f"No providers found for specialty '{required_specialty}' ")
        candidate_providers = candidate_providers_all

    # --- Step 3: Feature Engineering for Inference ---
    # Merge all necessary info for the patient and candidate providers
    inference_df = candidate_providers.assign(key=1).merge(patient_info.assign(key=1), on='key').drop('key', axis=1)
    inference_df = pd.merge(inference_df, all_hospitals_df, left_on='hospital_affiliation', right_on='hospital_id', how='left')

    print(f'inference_df columns: {inference_df.columns.tolist()}')

    # Re-create the exact same features used in training
    inference_df.rename(columns={'cultural_competency_rating_y': 'cultural_competency_rating_prov'}, inplace=True)
    inference_df['race_match'] = (inference_df['race'] == inference_df['provider_race']).astype(int)
    inference_df['ethnicity_match'] = (inference_df['ethnicity'] == inference_df['provider_ethnicity']).astype(int)

    # create language match if preferred_provider_language in languages_spoken (semicolon-separated string)
    def language_match_func(row):
        if pd.isna(row['languages_spoken']) or pd.isna(row['preferred_provider_language']):
            return 0
        spoken = [lang.strip() for lang in str(row['languages_spoken']).split(';')]
        return 1 if row['preferred_provider_language'] in spoken else 0

    inference_df['language_match'] = inference_df.apply(language_match_func, axis=1)

    # Geographic Feature - CRITICAL: Use the same scaling as the training data
    dist = pgeocode.GeoDistance('US')
    inference_df['distance_km'] = dist.query_postal_code(
        inference_df['zip_code_x'].astype(str).tolist(), 
        inference_df['zip_code_y'].astype(str).tolist()
    )
    # Impute missing distances using the same logic (specialty mean, then global mean)
    mean_dist_by_specialty = inference_df.groupby('specialty')['distance_km'].transform('mean')
    inference_df['distance_km'].fillna(mean_dist_by_specialty, inplace=True)
    inference_df['distance_km'].fillna(training_max_dist / 2, inplace=True) # Fallback with a reasonable value

    inference_df['proximity_score'] = 1 - (
        (inference_df['distance_km'] - training_min_dist) / (training_max_dist - training_min_dist)
    )
    # Clip scores to be between 0 and 1, in case a new distance is outside the training range
    inference_df['proximity_score'] = inference_df['proximity_score'].clip(0, 1)

    inference_df['cultural_competency_rating_prov'] = inference_df['cultural_competency_rating']
    # --- Step 4: Predict Scores ---
    # Ensure the feature list matches the one used for training
    features = [
        'years_experience', 'cultural_competency_rating_prov', 'communication_rating',
        'race_match', 'ethnicity_match', 'language_match', 'proximity_score', 
        'interpreter_services_24_7'
    ]
    X_inference = inference_df[features]
    predicted_scores = model_to_use.predict(X_inference)
    inference_df['predicted_success_score'] = predicted_scores

    # --- Step 5: Rank and Return ---
    recommendations = inference_df.sort_values(by='predicted_success_score', ascending=False)

    print("--- Recommendations Generated ---")
   # print("--- Recommendations Generated ---")
    return recommendations[['provider_id', 'first_name_x', 'last_name_x', 'specialty', 'hospital_name','distance_km', 'predicted_success_score']]



In [None]:
# --- Example Usage ---
# First, get the min and max distance from your *full* training data to pass to the function
min_dist_training = master_df['distance_km'].min()
max_dist_training = master_df['distance_km'].max()

# Replace with a real patient_id and specialty from your data
example_patient_id = 'PAT_000001' # Replace with a valid ID
example_specialty = 'Cardiology' # Replace with a valid specialty

# Ensure that there are models trained before running this
if trained_models:
    final_recommendations = get_recommendations(
        patient_id=example_patient_id,
        required_specialty=example_specialty,
        all_providers_df=provider_df,
        all_patients_df=patient_df,
        all_hospitals_df=hospital_df,
        models_dict=trained_models,
        training_min_dist=min_dist_training,
        training_max_dist=max_dist_training
    )
    display(final_recommendations.head(5))
    
else:
    print("No models were trained, cannot generate recommendations.")





--- Starting Recommendation Phase for Patient ID: PAT_000001 ---
Patient preference: 'No Specific Preference'. Routing to 'No Specific Preference' model.
inference_df columns: ['provider_id', 'npi_number', 'first_name_x', 'last_name_x', 'specialty', 'practice_zip_code', 'years_experience', 'medical_school_country', 'board_certified', 'languages_spoken', 'interpreter_services', 'cultural_certifications', 'minority_health_experience', 'community_involvement', 'patient_satisfaction_score', 'communication_rating', 'cultural_competency_rating', 'hospital_affiliation', 'accepts_new_patients', 'provider_race', 'provider_ethnicity', 'patient_id', 'first_name_y', 'last_name_y', 'date_of_birth', 'gender', 'race', 'ethnicity', 'primary_language', 'zip_code_x', 'insurance_type', 'household_income', 'education_level', 'age', 'cultural_background', 'preferred_provider_language', 'cultural_preferences', 'hospital_id', 'hospital_name', 'hospital_type', 'zip_code_y', 'bed_count', 'teaching_hospital', 

Unnamed: 0,provider_id,first_name_x,last_name_x,specialty,hospital_name,distance_km,predicted_success_score
143,PROV_02450,John,Hernandez,Cardiology,Metro Medical Center,8158.457169,0.905949
173,PROV_03111,Ahmed,Lee,Cardiology,General Medical Complex,8158.457169,0.89997
119,PROV_02174,Antonio,Brown,Cardiology,General Hospital,8158.457169,0.885381
203,PROV_03737,Sofia,Mohamed,Cardiology,Community Medical Complex,8158.457169,0.885328
121,PROV_02179,Wei,Lopez,Cardiology,Memorial Medical Center,8158.457169,0.883657
