In [2]:
import pandas as pd
import re
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows', None)

In [36]:
raw_plans = pd.read_csv('~/like-plans/data/raw/raw_plans.csv')
hmo_epo_plans = pd.read_csv('~/like-plans/data/processed/hmo_epo_plans.csv')
ppo_pos_plans = pd.read_csv('~/like-plans/data/processed/ppo_pos_plans.csv')

hmo_epo_plans.set_index("id", inplace=True)
ppo_pos_plans.set_index("id", inplace=True)

In [68]:
def build_model(df: pd.DataFrame):
    # detect numerical columns
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    # exclude these columns
    exclude_columns = ['carrier_name', 'name']

    # columns to pass in
    columns_for_model = [col for col in numerical_columns if col not in exclude_columns]

    preprocessor = ColumnTransformer(
        transformers=[
            # Assuming 'numerical_cols' and 'categorical_cols' are defined
            ('num', StandardScaler(), columns_for_model)
        ],
        remainder='passthrough'
    )
    df_transformed = preprocessor.fit_transform(df)
    model = NearestNeighbors(metric='cosine')
    model.fit(df_transformed)
    return model, preprocessor

model_hmo_epo, preprocessor_hmo_epo = build_model(hmo_epo_plans)
model_ppo_pos, preprocessor_ppo_pos = build_model(ppo_pos_plans)

ValueError: could not convert string to float: 'Anthem'

In [65]:
def get_feature_columns(df):
    # Detect numerical columns
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Exclude specific columns
    exclude_columns = ['carrier_name', 'name']

    # Columns to pass in
    feature_columns = [col for col in numerical_columns if col not in exclude_columns]
    
    return feature_columns

# Get feature columns for each DataFrame
feature_columns_hmo_epo = get_feature_columns(hmo_epo_plans)
feature_columns_ppo_pos = get_feature_columns(ppo_pos_plans)


def find_similar_plans(plan_id, df, model, preprocessor, feature_columns):
    # Look up the plan by its ID
    if plan_id in df.index:
        input_plan = df.loc[plan_id]
    else:
        raise ValueError("Plan ID not found in the dataset")

    # Exclude 'carrier_name' and 'name' from the input_plan for prediction
    input_plan_features = input_plan[feature_columns]

    # Prepare the input_plan for prediction
    input_plan_transformed = preprocessor.transform([input_plan_features])

    # Find similar plans
    distances, indices = model.kneighbors(input_plan_transformed)

    # Extract similar plans for each carrier in the dataset
    similar_plans = df.iloc[indices[0]]
    similar_plans_per_carrier = similar_plans.groupby('carrier_name').first()

    return similar_plans_per_carrier


In [66]:
input_plan = raw_plans.sample(n=1).iloc[0]

In [67]:
input_plan = raw_plans.sample(n=1).iloc[0]
plan_id = input_plan["id"]
if plan_id in hmo_epo_plans.index:
    similar_plans = find_similar_plans(plan_id, hmo_epo_plans, model_hmo_epo, preprocessor_hmo_epo, feature_columns_hmo_epo)
elif plan_id in ppo_pos_plans.index:
    similar_plans = find_similar_plans(plan_id, ppo_pos_plans, model_ppo_pos, preprocessor_ppo_pos, feature_columns_ppo_pos)
else:
    raise ValueError("Plan ID not found in either dataset")


ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [30]:
if "27603CA1210865" in hmo_epo_plans["id"].to_list():
    print('Yes')

Yes


In [29]:
hmo_epo_plans.head(5)

Unnamed: 0,id,carrier_name,name,hsa_eligible,infertility_treatment_rider,individual_medical_deductible_in_network,family_medical_deductible_in_network,individual_medical_moop_in_network,family_medical_moop_in_network,coinsurance_in_network,level_bronze,level_expanded_bronze,level_gold,level_platinum,level_silver,plan_type_EPO,plan_type_HMO
0,27603CA1210865,Anthem,Anthem Platinum HMO 0/30,0,0,0,0,2700,5400,0,0,0,0,1,0,0,1
1,27603CA1210884,Anthem,Anthem Platinum Select HMO 0/30,0,0,0,0,2700,5400,0,0,0,0,1,0,0,1
2,27603CA1211080,Anthem,Anthem Platinum Priority Select HMO 0/30,0,0,0,0,2700,5400,0,0,0,0,1,0,0,1
3,27603CA1210551,Anthem,Anthem Gold HMO 30,0,0,0,0,6600,13200,0,0,0,1,0,0,0,1
4,27603CA1210290,Anthem,Anthem Gold HMO 35,0,0,0,0,6750,13500,0,0,0,1,0,0,0,1
