In [None]:
import pandas as pd
import re
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
# Never truncate wide frames in rich output: show every column.
pd.options.display.max_columns = None

In [None]:
# Load the raw plan export and the processed plan table, in that order.
raw_plans, clean_plans = (
    pd.read_csv(csv_path)
    for csv_path in (
        '~/like-plans/data/raw/raw_plans.csv',
        '~/like-plans/data/processed/plans.csv',
    )
)

In [None]:
# Preview the first five processed plans whose plan_type is 'EPO'.
is_epo = clean_plans['plan_type'] == 'EPO'
clean_plans.loc[is_epo].head()

In [None]:
# List the distinct network names present in the raw export.
raw_plans.loc[:, 'network_name'].unique()

In [None]:
# Count how many distinct carriers appear under each network name.
(
    raw_plans
    .groupby(by='network_name')['carrier_name']
    .nunique()
)

In [None]:
# One row per distinct (carrier_name, network_name, network_size) combination.
# The trailing .copy() gives an independent frame that later cells can modify.
networks = (
    raw_plans
    .loc[:, ['carrier_name', 'network_name', 'network_size']]
    .drop_duplicates()
    .copy()
)

In [None]:
import numpy as np

# Rename network_size to ideon_network_size and add a ribbon_network_id
# column filled with NaN as a placeholder.
networks = (
    networks
    .rename(columns={'network_size': 'ideon_network_size'})
    .assign(ribbon_network_id=np.nan)
)

In [None]:
# Read the networks sample file through a home-relative path, like every other
# path in this notebook (including the cell that writes this file back), so the
# notebook is not tied to one user's machine. pandas expands the leading `~`.
network_csv = pd.read_csv('~/like-plans/data/external/networks/sample.csv')

In [None]:
# Add an empty `ribbon_id` column only when the file does not have one yet.
# This frame appears to be written back to the same sample.csv it was read
# from, so unconditionally resetting the column to NaN would erase any ids
# already stored in the file every time the notebook is re-run.
if 'ribbon_id' not in network_csv.columns:
    network_csv['ribbon_id'] = np.nan

In [None]:
# Write the frame to sample.csv; index=False keeps the row index out of the file.
network_csv.to_csv(
    path_or_buf='~/like-plans/data/external/networks/sample.csv',
    index=False,
)

In [None]:
# Feature frame for the similarity model: an independent copy of the selected
# clean_plans columns. Only in-network cost-sharing columns are used; their
# out-of-network counterparts are left out:
#   individual_medical_deductible_out_of_network
#   family_medical_deductible_out_of_network
#   individual_medical_moop_out_of_network
#   family_medical_moop_out_of_network
#   coinsurance_out_of_network
df_model = clean_plans.loc[:, [
    # plan descriptors
    'level',
    'plan_type',
    'hsa_eligible',
    'infertility_treatment_rider',
    # in-network deductibles
    'individual_medical_deductible_in_network',
    'family_medical_deductible_in_network',
    # in-network maximum out-of-pocket
    'individual_medical_moop_in_network',
    'family_medical_moop_in_network',
    # in-network coinsurance
    'coinsurance_in_network',
]].copy()

In [None]:
# A missing coinsurance is taken to mean "not applicable" and is filled with 0.
df_model["coinsurance_in_network"] = df_model["coinsurance_in_network"].fillna(0)

# Bucket the feature columns by dtype so each bucket gets its own transformer.
column_dtypes = df_model.dtypes
categorical_cols = [name for name, kind in column_dtypes.items() if kind == "object"]
boolean_columns = [name for name, kind in column_dtypes.items() if kind == "bool"]
numerical_cols = [name for name, kind in column_dtypes.items() if kind in ["int64", "float64"]]

# Per-bucket preprocessing:
#   num  -> standardised with StandardScaler
#   cat  -> one-hot encoded; categories unseen at fit time are ignored
#   bool -> rescaled to the (0, 1) range
feature_transformers = [
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('bool', MinMaxScaler(feature_range=(0, 1)), boolean_columns),
]
preprocessor = ColumnTransformer(transformers=feature_transformers)

# Fit the preprocessing on the feature frame, then fit a 100-neighbour
# lookup that uses the cosine metric on the transformed rows.
df_transformed = preprocessor.fit_transform(df_model)
knn = NearestNeighbors(n_neighbors=100, metric='cosine')
knn.fit(df_transformed)

In [None]:
# Pick one plan to use as the query and push it through the fitted
# preprocessing. The seed is fixed so that "Restart & Run All" reproduces the
# same neighbours; change or drop `random_state` to explore a different plan.
random_plan = df_model.sample(n=1, random_state=42)
random_plan_transformed = preprocessor.transform(random_plan)

# Query the fitted model. Both returned arrays have one row per query row, so
# row 0 holds the results for the single sampled plan.
distances, indices = knn.kneighbors(random_plan_transformed)
nearest_plans_indices = indices[0]

# NOTE: the column is called `similarity_score`, but it holds the cosine
# *distance* returned by kneighbors, so smaller values mean more similar plans.
distances_df = pd.DataFrame({'similarity_score': distances[0]})

# The neighbour indices are row positions in the matrix the model was fitted
# on. df_model is a column subset of clean_plans, so the same positions select
# the matching full rows with .iloc (assumes df_model rows were not reordered
# or dropped in between).
nearest_plans_full_info = clean_plans.iloc[nearest_plans_indices].reset_index(drop=True)

# Both frames now share a fresh 0..n-1 index, so a column-wise concat lines
# each plan up with its distance.
nearest_plans_full_info = pd.concat([nearest_plans_full_info, distances_df], axis=1)

# Show the ten closest plans that are not from CalChoice / CoveredCA and are
# not (numerically) identical to the query plan. A tolerance is used instead
# of an exact `!= 0` test because a cosine distance computed in floating point
# can come out as a tiny non-zero value even for identical feature vectors,
# which would let the query plan itself slip through.
zero_distance_tolerance = 1e-9
is_other_carrier = ~nearest_plans_full_info['carrier_name'].isin(['CalChoice', 'CoveredCA'])
is_distinct_plan = nearest_plans_full_info['similarity_score'] > zero_distance_tolerance
nearest_plans_full_info[is_other_carrier & is_distinct_plan].head(10)

In [None]:
# Preview the three closest rows (the original `head93)` was a syntax error:
# the opening parenthesis had been typed as `9`).
nearest_plans_full_info.head(3)