## Import Statements


In [None]:
# Standard library imports
import sys

# Third-party imports
import pandas as pd
import numpy as np
import importlib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score

# Local application imports
sys.path.append('../')
import src.utils
importlib.reload(src.utils)
from src.utils import SaveModelMetrics

# Other settings: show every column and every row when a DataFrame is displayed
# (None removes pandas' display limit for that option).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)


## Read in Small Group Dataset

These are the fields from the datasets that we want

- Census Data
  - census_carrier_name
  - plan_admin_name
- Ideon Data
  - id
  - carrier_name
  - name
  - plan_type
  - level
  - primary_care_physician
  - network_name
  - coinsurance
  - issuer_plan_code
  - hsa_eligible
  - individual_medical_deductible
- Evaluation Class
  - is_match

In [None]:
# Columns kept for modelling: the record id, the census fields, the Ideon
# plan fields, and the is_match evaluation label.
small_group_columns = [
    'id',
    'census_carrier_name',
    'plan_admin_name',
    'carrier_name',
    'name',
    'plan_type',
    'level',
    'primary_care_physician',
    'network_name',
    'coinsurance',
    'issuer_plan_code',
    'hsa_eligible',
    'individual_medical_deductible',
    'is_match',
]

# Load the raw small-group dataset; index_col=False stops pandas from
# using the first column as the index.
small_group_df = pd.read_csv(
    '~/match-plans/data/small_group_dataset.csv',
    index_col=False,
)
full_df = small_group_df[small_group_columns]


## Naive Model

* This model is meant to be extremely simple: take the carrier and plan-name fields from the census and from Ideon, concatenate them into a single string, and turn that string into TF-IDF text features
* Let us see how far we can get without any sort of feature engineering at all
* We aren't going to do much hyperparameter tuning on this model, just a few folds and a few parameters

In [None]:
# Naive model frame: only the carrier / plan-name text fields plus the label.
# .copy() makes this an independent DataFrame, so adding the 'text' column
# to it later does not trigger pandas' SettingWithCopyWarning.
naive_df = full_df[
    ['census_carrier_name', 'carrier_name', 'plan_admin_name', 'name', 'is_match']
].copy()

In [None]:
# Combine the text columns into a single string.
# Missing values are replaced with '' first: `str + NaN` yields NaN, which
# would make the whole combined text NaN, and TfidfVectorizer raises on NaN
# documents. Rows without missing values produce exactly the same text as before.
text_columns = ['census_carrier_name', 'carrier_name', 'plan_admin_name', 'name']
text_parts = naive_df[text_columns].fillna('').astype(str)
naive_df['text'] = (
    text_parts['census_carrier_name'] + ' ' + text_parts['carrier_name']
    + ' ' + text_parts['plan_admin_name'] + ' ' + text_parts['name']
)

# Create a TF-IDF Vectorizer (vocabulary capped at 500 terms)
vectorizer = TfidfVectorizer(max_features=500)

# Fit the vectorizer on the combined text and build the dense feature matrix;
# the target is the is_match label
X = vectorizer.fit_transform(naive_df['text']).toarray()
y = naive_df['is_match']


In [None]:
# Hold out 20% of the rows for the final evaluation (fixed seed so the
# split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Small search space: number of trees x maximum tree depth
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
)

# Base estimator with default XGBoost settings; the grid search overrides
# the two parameters above
model = XGBClassifier()

# 5-fold cross-validated grid search, selecting on F1
grid = GridSearchCV(model, param_grid, scoring='f1', cv=5, verbose=1)

# Run the search on the training split only
grid.fit(X_train, y_train)

# Report the winning grid point and the full parameter set of the refit model
print('Best parameters found: ', grid.best_params_)
print('All parameters: ', grid.best_estimator_.get_params())

# Use the best model found by the grid search to make predictions on the
# held-out test set
y_pred = grid.best_estimator_.predict(X_test)

# Evaluate the model on the test set; each metric is computed once and the
# stored value is reused for both printing and saving
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')
# Record this run's parameters and test-set metrics through the project's
# SaveModelMetrics helper (src.utils).
# NOTE(review): presumably this writes a row to model_results.csv — confirm in src.utils.
metrics_writer = SaveModelMetrics(file_path='~/match-plans/data/model_results.csv')
metrics_writer.save_metrics(
    name='naive_model',
    description='A naive model that only concantenates the census information and matching ideon information to try and predict is_match using text embeddings',
    algorithm='XGBoostClassifier',
    best_params_grid_search=grid.best_params_,
    params=grid.best_estimator_.get_params(),
    accuracy_score=accuracy,
    precision_score=precision,
    recall=recall,
    f1_score=f1,
    conf_matrix=conf_matrix,
)