In [1]:
import numpy as np
import pandas as pd

In [31]:
data = pd.read_csv("data_new.csv")

In [3]:
# To find how much missing data is in each column:
# Count the missing values in each column
missing_values = data.isnull().sum()

# Sum the total number of missing values in the entire DataFrame
total_missing_values = missing_values.sum()

# Print the results
print("Number of missing values in each column:")
print(missing_values)
print("\nTotal number of missing values in the DataFrame:", total_missing_values)

Number of missing values in each column:
id                     0
risk_score_t           0
program_enrolled_t     0
cost_t                 0
cost_avoidable_t       0
                      ..
trig_max-low_tm1       0
trig_max-high_tm1      0
trig_max-normal_tm1    0
gagne_sum_tm1          0
gagne_sum_t            0
Length: 161, dtype: int64

Total number of missing values in the DataFrame: 127857


In [None]:
data_drop = data.dropna()

If we drop all the NaN values, there will be an imbalanced number of white and black subjects, therefore, we will perform imputation.

In [11]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline



In [32]:
race_mapping = {'black': 0, 'white': 1}

# Apply the mapping to the 'Race' column
data['race'] = data['race'].map(race_mapping)
data_use = data.drop(columns = ["id","program_enrolled_t", "cost_t", "cost_avoidable_t", "cost_emergency_tm1", "cost_home_health_tm1", "cost_ip_medical_tm1", "cost_ip_surgical_tm1", "cost_laboratory_tm1", "cost_op_primary_care_tm1", "cost_op_specialists_tm1", "cost_op_surgery_tm1", "cost_other_tm1", "cost_pharmacy_tm1", "cost_physical_therapy_tm1", "cost_radiology_tm1"])

In [36]:
def evaluate_imputation(X, y, imputer, model):
    print(np.sum(np.isnan(X), axis = 0).sum())
    X_imputed = imputer.fit_transform(X)
    # To check that imputation worked
    missing_values = np.sum(np.isnan(X_imputed), axis=0)

    print("Number of missing values: ", missing_values.sum())
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [37]:
mean_imputer = SimpleImputer(strategy='mean')
mean_mse = evaluate_imputation(data_use.drop('risk_score_t', axis=1), data_use['risk_score_t'], mean_imputer, LinearRegression())
print(f"Mean Imputation MSE: {mean_mse}")

127857
Number of missing values:  0
Mean Imputation MSE: 14.202401501649728


In [38]:
# Median imputation
median_imputer = SimpleImputer(strategy='median')
median_mse = evaluate_imputation(data_use.drop('risk_score_t', axis=1), data_use['risk_score_t'], median_imputer, LinearRegression())
print(f"Median Imputation MSE: {median_mse}")

127857
Number of missing values:  0
Median Imputation MSE: 14.189681550319609


In [26]:
knn_imputer = KNNImputer(n_neighbors=5)
knn_mse = evaluate_imputation(data_use.drop('risk_score_t', axis=1), data_use['risk_score_t'], knn_imputer, LinearRegression())
print(f"KNN Imputation MSE: {knn_mse}")

176641
Number of missing values:  0
KNN Imputation MSE: 13.827733774434314


The KNN imputation strategy seems to have the lowest loss and provide the closest estimate. 


In [None]:
X_imputed = knn_imputer.fit_transform(data_use)
imputed_df = pd.DataFrame(X_imputed, columns = data_use.columns)

array([[  1.98742993, 127.33327212,   5.4       , ...,   0.        ,
          0.        ,   0.        ],
       [  7.67793443, 119.        ,   5.5       , ...,   1.        ,
          4.        ,   3.        ],
       [  0.40767793, 127.33327212,   5.95933067, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  1.35892645, 105.        ,   5.95933067, ...,   1.        ,
          1.        ,   0.        ],
       [ 10.99031765, 132.        ,   5.95933067, ...,   0.        ,
          3.        ,   3.        ],
       [  1.68167148, 115.        ,   5.6       , ...,   0.        ,
          0.        ,   0.        ]])