In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_curve, auc, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'xgboost'

In [2]:
file_paths = {
    "Claims": "./HHP_release3/Claims.csv",
    "DaysInHospital_Y2": "./HHP_release3/DaysInHospital_Y2.csv",
    "DaysInHospital_Y3": "./HHP_release3/DaysInHospital_Y3.csv",
    "DrugCount": "./HHP_release3/DrugCount.csv",
    "LabCount": "./HHP_release3/LabCount.csv",
    "Members": "./HHP_release3/Members.csv",
    "Target": "./HHP_release3/Target.csv"
}

# Load datasets
claims_df = pd.read_csv(file_paths['Claims'])
days_in_hospital_y2_df = pd.read_csv(file_paths['DaysInHospital_Y2'])
days_in_hospital_y3_df = pd.read_csv(file_paths['DaysInHospital_Y3'])
drug_count_df = pd.read_csv(file_paths['DrugCount'])
lab_count_df = pd.read_csv(file_paths['LabCount'])
members_df = pd.read_csv(file_paths['Members'])
target_df = pd.read_csv(file_paths['Target'])

In [3]:
def missing_data_summary(df, dataset_name):
    print(f"Missing values in {dataset_name}:")
    missing_data = df.isnull().sum()
    missing_percentage = (missing_data / len(df)) * 100
    missing_summary = pd.DataFrame({'Missing Values': missing_data, 'Percentage': missing_percentage})
    print(missing_summary[missing_summary['Missing Values'] > 0])  # Only show columns with missing values
    print("\n")

# Get missing data summary for each dataset
missing_data_summary(claims_df, "Claims")
missing_data_summary(days_in_hospital_y2_df, "DaysInHospital_Y2")
missing_data_summary(days_in_hospital_y3_df, "DaysInHospital_Y3")
missing_data_summary(drug_count_df, "DrugCount")
missing_data_summary(lab_count_df, "LabCount")
missing_data_summary(members_df, "Members")
missing_data_summary(target_df, "Target")

Missing values in Claims:
                       Missing Values  Percentage
ProviderID                      16264    0.609369
Vendor                          24856    0.931289
PCP                              7492    0.280705
Specialty                        8405    0.314913
PlaceSvc                         7632    0.285951
LengthOfStay                  2597392   97.317412
DSFS                            52770    1.977152
PrimaryConditionGroup           11410    0.427503
ProcedureGroup                   3675    0.137693


Missing values in DaysInHospital_Y2:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


Missing values in DaysInHospital_Y3:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


Missing values in DrugCount:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


Missing values in LabCount:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


Missing values in Members:
                 Missing Values  Percentage
AgeAtF

In [5]:
# Define mapping for 'DSFS' conversion to numeric
dsfs_mapping = {
    '0- 1 month': 1,
    '1- 2 months': 2,
    '2- 3 months': 3,
    '3- 4 months': 4,
    '4- 5 months': 5,
    '5- 6 months': 6,
    '6- 7 months': 7,
    '7- 8 months': 8,
    '8- 9 months': 9,
    '9-10 months': 10,
    '10-11 months': 11,
    '11-12 months': 12
}

# Apply the mapping to the DSFS column to convert it to numeric
claims_df['DSFS'] = claims_df['DSFS'].map(dsfs_mapping)

# Fill the missing DSFS values using the median
claims_df['DSFS'].fillna(claims_df['DSFS'].median(), inplace=True)

# Imputation for other columns (unchanged)
claims_df['ProviderID'].fillna(claims_df['ProviderID'].mode()[0], inplace=True)
claims_df['Vendor'].fillna(claims_df['Vendor'].mode()[0], inplace=True)
claims_df['PCP'].fillna(claims_df['PCP'].mode()[0], inplace=True)
claims_df['Specialty'].fillna(claims_df['Specialty'].mode()[0], inplace=True)
claims_df['PlaceSvc'].fillna(claims_df['PlaceSvc'].mode()[0], inplace=True)
claims_df['PrimaryConditionGroup'].fillna(claims_df['PrimaryConditionGroup'].mode()[0], inplace=True)
claims_df['ProcedureGroup'].fillna(claims_df['ProcedureGroup'].mode()[0], inplace=True)

# Dropping 'LengthOfStay' due to high missing values
claims_df.drop(columns=['LengthOfStay'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  claims_df['DSFS'].fillna(claims_df['DSFS'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  claims_df['ProviderID'].fillna(claims_df['ProviderID'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because t

In [7]:
# Merge Claims and Members datasets on 'MemberID'
merged_df = pd.merge(claims_df, members_df, on='MemberID', how='left')

# Merge DaysInHospital_Y2 and DaysInHospital_Y3
merged_df = pd.merge(merged_df, days_in_hospital_y2_df, on='MemberID', how='left')
merged_df = pd.merge(merged_df, days_in_hospital_y3_df, on='MemberID', how='left')

# Merge DrugCount and LabCount on 'MemberID' and 'Year'
merged_df = pd.merge(merged_df, drug_count_df, on=['MemberID', 'Year'], how='left')
merged_df = pd.merge(merged_df, lab_count_df, on=['MemberID', 'Year'], how='left')

# Merge the Target dataset
merged_df = pd.merge(merged_df, target_df[['MemberID', 'DaysInHospital']], on='MemberID', how='left')

# Check the final merged dataframe
print("Merged dataset shape:", merged_df.shape)

Merged dataset shape: (58297637, 24)


In [8]:
print(merged_df.head())

   MemberID  ProviderID    Vendor      PCP Year Specialty PlaceSvc PayDelay  \
0  42286978   8013252.0  172193.0  37796.0   Y1   Surgery   Office       28   
1  42286978   8013252.0  172193.0  37796.0   Y1   Surgery   Office       28   
2  42286978   8013252.0  172193.0  37796.0   Y1   Surgery   Office       28   
3  42286978   8013252.0  172193.0  37796.0   Y1   Surgery   Office       28   
4  42286978   8013252.0  172193.0  37796.0   Y1   Surgery   Office       28   

   DSFS_x PrimaryConditionGroup  ... Sex ClaimsTruncated_x  DaysInHospital_x  \
0     9.0               NEUMENT  ...   F               0.0               2.0   
1     9.0               NEUMENT  ...   F               0.0               2.0   
2     9.0               NEUMENT  ...   F               0.0               2.0   
3     9.0               NEUMENT  ...   F               0.0               2.0   
4     9.0               NEUMENT  ...   F               0.0               2.0   

  ClaimsTruncated_y DaysInHospital_y        

In [9]:
print(merged_df.columns)

Index(['MemberID', 'ProviderID', 'Vendor', 'PCP', 'Year', 'Specialty',
       'PlaceSvc', 'PayDelay', 'DSFS_x', 'PrimaryConditionGroup',
       'CharlsonIndex', 'ProcedureGroup', 'SupLOS', 'AgeAtFirstClaim', 'Sex',
       'ClaimsTruncated_x', 'DaysInHospital_x', 'ClaimsTruncated_y',
       'DaysInHospital_y', 'DSFS_y', 'DrugCount', 'DSFS', 'LabCount',
       'DaysInHospital'],
      dtype='object')


In [10]:
# Separate the features into different groups
demographic_features = ['AgeAtFirstClaim', 'Sex']
medical_history_features = ['CharlsonIndex', 'PrimaryConditionGroup', 'ProcedureGroup']
treatment_features = ['DrugCount', 'LabCount']

# Combine all features
features = demographic_features + medical_history_features + treatment_features

In [13]:
target = 'DaysInHospital'