In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.linear_model import LinearRegression, LogisticRegression
import duckdb
import warnings 
# NOTE(review): with no category/module argument this silences every warning
# for the rest of the session, including pandas' chained-assignment and
# deprecation warnings. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [96]:
# Load the raw dataset. The path is relative, so the CSV must sit in the
# notebook's current working directory.
diabetes_df = pd.read_csv("diabetes_1.csv")

In [97]:
# NOTE: only the last expression of a cell is echoed, so this result is not
# displayed here; evaluating it still fails fast if 'State' is missing.
diabetes_df['State'].unique()

# Region name -> list of member states. States that appear in none of these
# lists are labelled 'Unknown' by the region-assignment step.
state_to_region = {
    'Northeast': ['New York', 'Massachusetts', 'Pennsylvania', 'Maryland'],
    'South': [
        'Florida', 'Texas', 'Tennessee', 'Kentucky', 'North Carolina',
        'Virginia', 'Alabama', 'Louisiana', 'Georgia', 'Mississippi',
        'Missouri',
    ],
    'Midwest': ['Illinois', 'Ohio', 'Michigan', 'Wisconsin'],
    'West': ['Colorado', 'Washington', 'California', 'Nevada', 'Arizona'],
}

# Empty placeholder column (all NaN after index alignment) that will hold the
# region labels. The dtype is spelled out because a bare pd.Series() is
# version-dependent: float64 plus a DeprecationWarning on pandas 1.x, object
# on pandas 2.x. The column stores strings, so object is the right choice.
diabetes_df['Region'] = pd.Series(dtype='object')


In [98]:
# Flatten the region -> states table into a state -> region lookup so the
# whole column can be labelled in a single vectorised pass.
region_by_state = {
    state: region
    for region, states in state_to_region.items()
    for state in states
}

# Assign the whole column at once instead of writing row by row through
# `diabetes_df['Region'][i] = ...`. That chained-indexing form is not
# guaranteed by pandas to write back to the frame (and never does under
# Copy-on-Write), and `diabetes_df['State'][i]` is a label lookup that only
# works when the index is exactly 0..n-1.
# States missing from the lookup (including NaN) fall back to 'Unknown'.
diabetes_df['Region'] = diabetes_df['State'].map(region_by_state).fillna('Unknown')
        
        
     
         
        
        
        

In [99]:
# Inspect the rows whose 'State' is recorded as the literal string 'Unknown'.
diabetes_df.loc[diabetes_df['State'].eq('Unknown')]


Unnamed: 0,ID,Year,Age,Race,Sex,State,Zip_Code,MSA,Enrollment_Type_Categorized,Enrollment_Months,...,sitagliptin_and_metformin_hydrochloride,Avg_ADI,Max_ADI,Min_ADI,SVI1,SVI3,SVI4,SVI,MDI,Region
12403,502017,2017,45,Hispanic,Male,Unknown,48911,Unknown,20,12.0,...,0,75.333333,96.0,38.0,0.281248,0.281334,0.508021,0.375466,,Unknown
12404,502019,2019,47,Hispanic,Male,Unknown,48911,Unknown,20,12.0,...,0,75.333333,96.0,38.0,0.281248,0.281334,0.508021,0.375466,,Unknown
12405,502020,2020,48,Hispanic,Male,Unknown,48911,Unknown,20,12.0,...,0,75.333333,96.0,38.0,0.281248,0.281334,0.508021,0.375466,,Unknown
12406,2552017,2017,76,White,Female,Unknown,48506,Unknown,10,12.0,...,0,82.812500,100.0,63.0,-14.360588,-14.498202,-14.442701,-14.387013,,Unknown
12407,2552019,2019,78,White,Female,Unknown,48506,Unknown,10,12.0,...,0,82.812500,100.0,63.0,-14.360588,-14.498202,-14.442701,-14.387013,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20283,58722017,2017,64,Asian,Female,Unknown,48911,Unknown,20,12.0,...,0,75.333333,96.0,38.0,0.281248,0.281334,0.508021,0.375466,,Unknown
20284,80202019,2019,68,White,Female,Unknown,48504,Unknown,10,12.0,...,0,88.000000,100.0,70.0,-14.360588,-14.498202,-14.442701,-14.387013,,Unknown
20285,93402018,2018,74,White,Male,Unknown,49705,Unknown,10,12.0,...,0,61.200000,85.0,49.0,0.520989,0.158867,0.426078,0.414422,,Unknown
20338,29902017,2017,71,White,Female,Unknown,48917,Unknown,10,12.0,...,0,61.482759,100.0,26.0,-35.236656,-23.311688,-94.646452,-94.703669,,Unknown


In [100]:
# One-hot encode the two categorical columns as integer 0/1 indicator frames
# (one column per distinct category value).
region_dummies, race_dummies = (
    pd.get_dummies(diabetes_df[column], dtype=int)
    for column in ('Region', 'Race')
)


In [101]:
# Numeric encoding applied to the 'Sex' column.
gender_dict = {'Male': 0, 'Female': 1}

# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded 0/1 column would blank it out.
# Only encode while the column still holds at least one original string label;
# on that first pass the result is exactly what an unconditional map produces.
if diabetes_df['Sex'].isin(list(gender_dict)).any():
    diabetes_df['Sex'] = diabetes_df['Sex'].map(gender_dict)


In [102]:
# Not echoed (only a cell's last expression is displayed); evaluating it still
# raises KeyError if the 'Sex' column is missing.
diabetes_df['Sex']
# Leave the frame as the cell's final expression so Jupyter renders it as a
# rich table rather than print()'s plain-text dump.
race_dummies.head()

   Asian  Black  Hispanic  North American Native  Other Race  UnKnown  White
0      0      0         0                      0           0        0      1
1      0      1         0                      0           0        0      0
2      0      0         0                      0           0        0      1
3      0      1         0                      0           0        0      0
4      0      1         0                      0           0        0      0


In [103]:
# Element-wise sum of the '#_ER' and '#_ER_Admissions' columns, stored as a
# new 'ER_Admissions' column on the frame.
er = diabetes_df['#_ER'].add(diabetes_df['#_ER_Admissions'])
diabetes_df['ER_Admissions'] = er

# Preview the first rows, which now include the new column.
diabetes_df.head()

Unnamed: 0,ID,Year,Age,Race,Sex,State,Zip_Code,MSA,Enrollment_Type_Categorized,Enrollment_Months,...,Avg_ADI,Max_ADI,Min_ADI,SVI1,SVI3,SVI4,SVI,MDI,Region,ER_Admissions
0,68802021,2021,68,White,0,Michigan,48176,11460,10,12.0,...,30.157895,67.0,8.0,-27.707059,-18.227551,-27.590337,-27.727856,14.16,Midwest,0
1,3422021,2021,60,Black,0,Michigan,49224,12980,20,12.0,...,80.2,100.0,52.0,-21.266317,0.272289,-21.27153,-21.282509,15.39,Midwest,0
2,7892020,2020,46,White,0,Michigan,49016,12980,20,12.0,...,82.545455,97.0,63.0,0.529493,0.376685,0.616187,0.55524,15.39,Midwest,0
3,42792020,2020,84,Black,1,Michigan,49224,12980,10,12.0,...,80.2,100.0,52.0,-21.266317,0.272289,-21.27153,-21.282509,14.16,Midwest,0
4,42792018,2018,82,Black,1,Michigan,49224,12980,10,12.0,...,80.2,100.0,52.0,-21.266317,0.272289,-21.27153,-21.282509,14.16,Midwest,0


In [104]:
# Append the race and region indicator columns to the main frame. Rows are
# matched on the index; the new columns are added on the right in this order.
frames_to_join = [diabetes_df, race_dummies, region_dummies]
diabetes_with_dummies = pd.concat(frames_to_join, axis='columns')
diabetes_with_dummies

Unnamed: 0,ID,Year,Age,Race,Sex,State,Zip_Code,MSA,Enrollment_Type_Categorized,Enrollment_Months,...,Hispanic,North American Native,Other Race,UnKnown,White,Midwest,Northeast,South,Unknown,West
0,68802021,2021,68,White,0,Michigan,48176,11460,10,12.0,...,0,0,0,0,1,1,0,0,0,0
1,3422021,2021,60,Black,0,Michigan,49224,12980,20,12.0,...,0,0,0,0,0,1,0,0,0,0
2,7892020,2020,46,White,0,Michigan,49016,12980,20,12.0,...,0,0,0,0,1,1,0,0,0,0
3,42792020,2020,84,Black,1,Michigan,49224,12980,10,12.0,...,0,0,0,0,0,1,0,0,0,0
4,42792018,2018,82,Black,1,Michigan,49224,12980,10,12.0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20406,13712019,2019,57,White,0,Michigan,48035,19820,20,12.0,...,0,0,0,0,1,1,0,0,0,0
20407,64542020,2020,77,Black,1,Michigan,48439,22420,10,12.0,...,0,0,0,0,0,1,0,0,0,0
20408,20942017,2017,70,White,0,Michigan,48708,13020,10,12.0,...,0,0,0,0,1,1,0,0,0,0
20409,54602021,2021,70,White,1,Michigan,48838,99022,10,12.0,...,0,0,0,0,1,1,0,0,0,0


In [105]:
# List every column label of the combined frame, e.g. to check the names of
# the dummy columns that were appended to it.
diabetes_with_dummies.columns

Index(['ID', 'Year', 'Age', 'Race', 'Sex', 'State', 'Zip_Code', 'MSA',
       'Enrollment_Type_Categorized', 'Enrollment_Months', 'HCC_Score',
       'Avg._LOS', 'Diagnosis', '#_Total_Claims', '#_Hospital_OP',
       '#_All_Physician_OP', '#_Inpatient', '#_Short_Term_Stay_Hospital',
       '#_Long_Term_Stay_Hospital', '#_Rehabilitation_Hospital',
       '#_Psychiatric_Hospital', '#_Readmissions', '%_Readmissions', '#_ER',
       '#_ER_Admissions', '#_SNF', '#_Non_Swing_Bed_SNF_Claim',
       '#_Swing_Bed_SNF_Claim', '#_Home_Health', '#_Hospice', '#_Labs',
       '#_Part_B_Labs', '#_Imaging', '#_Part_B_Imaging', '#_Part_B_E&M',
       '#_Part_B_Drugs', '#_Part_B_Ambulance', '#_Dialysis',
       '#_Part-B_Dialysis', '#_Rx_Claims', '#_DME', '#_Miscellaneous',
       'TotalCost_Y_Actual', 'TotalCost_Y_Expected', 'dapagliflozin',
       'exenatide', 'glimepiride', 'glyburide_and_metformin_hydrochloride',
       'insulin', 'metformin_hydrochloride', 'pioglitazone',
       'sitagliptin_and_me

In [106]:
# Give the dummy columns identifier-friendly, unambiguous names. 'UnKnown' and
# 'Unknown' differ only by letter case, so each gets an explicit suffix.
# rename() ignores keys that are absent, so re-running this cell is harmless.
dummy_column_names = {
    'UnKnown': 'Unknown_race',
    'Other Race': 'Other_race',
    'North American Native': 'North_American_Native',
    'Unknown': 'Unknown_region',
}
diabetes_with_dummies = diabetes_with_dummies.rename(columns=dummy_column_names)

# Final expression -> Jupyter renders a rich, truncated table instead of the
# wrapped plain-text output that print() produces for a wide frame.
diabetes_with_dummies

             ID  Year  Age   Race  Sex     State Zip_Code    MSA  \
0      68802021  2021   68  White    0  Michigan    48176  11460   
1       3422021  2021   60  Black    0  Michigan    49224  12980   
2       7892020  2020   46  White    0  Michigan    49016  12980   
3      42792020  2020   84  Black    1  Michigan    49224  12980   
4      42792018  2018   82  Black    1  Michigan    49224  12980   
...         ...   ...  ...    ...  ...       ...      ...    ...   
20406  13712019  2019   57  White    0  Michigan    48035  19820   
20407  64542020  2020   77  Black    1  Michigan    48439  22420   
20408  20942017  2017   70  White    0  Michigan    48708  13020   
20409  54602021  2021   70  White    1  Michigan    48838  99022   
20410  77822019  2019   47  White    1  Michigan    48340  19820   

       Enrollment_Type_Categorized  Enrollment_Months  ...  Hispanic  \
0                               10               12.0  ...         0   
1                               20     