https://archive.ics.uci.edu/dataset/880/support2

In [None]:
# !pip install ucimlrepo

In [1]:
import pandas as pd 

In [2]:
from ucimlrepo import fetch_ucirepo

In [3]:
support2 = fetch_ucirepo(id=880)

In [4]:
support2

{'data': {'ids':         id
  0        1
  1        2
  2        3
  3        4
  4        5
  ...    ...
  9100  9101
  9101  9102
  9102  9103
  9103  9104
  9104  9105
  
  [9105 rows x 1 columns],
  'features':            age     sex            dzgroup             dzclass  num.co   edu  \
  0     62.84998    male        Lung Cancer              Cancer       0  11.0   
  1     60.33899  female          Cirrhosis  COPD/CHF/Cirrhosis       2  12.0   
  2     52.74698  female          Cirrhosis  COPD/CHF/Cirrhosis       2  12.0   
  3     42.38498  female        Lung Cancer              Cancer       2  11.0   
  4     79.88495  female  ARF/MOSF w/Sepsis            ARF/MOSF       1   NaN   
  ...        ...     ...                ...                 ...     ...   ...   
  9100  66.07300    male  ARF/MOSF w/Sepsis            ARF/MOSF       1   8.0   
  9101  55.15399  female               Coma                Coma       1  11.0   
  9102  70.38196    male  ARF/MOSF w/Sepsis            ARF

In [5]:
support2.metadata

{'uci_id': 880,
 'name': 'SUPPORT2',
 'repository_url': 'https://archive.ics.uci.edu/dataset/880/support2',
 'data_url': 'https://archive.ics.uci.edu/static/public/880/data.csv',
 'abstract': "This dataset comprises 9105 individual critically ill patients across 5 United States medical centers, accessioned throughout 1989-1991 and 1992-1994.\nEach row concerns hospitalized patient records who met the inclusion and exclusion criteria for nine disease categories: acute respiratory failure, chronic obstructive pulmonary disease, congestive heart failure, liver disease, coma, colon cancer, lung cancer, multiple organ system failure with malignancy, and multiple organ system failure with sepsis. The goal is to determine these patients' 2- and 6-month survival rates based on several physiologic, demographics, and disease severity information. \nIt is an important problem because it addresses the growing national concern over patients' loss of control near the end of life. It enables earlier 

In [6]:
support2.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,id,ID,Integer,,,,no
1,age,Feature,Continuous,Age,Age of the patients in years,years,no
2,death,Target,Continuous,,Death at any time up to National Death Index (...,,no
3,sex,Feature,Categorical,Sex,Gender of the patient. Listed values are {male...,,no
4,hospdead,Target,Binary,,Death in hospital,,no
5,slos,Other,Continuous,,Days from Study Entry to Discharge,,no
6,d.time,Other,Continuous,,Days of follow-up,,no
7,dzgroup,Feature,Categorical,,The patient's disease sub category amogst ARF/...,,no
8,dzclass,Feature,Categorical,,"The patient's disease category amongst ""ARF/MO...",,no
9,num.co,Feature,Continuous,,The number of simultaneous diseases (or comorb...,,no


In [8]:
features_df = pd.DataFrame(support2['data']['features'])  # This will be the feature columns
targets_df = pd.DataFrame(support2['data']['targets'])    # This will be the target columns


In [10]:
features_df

Unnamed: 0,age,sex,dzgroup,dzclass,num.co,edu,income,scoma,charges,totcst,...,bili,crea,sod,ph,glucose,bun,urine,adlp,adls,adlsc
0,62.84998,male,Lung Cancer,Cancer,0,11.0,$11-$25k,0.0,9715.0,,...,0.199982,1.199951,141.0,7.459961,,,,7.0,7.0,7.000000
1,60.33899,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,$11-$25k,44.0,34496.0,,...,,5.500000,132.0,7.250000,,,,,1.0,1.000000
2,52.74698,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,under $11k,0.0,41094.0,,...,2.199707,2.000000,134.0,7.459961,,,,1.0,0.0,0.000000
3,42.38498,female,Lung Cancer,Cancer,2,11.0,under $11k,0.0,3075.0,,...,,0.799927,139.0,,,,,0.0,0.0,0.000000
4,79.88495,female,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,26.0,50127.0,,...,,0.799927,143.0,7.509766,,,,,2.0,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,66.07300,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,8.0,,0.0,52870.0,34329.3125,...,0.399963,1.099854,131.0,7.459961,188.0,21.0,,,0.0,0.000000
9101,55.15399,female,Coma,Coma,1,11.0,,41.0,35377.0,23558.5000,...,,5.899414,135.0,7.289062,190.0,49.0,0.0,,0.0,0.000000
9102,70.38196,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,0.0,46564.0,31409.0156,...,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,,,2.525391
9103,47.01999,male,MOSF w/Malig,ARF/MOSF,1,13.0,,0.0,58439.0,,...,0.399963,3.500000,135.0,7.469727,246.0,55.0,,,0.0,0.000000


In [9]:
targets_df

Unnamed: 0,death,hospdead,sfdm2
0,0,0,
1,1,1,<2 mo. follow-up
2,1,0,<2 mo. follow-up
3,1,0,no(M2 and SIP pres)
4,0,0,no(M2 and SIP pres)
...,...,...,...
9100,0,0,
9101,0,0,
9102,0,0,
9103,1,1,<2 mo. follow-up


In [12]:
# Merging features and targets DataFrame
df = pd.concat([targets_df, features_df], axis=1)
df

Unnamed: 0,death,hospdead,sfdm2,age,sex,dzgroup,dzclass,num.co,edu,income,...,bili,crea,sod,ph,glucose,bun,urine,adlp,adls,adlsc
0,0,0,,62.84998,male,Lung Cancer,Cancer,0,11.0,$11-$25k,...,0.199982,1.199951,141.0,7.459961,,,,7.0,7.0,7.000000
1,1,1,<2 mo. follow-up,60.33899,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,$11-$25k,...,,5.500000,132.0,7.250000,,,,,1.0,1.000000
2,1,0,<2 mo. follow-up,52.74698,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,under $11k,...,2.199707,2.000000,134.0,7.459961,,,,1.0,0.0,0.000000
3,1,0,no(M2 and SIP pres),42.38498,female,Lung Cancer,Cancer,2,11.0,under $11k,...,,0.799927,139.0,,,,,0.0,0.0,0.000000
4,0,0,no(M2 and SIP pres),79.88495,female,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,...,,0.799927,143.0,7.509766,,,,,2.0,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,0,0,,66.07300,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,8.0,,...,0.399963,1.099854,131.0,7.459961,188.0,21.0,,,0.0,0.000000
9101,0,0,,55.15399,female,Coma,Coma,1,11.0,,...,,5.899414,135.0,7.289062,190.0,49.0,0.0,,0.0,0.000000
9102,0,0,,70.38196,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,...,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,,,2.525391
9103,1,1,<2 mo. follow-up,47.01999,male,MOSF w/Malig,ARF/MOSF,1,13.0,,...,0.399963,3.500000,135.0,7.469727,246.0,55.0,,,0.0,0.000000


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9105 entries, 0 to 9104
Data columns (total 45 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   death     9105 non-null   int64  
 1   hospdead  9105 non-null   int64  
 2   sfdm2     7705 non-null   object 
 3   age       9105 non-null   float64
 4   sex       9105 non-null   object 
 5   dzgroup   9105 non-null   object 
 6   dzclass   9105 non-null   object 
 7   num.co    9105 non-null   int64  
 8   edu       7471 non-null   float64
 9   income    6123 non-null   object 
 10  scoma     9104 non-null   float64
 11  charges   8933 non-null   float64
 12  totcst    8217 non-null   float64
 13  totmcst   5630 non-null   float64
 14  avtisst   9023 non-null   float64
 15  race      9063 non-null   object 
 16  sps       9104 non-null   float64
 17  aps       9104 non-null   float64
 18  surv2m    9104 non-null   float64
 19  surv6m    9104 non-null   float64
 20  hday      9105 non-null   int6

In [14]:
df.describe()

Unnamed: 0,death,hospdead,age,num.co,edu,scoma,charges,totcst,totmcst,avtisst,...,bili,crea,sod,ph,glucose,bun,urine,adlp,adls,adlsc
count,9105.0,9105.0,9105.0,9105.0,7471.0,9104.0,8933.0,8217.0,5630.0,9023.0,...,6504.0,9038.0,9104.0,6821.0,4605.0,4753.0,4243.0,3464.0,6238.0,9105.0
mean,0.681054,0.259198,62.650823,1.868644,11.747691,12.058546,59995.79,30825.867768,28828.877838,22.610928,...,2.554463,1.770961,137.568541,7.415364,159.873398,32.349463,2191.546047,1.15791,1.637384,1.888272
std,0.466094,0.438219,15.59371,1.344409,3.447743,24.636694,102648.8,45780.820986,43604.261932,13.233248,...,5.318448,1.686041,6.029326,0.080563,88.391541,26.792288,1455.245777,1.739672,2.231358,2.003763
min,0.0,0.0,18.04199,0.0,0.0,0.0,1169.0,0.0,-102.71997,1.0,...,0.099991,0.099991,110.0,6.829102,0.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,52.797,1.0,10.0,0.0,9740.0,5929.5664,5177.4043,12.0,...,0.5,0.899902,134.0,7.379883,103.0,14.0,1165.5,0.0,0.0,0.0
50%,1.0,0.0,64.85699,2.0,12.0,0.0,25024.0,14452.7344,13223.5,19.5,...,0.899902,1.199951,137.0,7.419922,135.0,23.0,1968.0,0.0,1.0,1.0
75%,1.0,1.0,73.99896,3.0,14.0,9.0,64598.0,36087.9375,34223.6016,31.666656,...,1.899902,1.899902,141.0,7.469727,188.0,42.0,3000.0,2.0,3.0,3.0
max,1.0,1.0,101.84796,9.0,31.0,100.0,1435423.0,633212.0,710682.0,83.0,...,63.0,21.5,181.0,7.769531,1092.0,300.0,9000.0,7.0,7.0,7.073242


In [15]:
df.describe(include=object)

Unnamed: 0,sfdm2,sex,dzgroup,dzclass,income,race,ca,dnr
count,7705,9105,9105,9105,6123,9063,9105,9075
unique,5,2,8,4,4,5,3,3
top,<2 mo. follow-up,male,ARF/MOSF w/Sepsis,ARF/MOSF,under $11k,white,no,no dnr
freq,3123,5125,3515,4227,2855,7191,5995,5880


| Variable Name | Role     | Type       | Demographic | Description                                                                                                                                  | Units  | Missing Values |
|---------------|----------|------------|-------------|----------------------------------------------------------------------------------------------------------------------------------------------|--------|-----------------|
| id            | ID       | Integer    |             |                                                                                                                                                |        | no              |
| age           | Feature  | Continuous | Age         | Age of the patients in years                                                                                                                  | years  | no              |
| death         | Target   | Continuous |             | Death at any time up to National Death Index (NDI) data on 31 of December of 1994. Some patients are discharged before the end of the study. |        | no              |
| sex           | Feature  | Categorical| Sex         | Gender of the patient. Listed values are {male, female}.                                                                                      |        | no              |
| hospdead      | Target   | Binary     |             | Death in hospital                                                                                                                             |        | no              |
| slos          | Other    | Continuous |             | Days from Study Entry to Discharge                                                                                                            |        | no              |
| d.time        | Other    | Continuous |             | Days of follow-up                                                                                                                             |        | no              |
| dzgroup       | Feature  | Categorical|             | The patient's disease subcategory amongst ARF/MOSF w/Sepsis, CHF, COPD, Cirrhosis, Colon Cancer, Coma, Lung Cancer, MOSF w/Malig.            |        | no              |
| dzclass       | Feature  | Categorical|             | The patient's disease category amongst "ARF/MOSF", "COPD/CHF/Cirrhosis", "Cancer", "Coma".                                                   |        | no              |
| num.co        | Feature  | Continuous |             | The number of simultaneous diseases (or comorbidities) exhibited by the patient. Values are ordinal with higher values indicating worse condition and chances of survival. |        | no              |
| edu           | Feature  | Categorical| Education   | Years of education                                                                                                                           | years  | yes             |
| income        | Feature  | Categorical| Income      | Income of the patient. Listed values are {"$11-$25k", "$25-$50k", ">$50k", "under $11k"}.                                                     |        | yes             |
| scoma         | Feature  | Continuous |             | SUPPORT day 3 Coma Score based on Glasgow scale (predicted by a model).                                                                       |        | yes             |
| charges       | Feature  | Continuous |             | Hospital charges                                                                                                                             |        | yes             |
| totcst        | Feature  | Continuous |             | Total ratio of costs to charges (RCC) cost                                                                                                    |        | yes             |
| totmcst       | Feature  | Continuous |             | Total micro cost                                                                                                                             |        | yes             |
| avtisst       | Feature  | Continuous |             | Average TISS score, days 3-25, where Therapeutic Intervention Scoring System (TISS) is a method for calculating costs in the ICU and IMCU.    |        | yes             |
| race          | Feature  | Categorical| Race        | Race of the patient. Listed values are {asian, black, hispanic, missing, other, white}.                                                      |        | yes             |
| sps           | Feature  | Continuous |             | SUPPORT physiology score on day 3 (predicted by a model).                                                                                   |        | yes             |
| aps           | Feature  | Continuous |             | APACHE III day 3 physiology score (no coma, imp bun, uout for ph1)                                                                             |        | yes             |
| surv2m        | Feature  | Continuous |             | SUPPORT model 2-month survival estimate at day 3 (predicted by a model)                                                                       |        | yes             |
| surv6m        | Feature  | Continuous |             | SUPPORT model 6-month survival estimate at day 3 (predicted by a model)                                                                       |        | yes             |
| hday          | Feature  | Integer    |             | Day in hospital at which patient entered study.                                                                                                |        | no              |
| diabetes      | Feature  | Continuous |             | Whether the patient exhibits diabetes (Com 27-28, Dx 73) as a comorbidity (Y) or not (N).                                                      |        | no              |
| dementia      | Feature  | Continuous |             | Whether the patient exhibits dementia (Comorbidity 6) as a comorbidity (Y) or not (N).                                                        |        | no              |
| ca            | Feature  | Categorical|             | Whether the patient has cancer (yes), whether it has spread out (metastatic), or if it is healthy (no).                                        |        | no              |
| prg2m         | Feature  | Continuous |             | Physician’s 2-month survival estimate for patient.                                                                                             |        | yes             |
| prg6m         | Feature  | Categorical|             | Physician’s 6-month survival estimate for patient.                                                                                             |        | yes             |
| dnr           | Feature  | Categorical|             | Whether the patient has a do not resuscitate (DNR) order or not. Possible values are dnr after sadm, dnr before sadm, missing, no dnr.         |        | yes             |
| dnrday        | Feature  | Continuous |             | Day of DNR order (<0 if before study)                                                                                                          |        | yes             |
| meanbp        | Feature  | Continuous |             | Mean arterial blood pressure of the patient, measured at day 3.                                                                                |        | yes             |
| wblc          | Feature  | Continuous |             | Counts of white blood cells (in thousands) measured at day 3.                                                                                 |        | yes             |
| hrt           | Feature  | Continuous |             | Heart rate of the patient measured at day 3.                                                                                                  |        | yes             |
| resp          | Feature  | Continuous |             | Respiration rate of the patient measured at day 3.                                                                                             |        | yes             |
| temp          | Feature  | Continuous |             | Temperature in Celsius degrees measured at day 3.                                                                                             | Celsius| no              |
| pafi          | Feature  | Continuous |             | PaO2/FiO2 ratio measured at day 3, a clinical indicator of hypoxemia.                                                                          |        | yes             |
| alb           | Feature  | Continuous |             | Serum albumin levels measured at day 3.                                                                                                       |        | yes             |
| bili          | Feature  | Continuous |             | Bilirubin levels measured at day 3.                                                                                                           |        | yes             |
| crea          | Feature  | Continuous |             | Serum creatinine levels measured at day 3.                                                                                                    |        | yes             |
| sod           | Feature  | Continuous |             | Serum sodium concentration measured at day 3.                                                                                                 |        | yes             |
| ph            | Feature  | Continuous |             | Arterial blood pH, the pH of blood is usually between 7.35 and 7.45. Abnormal results may be due to lung, kidney, or metabolic diseases.       |        | yes             |
| glucose       | Feature  | Integer    |             | Glucose levels measured at day 3.                                                                                                             |        | yes             |
| bun           | Feature  | Integer    |             | Blood urea nitrogen levels measured at day 3.                                                                                                 |        | yes             |
| urine         | Feature  | Integer    |             | Urine output measured at day 3.                                                                                                               |        | yes             |
| adlp          | Feature  | Categorical|             | Index of Activities of Daily Living (ADL) of the patient, filled out by the patient. Higher values indicate more chance of survival.           |        | yes             |
| adls          | Feature  | Continuous |             | Index of Activities of Daily Living (ADL) of the patient, filled out by a surrogate (e.g. family member), higher values indicate better chances of survival. |        | yes             |
| sfdm2         | Target   | Categorical|             | Level of functional disability of the patient in a 1-5 scale. Values correlate with ADLS and ADLP columns.                                     |        | yes             |
| adlsc         | Feature  | Continuous |             | Imputed ADL calibrated to Surrogate.                                                                                                           |        | no              |


In [16]:
df.to_csv('front_service_1.csv', index=False)

## Patient Risk Classifier

## Survival Analysis

In [14]:
from lifelines import CoxPHFitter
import numpy as np

In [16]:
df['sex'] = df['sex'].map({'male': 1, 'female': 0})


In [17]:
# Prepare data for survival analysis
survival_data = df[['age', 'sex', 'dzclass', 'meanbp', 'sps', 'surv2m', 'surv6m']].dropna()
survival_data['event'] = (survival_data['surv2m'] == 0).astype(int)  # 0 if survived at 2 months

# Model with Cox Proportional Hazards
cph = CoxPHFitter()
cph.fit(survival_data, duration_col='surv2m', event_col='event')
cph.print_summary()

ValueError: could not convert string to float: 'ARF/MOSF'

## Cost Prediction