In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# data partition
from sklearn.model_selection import train_test_split

#filter methods
# spearman 
# chi-square
import scipy.stats as stats
from scipy.stats import chi2_contingency

#wrapper methods
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFE


# embedded methods
from sklearn.linear_model import LassoCV

import warnings
warnings.filterwarnings('ignore')

In [30]:
data_train = pd.read_csv('train.csv')
data_train.head()

Unnamed: 0,encounter_id,country,patient_id,race,gender,age,weight,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,...,secondary_diagnosis,additional_diagnosis,number_diagnoses,glucose_test_result,a1c_test_result,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication,readmitted_binary,readmitted_multiclass
0,533253,USA,70110,Caucasian,Female,[70-80),?,?,0,0,...,276,466,8,,,No,No,[],No,>30 days
1,426224,USA,29775006,AfricanAmerican,Male,[50-60),?,?,0,0,...,785,162,9,,,No,Yes,['insulin'],No,No
2,634063,USA,80729253,Caucasian,Female,[60-70),?,?,0,0,...,135,250,6,,,Ch,Yes,"['glimepiride', 'insulin']",No,No
3,890610,USA,2919042,AfricanAmerican,Male,[60-70),?,MC,0,0,...,562,455,5,,,No,No,[],No,No
4,654194,USA,84871971,Caucasian,Female,[70-80),?,HM,1,0,...,599,428,9,,,No,No,[],No,>30 days


In [31]:
data_validation = pd.read_csv('test.csv')
data_validation.head()

Unnamed: 0,encounter_id,country,patient_id,race,gender,age,weight,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,...,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,glucose_test_result,a1c_test_result,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication
0,499502,USA,103232799,Caucasian,Male,[80-90),?,HM,0,0,...,14,491.0,414.0,250,6,,,No,Yes,['metformin']
1,447319,USA,93395304,Caucasian,Male,[20-30),?,HM,0,0,...,7,250.13,70.0,794,7,>300,,No,No,[]
2,309126,USA,6281586,AfricanAmerican,Male,[30-40),?,?,0,0,...,12,786.0,250.6,536,6,,,No,Yes,['insulin']
3,181183,USA,67381308,Caucasian,Male,[50-60),?,BC,0,0,...,16,820.0,873.0,E884,9,,,Ch,Yes,"['metformin', 'glyburide', 'insulin']"
4,359339,USA,71670204,Caucasian,Male,[60-70),?,?,0,0,...,10,599.0,427.0,414,9,,,No,Yes,['metformin']


In [32]:
# data_train.set_index('encounter_id', inplace = True)
data_train.head()

Unnamed: 0,encounter_id,country,patient_id,race,gender,age,weight,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,...,secondary_diagnosis,additional_diagnosis,number_diagnoses,glucose_test_result,a1c_test_result,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication,readmitted_binary,readmitted_multiclass
0,533253,USA,70110,Caucasian,Female,[70-80),?,?,0,0,...,276,466,8,,,No,No,[],No,>30 days
1,426224,USA,29775006,AfricanAmerican,Male,[50-60),?,?,0,0,...,785,162,9,,,No,Yes,['insulin'],No,No
2,634063,USA,80729253,Caucasian,Female,[60-70),?,?,0,0,...,135,250,6,,,Ch,Yes,"['glimepiride', 'insulin']",No,No
3,890610,USA,2919042,AfricanAmerican,Male,[60-70),?,MC,0,0,...,562,455,5,,,No,No,[],No,No
4,654194,USA,84871971,Caucasian,Female,[70-80),?,HM,1,0,...,599,428,9,,,No,No,[],No,>30 days


# <u> Data Exploration </u>

In [33]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71236 entries, 0 to 71235
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   encounter_id                           71236 non-null  int64 
 1   country                                71236 non-null  object
 2   patient_id                             71236 non-null  int64 
 3   race                                   67682 non-null  object
 4   gender                                 71236 non-null  object
 5   age                                    67679 non-null  object
 6   weight                                 71236 non-null  object
 7   payer_code                             71236 non-null  object
 8   outpatient_visits_in_previous_year     71236 non-null  int64 
 9   emergency_visits_in_previous_year      71236 non-null  int64 
 10  inpatient_visits_in_previous_year      71236 non-null  int64 
 11  admission_type 

In [34]:
# Count the occurrences of "?" or NaN in each column
question_mark_nan_count = data_train.applymap(lambda x: x == '?' or pd.isna(x)).sum()

# Calculate the percentage of "?" or NaN values in each column
percentage_question_mark_nan = (question_mark_nan_count / len(data_train)) * 100

# Print the result
print(percentage_question_mark_nan)  # values in percent

# --> Almost no values for weight, glucose_test_result, alc_test_result

encounter_id                              0.000000
country                                   0.000000
patient_id                                0.000000
race                                      7.117188
gender                                    0.000000
age                                       4.993262
weight                                   96.847100
payer_code                               39.588130
outpatient_visits_in_previous_year        0.000000
emergency_visits_in_previous_year         0.000000
inpatient_visits_in_previous_year         0.000000
admission_type                            5.202426
medical_specialty                        49.022966
average_pulse_bpm                         0.000000
discharge_disposition                     3.635802
admission_source                          6.623056
length_of_stay_in_hospital                0.000000
number_lab_tests                          0.000000
non_lab_procedures                        0.000000
number_of_medications          

In [35]:
data_train.describe(include = "all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
encounter_id,71236.0,,,,548798.623716,259704.723154,100000.0,323118.5,548370.0,774058.5,999980.0
country,71236.0,1.0,USA,71236.0,,,,,,,
patient_id,71236.0,,,,54302279.330984,38795850.347332,135.0,23396510.25,45305631.0,87558374.25,189502619.0
race,67682.0,6.0,Caucasian,50693.0,,,,,,,
gender,71236.0,3.0,Female,38228.0,,,,,,,
age,67679.0,10.0,[70-80),17359.0,,,,,,,
weight,71236.0,10.0,?,68990.0,,,,,,,
payer_code,71236.0,18.0,?,28201.0,,,,,,,
outpatient_visits_in_previous_year,71236.0,,,,0.369588,1.287469,0.0,0.0,0.0,0.0,42.0
emergency_visits_in_previous_year,71236.0,,,,0.196249,0.910854,0.0,0.0,0.0,0.0,76.0


In [36]:
data_train['readmitted_binary'].value_counts()

readmitted_binary
No     63286
Yes     7950
Name: count, dtype: int64

In [37]:
data_train['readmitted_multiclass'].value_counts()

readmitted_multiclass
No          38405
>30 days    24881
<30 days     7950
Name: count, dtype: int64

# <u> Pre Processing by columns</u>

## Country

In [38]:
data_train = data_train.drop(columns=['country']) ## only 1 country

## Encounter ID

## Patient ID

## Race

In [39]:
# Replace '?' with NaN to treat them as missing values
data_train['race'].replace('?', np.nan, inplace=True)

# Calculate the frequency of each category in the 'race' variable
race_counts = data_train['race'].value_counts()

# Generate random samples based on the frequency of non-missing values
missing_indices = data_train[data_train['race'].isnull()].index
random_race_fill = np.random.choice(race_counts.index, size=len(missing_indices), p=race_counts / race_counts.sum())

# Fill missing values with random samples
data_train.loc[missing_indices, 'race'] = random_race_fill

In [40]:
data_train['race'].value_counts()

race
Caucasian          54605
AfricanAmerican    13638
Hispanic            1470
Other               1064
Asian                459
Name: count, dtype: int64

In [93]:
## Use One-Hot Encoding technique

# Use pd.get_dummies to one-hot encode the 'race' column
encoded_data_train = pd.get_dummies(data_train, columns=['race'], prefix='race')
data_train = encoded_data_train.replace({True: 1, False: 0})

data_train.head(8)


KeyError: "None of [Index(['race'], dtype='object')] are in the [columns]"

## Age

In [44]:
# Replace '?' with NaN to treat them as missing values
data_train['age'].replace('?', np.nan, inplace=True)

# Calculate the frequency of each category in the 'age' variable
age_counts = data_train['age'].value_counts()

# Generate random samples based on the frequency of non-missing values
missing_indices = data_train[data_train['age'].isnull()].index
random_age_fill = np.random.choice(age_counts.index, size=len(missing_indices), p=age_counts / age_counts.sum())

# Fill missing values with random samples
data_train.loc[missing_indices, 'age'] = random_age_fill

## Weight

In [45]:
data_train = data_train.drop(columns=['weight']) ## too many missing values as mentioned above
data_train

Unnamed: 0,encounter_id,patient_id,gender,age,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,admission_type,medical_specialty,...,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication,readmitted_binary,readmitted_multiclass,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other
0,533253,70110,Female,[70-80),?,0,0,2,Emergency,Family/GeneralPractice,...,No,No,[],No,>30 days,0,0,1,0,0
1,426224,29775006,Male,[50-60),?,0,0,0,Emergency,?,...,No,Yes,['insulin'],No,No,1,0,0,0,0
2,634063,80729253,Female,[60-70),?,0,0,1,,Family/GeneralPractice,...,Ch,Yes,"['glimepiride', 'insulin']",No,No,0,0,1,0,0
3,890610,2919042,Male,[60-70),MC,0,0,1,Emergency,InternalMedicine,...,No,No,[],No,No,1,0,0,0,0
4,654194,84871971,Female,[70-80),HM,1,0,0,Elective,?,...,No,No,[],No,>30 days,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71231,660160,24531381,Female,[80-90),MC,0,1,0,Elective,InternalMedicine,...,Ch,Yes,"['metformin', 'glyburide', 'insulin']",No,No,1,0,0,0,0
71232,826429,4663818,Female,[70-80),?,0,0,0,Urgent,?,...,Ch,Yes,"['metformin', 'glyburide']",No,No,1,0,0,0,0
71233,332030,23397147,Female,[60-70),?,0,2,2,,?,...,Ch,Yes,"['glyburide', 'insulin']",Yes,<30 days,0,0,1,0,0
71234,757560,52161750,Male,[60-70),BC,0,0,2,Emergency,Emergency/Trauma,...,Ch,Yes,"['glyburide', 'insulin']",No,No,0,0,1,0,0


## Payer Code

In [47]:
## No idea yet what is the best approach for this one. LabelEncoder or One-Got Encoder

## Outpatient_visits_in_previous_year

In [85]:
## we categorize in 
## 1: Once or more 
## 0: Never

data_train['outpatient_visits_in_previous_year'] = data_train['outpatient_visits_in_previous_year'].map(lambda x: 0 if x > 0 else 1)
print(data_train['outpatient_visits_in_previous_year'].value_counts())
data_train['outpatient_visits_in_previous_year'].value_counts().sum()

outpatient_visits_in_previous_year
0    59587
1    11649
Name: count, dtype: int64


71236

## Emergency_visits_in_previous_year

In [74]:
## since 89% have 0, we categorize again by 1: once or more and 0: Never
data_train['emergency_visits_in_previous_year'] = data_train['emergency_visits_in_previous_year'].map(lambda x: 0 if x > 0 else 1)

In [84]:
print(data_train['emergency_visits_in_previous_year'].value_counts())
data_train['emergency_visits_in_previous_year'].value_counts().sum()

emergency_visits_in_previous_year
1    63242
0     7994
Name: count, dtype: int64


71236

## Inpatient_visits_in_previous_year	

In [86]:
data_train['inpatient_visits_in_previous_year'] = data_train['inpatient_visits_in_previous_year'].map(lambda x: 0 if x > 0 else 1)
print(data_train['inpatient_visits_in_previous_year'].value_counts())
data_train['inpatient_visits_in_previous_year'].value_counts().sum()

inpatient_visits_in_previous_year
1    47231
0    24005
Name: count, dtype: int64


71236

## Admission_type

In [90]:
data_train['admission_type'].value_counts()

admission_type
Emergency        37742
Elective         13211
Urgent           13024
Not Available     3320
Not Mapped         214
Trauma Center       13
Newborn              6
Name: count, dtype: int64

In [95]:
# Replace '?' with NaN to treat them as missing values
data_train['admission_type'].replace('?', np.nan, inplace=True)

# Calculate the frequency of each category in the 'age' variable
admission_type_counts = data_train['admission_type'].value_counts()

# Generate random samples based on the frequency of non-missing values
missing_indices_admission = data_train[data_train['admission_type'].isnull()].index
random_admission_fill = np.random.choice(admission_type_counts.index, size=len(missing_indices_admission), p=admission_type_counts / admission_type_counts.sum())

# Fill missing values with random samples
data_train.loc[missing_indices_admission, 'admission_type'] = random_admission_fill

#
#
#
#
# tbc
#
#
#
#