In [3]:
# Import libraries

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.pipeline import Pipeline as imbPipeline
from scipy.stats import skew
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

warnings.filterwarnings('ignore')

In [None]:
# Read the case and control CSV files.
# The frames are bound to `case_data` / `control_data` because those are the
# names every later cell reads (unique-PATID check and the concatenation).
case_data = pd.read_csv('3yrs_Case_Data.csv')
control_data = pd.read_csv('3yrs_Control_Data.csv')

In [None]:
# Report how many distinct PATID values each cohort frame contains
for label, cohort in (("case_data", case_data), ("control_data", control_data)):
    print(f"Number of unique PATID values in {label}:", cohort['PATID'].nunique())


In [4]:
# Stack the case and control frames row-wise; ignore_index gives the combined
# frame a fresh 0..n-1 index instead of repeating each frame's own index.
data = pd.concat([case_data, control_data], ignore_index=True)

# Summarise the combined frame. info() writes its report itself and returns
# None, so it is called directly (wrapping it in print() emits a stray "None").
data.info()

# Preview the first few rows to check the result
print(data.head())

In [6]:
# Count the missing values in every column.
# (Computed once: only a cell's last expression is displayed, so a bare
# expression placed before the print would be evaluated and thrown away.)
print(data.isnull().sum(axis=0))

PATID               0
Sex                 0
Race                6
Marital_Status     13
Max_DBP            56
Max_SBP            40
Min_DBP            56
Min_SBP            40
Comorbidities     639
Smoking_Status    350
Encounter Type      0
Age_Grp             0
Target              0
dtype: int64


# Check for Symmetry and skewness to decide whether to use the mean, median or mode for imputing missing values in numerical data

In [7]:
# Show each column's dtype and non-null count for the combined frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123261 entries, 0 to 123260
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PATID           123261 non-null  int64  
 1   Sex             123261 non-null  object 
 2   Race            123255 non-null  object 
 3   Marital_Status  123248 non-null  object 
 4   Max_DBP         123205 non-null  float64
 5   Max_SBP         123221 non-null  float64
 6   Min_DBP         123205 non-null  float64
 7   Min_SBP         123221 non-null  float64
 8   Comorbidities   122622 non-null  object 
 9   Smoking_Status  122911 non-null  object 
 10  Encounter Type  123261 non-null  object 
 11  Age_Grp         123261 non-null  object 
 12  Target          123261 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 12.2+ MB


In [8]:
# Missing values per column, as a count and as a percentage of all rows.
# The variable name is historical: the imputation cells come later in the notebook.
missing_values_after_imputation = data.isna().sum()
percentage_missing = missing_values_after_imputation.div(len(data)).mul(100)

# Combine both views into one table, indexed by column name
missing_data_summary = missing_values_after_imputation.to_frame('Missing Values')
missing_data_summary['Percentage Missing'] = percentage_missing

print(missing_data_summary)

                Missing Values  Percentage Missing
PATID                        0            0.000000
Sex                          0            0.000000
Race                         6            0.004868
Marital_Status              13            0.010547
Max_DBP                     56            0.045432
Max_SBP                     40            0.032451
Min_DBP                     56            0.045432
Min_SBP                     40            0.032451
Comorbidities              639            0.518412
Smoking_Status             350            0.283950
Encounter Type               0            0.000000
Age_Grp                      0            0.000000
Target                       0            0.000000


In [5]:
# Numerical blood-pressure columns whose distribution is inspected
numerical_columns = ['Max_DBP', 'Max_SBP', 'Min_DBP', 'Min_SBP']

# An absolute skewness below this value is treated as "fairly symmetric"
SKEW_SYMMETRY_THRESHOLD = 0.5

for column in numerical_columns:
    # Missing readings are excluded from both the statistic and the plots
    observed = data[column].dropna()
    column_skewness = skew(observed)

    # One figure per column: histogram with KDE on the left, boxplot on the right
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 6))

    sns.histplot(observed, kde=True, ax=ax_hist)
    ax_hist.set_title(f'Distribution of {column} (Skewness: {column_skewness:.2f})')

    sns.boxplot(x=observed, ax=ax_box)
    ax_box.set_title(f'Boxplot of {column}')

    # Save the plot before showing it
    fig.savefig(f'{column}_distribution_boxplot.png')
    plt.show()
    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)

    # Decide whether to use mean or median
    if abs(column_skewness) < SKEW_SYMMETRY_THRESHOLD:
        print(f"{column}: Distribution is fairly symmetric. Impute missing values with mean.")
    else:
        print(f"{column}: Distribution is skewed or has outliers. Impute missing values with median.")


# Impute missing values for the numerical features

In [10]:
# Impute missing values with median for skewed distributions or columns with outliers.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on `data[col]` is chained assignment, which does not
# update `data` when pandas Copy-on-Write is active.
for column in ['Max_DBP', 'Max_SBP', 'Min_SBP', 'Min_DBP']:
    data[column] = data[column].fillna(data[column].median())

In [11]:
# Recount the missing values per column after the blood-pressure imputation
missing_values_after_imputation = data.isna().sum()
missing_values_after_imputation

PATID               0
Sex                 0
Race                6
Marital_Status     13
Max_DBP             0
Max_SBP             0
Min_DBP             0
Min_SBP             0
Comorbidities     639
Smoking_Status    350
Encounter Type      0
Age_Grp             0
Target              0
dtype: int64

# Mapping and Encoding Diastolic Blood Pressure (DBP) and Systolic Blood Pressure (SBP)

In [6]:
# Bucket the maximum SBP / DBP readings into three categories each.
# pd.cut uses right-closed intervals by default, so the SBP buckets are
# (-inf, 120], (120, 140], (140, inf) and the DBP buckets are
# (-inf, 80], (80, 90], (90, inf); a reading of exactly 140 (or 90) lands in
# the middle bucket.
unbounded = float('inf')
data['SBP_Category'] = pd.cut(
    data['Max_SBP'],
    bins=[-unbounded, 120, 140, unbounded],
    labels=['SBP <=120', 'SBP 120-140', 'SBP >=140'],
)
data['DBP_Category'] = pd.cut(
    data['Max_DBP'],
    bins=[-unbounded, 80, 90, unbounded],
    labels=['DBP <=80', 'DBP 80-90', 'DBP >=90'],
)

# One-hot encode both category columns
data = pd.get_dummies(data, columns=['SBP_Category', 'DBP_Category'])

# Give the indicator columns their final names, then discard the raw readings
bp_indicator_names = {
    'SBP_Category_SBP <=120': 'SBP below 120',
    'SBP_Category_SBP 120-140': 'SBP 120-140',
    'SBP_Category_SBP >=140': 'SBP above 140',
    'DBP_Category_DBP <=80': 'DBP below 80',
    'DBP_Category_DBP 80-90': 'DBP 80-90',
    'DBP_Category_DBP >=90': 'DBP above 90',
}
data = data.rename(columns=bp_indicator_names)
data = data.drop(columns=['Max_SBP', 'Min_SBP', 'Max_DBP', 'Min_DBP'])

# Display the resulting dataframe
print(data.head())

# Report how many rows fall into each bucket
print("Count for each SBP category:")
for bucket in ('SBP below 120', 'SBP 120-140', 'SBP above 140'):
    print(f"{bucket}: {data[bucket].sum()}")

print("\nCount for each DBP category:")
for bucket in ('DBP below 80', 'DBP 80-90', 'DBP above 90'):
    print(f"{bucket}: {data[bucket].sum()}")


In [13]:
# Missing values per column: absolute count and share of all rows (in percent)
missing_values = data.isna().sum()
percentage_missing = missing_values.div(len(data)).mul(100)

# One table with both views, indexed by column name
summary_columns = {
    'Missing Values': missing_values,
    'Percentage Missing': percentage_missing,
}
missing_data_summary = pd.DataFrame(summary_columns)

print(missing_data_summary)


                Missing Values  Percentage Missing
PATID                        0            0.000000
Sex                          0            0.000000
Race                         6            0.004868
Marital_Status              13            0.010547
Comorbidities              639            0.518412
Smoking_Status             350            0.283950
Encounter Type               0            0.000000
Age_Grp                      0            0.000000
Target                       0            0.000000
SBP below 120                0            0.000000
SBP 120-140                  0            0.000000
SBP above 140                0            0.000000
DBP below 80                 0            0.000000
DBP 80-90                    0            0.000000
DBP above 90                 0            0.000000


# Mapping and Encoding

# 1. Encoding 'Age_Grp' column

In [14]:
# Build one indicator column per age group, named 'Age_Grp_<group>'
age_group_dummies = pd.get_dummies(data['Age_Grp'], prefix='Age_Grp')

# Append the indicator columns to the right of the existing columns
data = pd.concat((data, age_group_dummies), axis='columns')

In [7]:
# Preview the frame with the new 'Age_Grp_*' indicator columns appended
data.head()

In [8]:
# Drop the original 'Age_Grp' column now that it has been one-hot encoded
data = data.drop(columns='Age_Grp')

# Display the DataFrame and its column summary
print(data.head())
data.info()

# 2. Imputation, Mapping and Encoding for 'Smoking_Status' column 

In [17]:
# Tally every smoking status before cleaning; dropna=False keeps NaN as its own row
initial_count = data['Smoking_Status'].value_counts(dropna=False)
print("Initial value counts for 'Smoking_Status' column:", initial_count, sep="\n")

Initial value counts for 'Smoking_Status' column:
Never smoker                                51640
Former smoker quit longer than 12 months    36030
Current every day smoker                    31315
Current some day smoker                      1976
Former smoker quit within 12 months          1475
NaN                                           350
Light tobacco smoker                          292
Heavy tobacco smoker                          121
Smoker ###                                     62
Name: Smoking_Status, dtype: int64


In [18]:
# Impute missing values (with most frequent value) in the 'Smoking_Status' column.
# The result is assigned back to the column: fillna(inplace=True) on
# `data[col]` is chained assignment and does not update `data` when pandas
# Copy-on-Write is active.
smoking_mode = data['Smoking_Status'].mode()[0]
data['Smoking_Status'] = data['Smoking_Status'].fillna(smoking_mode)

# Verify the changes after imputation
print("\nValue counts for 'Smoking_Status' column after imputation:")
print(data['Smoking_Status'].value_counts(dropna=False))


Value counts for 'Smoking_Status' column after imputation:
Never smoker                                51990
Former smoker quit longer than 12 months    36030
Current every day smoker                    31315
Current some day smoker                      1976
Former smoker quit within 12 months          1475
Light tobacco smoker                          292
Heavy tobacco smoker                          121
Smoker ###                                     62
Name: Smoking_Status, dtype: int64


# Define the Mapping Function of Smoking_Status

In [19]:
# Normalize the labels: remove every literal '#' and trim surrounding
# whitespace (e.g. 'Smoker ###' becomes 'Smoker').
smoking_labels = data['Smoking_Status'].str.replace('#', '', regex=False)
data['Smoking_Status'] = smoking_labels.str.strip()

In [20]:
# Binary encoding of smoking status: 0 for 'Never smoker', 1 for every other
# recognised label (current, former, light/heavy, or unspecified smoker).
_EVER_SMOKER_LABELS = (
    'Former smoker quit longer than 12 months',
    'Former smoker quit within 12 months',
    'Light tobacco smoker',
    'Current some day smoker',
    'Current every day smoker',
    'Smoker',
    'Heavy tobacco smoker',
)

smoking_status_mapping = {'Never smoker': 0}
smoking_status_mapping.update({label: 1 for label in _EVER_SMOKER_LABELS})

In [9]:
# Encode 'Smoking_Status' with the 0/1 mapping.
# Series.map turns every label missing from the mapping into NaN without any
# warning, so the result is validated before the column is overwritten. This
# also stops an accidental second run (when the column already holds 0/1)
# from silently replacing the whole column with NaN.
encoded_smoking = data['Smoking_Status'].map(smoking_status_mapping)
unmapped_rows = encoded_smoking.isna() & data['Smoking_Status'].notna()
if unmapped_rows.any():
    unmapped_labels = data.loc[unmapped_rows, 'Smoking_Status'].unique().tolist()
    raise ValueError(f"Unmapped 'Smoking_Status' values: {unmapped_labels}")
data['Smoking_Status'] = encoded_smoking

print(data)

In [22]:
# Count how many rows carry each encoded smoking value (0 / 1)
smoking_encoded = data['Smoking_Status']
encoded_counts = smoking_encoded.value_counts()

print(encoded_counts)

1    71271
0    51990
Name: Smoking_Status, dtype: int64


# 3. Mapping and Encoding 'Sex' column

In [23]:
# Inspect the 'Sex' column before encoding: distinct values, a short preview,
# and the frequency of each value (NaN included)
sex_column = data['Sex']
unique_Sex_Status = sex_column.unique()
print(unique_Sex_Status)

print(sex_column.to_frame().head())
print(sex_column.value_counts(dropna=False))

['F' 'M']
  Sex
0   F
1   F
2   F
3   M
4   F
F    66205
M    57056
Name: Sex, dtype: int64


In [24]:
# One-hot encode 'Sex' into 'Sex_<value>' indicator columns; the source column
# is replaced, and no indicator is created for NaN (dummy_na defaults to False)
data = pd.get_dummies(data, columns=['Sex'])

In [25]:
# Verify the one-hot encoded DataFrame.
# info() prints its report itself and returns None, so it is called directly
# (wrapping it in print() emits a stray "None").
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123261 entries, 0 to 123260
Data columns (total 20 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   PATID           123261 non-null  int64 
 1   Race            123255 non-null  object
 2   Marital_Status  123248 non-null  object
 3   Comorbidities   122622 non-null  object
 4   Smoking_Status  123261 non-null  int64 
 5   Encounter Type  123261 non-null  object
 6   Target          123261 non-null  int64 
 7   SBP below 120   123261 non-null  uint8 
 8   SBP 120-140     123261 non-null  uint8 
 9   SBP above 140   123261 non-null  uint8 
 10  DBP below 80    123261 non-null  uint8 
 11  DBP 80-90       123261 non-null  uint8 
 12  DBP above 90    123261 non-null  uint8 
 13  Age_Grp_50-60   123261 non-null  uint8 
 14  Age_Grp_60-70   123261 non-null  uint8 
 15  Age_Grp_70-80   123261 non-null  uint8 
 16  Age_Grp_80-90   123261 non-null  uint8 
 17  Age_Grp_90-100  123261 non-nu

# 4. Imputation, Mapping and Encoding for 'Race' column

In [26]:
# Frequency of every value in the 'Race' column; dropna=False keeps NaN as its own row
print(data['Race'].value_counts(dropna=False)) 

WHITE                                      112900
BLACK OR AFRICAN AMERICAN                    7436
SOME OTHER RACE                              1125
ASIAN                                        1076
Unknown                                       458
AMERICAN INDIAN OR ALASKAN NATIVE             211
NATIVE HAWAIIAN OR OTHER PACIFIC ISLAND        49
NaN                                             6
Name: Race, dtype: int64


In [27]:
# Treat the literal label 'Unknown' as missing so it is imputed with the rest
data['Race'] = data['Race'].replace({'Unknown': np.nan})

# Verify the changes
print(
    "Value counts for 'Race' column after replacing 'Unknown' with NaN:",
    data['Race'].value_counts(dropna=False),
    sep="\n",
)

Value counts for 'Race' column after replacing 'Unknown' with NaN:
WHITE                                      112900
BLACK OR AFRICAN AMERICAN                    7436
SOME OTHER RACE                              1125
ASIAN                                        1076
NaN                                           464
AMERICAN INDIAN OR ALASKAN NATIVE             211
NATIVE HAWAIIAN OR OTHER PACIFIC ISLAND        49
Name: Race, dtype: int64


In [28]:
# Missing values per column: absolute count and percentage of all rows
missing_values = data.isna().sum()
percentage_missing = missing_values.div(len(data)).mul(100)

# Place the two Series side by side, labelled by the given keys
missing_data_summary = pd.concat(
    [missing_values, percentage_missing],
    axis=1,
    keys=['Missing Values', 'Percentage Missing'],
)

print(missing_data_summary)

                Missing Values  Percentage Missing
PATID                        0            0.000000
Race                       464            0.376437
Marital_Status              13            0.010547
Comorbidities              639            0.518412
Smoking_Status               0            0.000000
Encounter Type               0            0.000000
Target                       0            0.000000
SBP below 120                0            0.000000
SBP 120-140                  0            0.000000
SBP above 140                0            0.000000
DBP below 80                 0            0.000000
DBP 80-90                    0            0.000000
DBP above 90                 0            0.000000
Age_Grp_50-60                0            0.000000
Age_Grp_60-70                0            0.000000
Age_Grp_70-80                0            0.000000
Age_Grp_80-90                0            0.000000
Age_Grp_90-100               0            0.000000
Sex_F                        0 

In [29]:
# Count the missing values in each column
missing_values = data.isna().sum()
print(missing_values)

PATID               0
Race              464
Marital_Status     13
Comorbidities     639
Smoking_Status      0
Encounter Type      0
Target              0
SBP below 120       0
SBP 120-140         0
SBP above 140       0
DBP below 80        0
DBP 80-90           0
DBP above 90        0
Age_Grp_50-60       0
Age_Grp_60-70       0
Age_Grp_70-80       0
Age_Grp_80-90       0
Age_Grp_90-100      0
Sex_F               0
Sex_M               0
dtype: int64


In [30]:
# Impute missing values in the 'Race' column with its most frequent value.
# The result is assigned back to the column: fillna(inplace=True) on
# `data[col]` is chained assignment and does not update `data` when pandas
# Copy-on-Write is active.
race_mode = data['Race'].mode()[0]
data['Race'] = data['Race'].fillna(race_mode)

# Verify the changes after imputation
print("\nValue counts for 'Race' column after imputation:")
print(data['Race'].value_counts(dropna=False))


Value counts for 'Race' column after imputation:
WHITE                                      113364
BLACK OR AFRICAN AMERICAN                    7436
SOME OTHER RACE                              1125
ASIAN                                        1076
AMERICAN INDIAN OR ALASKAN NATIVE             211
NATIVE HAWAIIAN OR OTHER PACIFIC ISLAND        49
Name: Race, dtype: int64


# Define the Mapping Function for 'Race'

In [31]:
# Normalize and clean the string and Define the mapping function for 'Race'
# Treat any other unknown or unexpected values as Unknown

def map_race(status):
    """Normalise a raw race label to one of the known lower-case groups.

    Parameters
    ----------
    status : Any
        Raw value from the 'Race' column. Strings are lower-cased and stripped
        of surrounding whitespace before matching.

    Returns
    -------
    str
        The normalised label when it is one of the six recognised groups,
        otherwise 'unknown'. Non-string inputs (NaN, None, numbers) also
        yield 'unknown' instead of raising AttributeError.
    """
    known_groups = (
        'white',
        'black or african american',
        'some other race',
        'asian',
        'american indian or alaskan native',
        'native hawaiian or other pacific island',
    )

    # Missing or otherwise non-text values cannot be normalised
    if not isinstance(status, str):
        return 'unknown'

    normalized = status.lower().strip()
    return normalized if normalized in known_groups else 'unknown'

# Normalise every 'Race' value through map_race into a helper column
data['Race_Grouped'] = data['Race'].map(map_race)

# Verify the changes
race_grouped_counts = data['Race_Grouped'].value_counts(dropna=False)
print(race_grouped_counts)

# One-hot encode the helper column into 'Race_<group>' indicators
# (get_dummies replaces 'Race_Grouped' with the indicator columns)
data = pd.get_dummies(data, columns=['Race_Grouped'], prefix='Race')

white                                      113364
black or african american                    7436
some other race                              1125
asian                                        1076
american indian or alaskan native             211
native hawaiian or other pacific island        49
Name: Race_Grouped, dtype: int64


In [10]:
# Sum every column whose name contains 'Race_' to get the row count per race indicator
race_indicator_columns = [name for name in data.columns if 'Race_' in str(name)]
race_counts = data[race_indicator_columns].sum()
print(race_counts)

# Display the dataframe to verify encoding
print(data.head())


In [11]:
# Dictionary mapping the one-hot 'Race_<group>' column names to their final names.
# Every key must match an existing column exactly: rename() ignores keys it
# cannot find, so a misspelt key leaves that column un-renamed without error.
new_column_names = {
    'Race_american indian or alaskan native': 'AMERICAN_IND/ALASKAN',
    'Race_asian': 'ASIAN',
    'Race_black or african american': 'BLACK/AFRIC_AMERICAN',
    'Race_native hawaiian or other pacific island': 'NAT_HAWAIIN',
    'Race_some other race': 'OTHER',
    'Race_white': 'WHITE',
}

# Rename the columns using the rename method
data = data.rename(columns=new_column_names)

# Drop the original 'Race' column as it has been encoded
# (errors='ignore' keeps the cell from failing if the column is already gone)
data = data.drop(columns=['Race'], errors='ignore')

# Verify the changes
data.info()

# 5. Imputation, Mapping and Encoding 'Marital_Status' column

In [34]:
# Inspect 'Marital_Status' before cleaning: distinct values, then the
# frequency of each value (NaN included)
marital_status = data['Marital_Status']
unique_Marital_Status = marital_status.unique()
print(unique_Marital_Status)

print(marital_status.value_counts(dropna=False))

['Married' 'Widowed' 'Divorced' 'Single' 'Separated' 'Unknown' nan
 'Life Partner']
Married         70676
Single          20944
Divorced        15038
Widowed         13494
Unknown          1517
Separated        1500
Life Partner       79
NaN                13
Name: Marital_Status, dtype: int64


In [35]:
# Treat the literal label 'Unknown' as missing so it is imputed with the rest
data['Marital_Status'] = data['Marital_Status'].replace({'Unknown': np.nan})

# Verify the changes
print(
    "Value counts for 'Marital_Status' column after replacing 'Unknown' with NaN:",
    data['Marital_Status'].value_counts(dropna=False),
    sep="\n",
)

Value counts for 'Marital_Status' column after replacing 'Unknown' with NaN:
Married         70676
Single          20944
Divorced        15038
Widowed         13494
NaN              1530
Separated        1500
Life Partner       79
Name: Marital_Status, dtype: int64


In [36]:
# Impute missing values in the 'Marital_Status' column with its most frequent value.
# The result is assigned back to the column: fillna(inplace=True) on
# `data[col]` is chained assignment and does not update `data` when pandas
# Copy-on-Write is active.
mode_value = data['Marital_Status'].mode()[0]
data['Marital_Status'] = data['Marital_Status'].fillna(mode_value)

# Verify the changes after imputation
print("\nValue counts for 'Marital_Status' column after imputation:")
print(data['Marital_Status'].value_counts(dropna=False))

# One-hot encoding on the 'Marital_Status' column
data = pd.get_dummies(data, columns=['Marital_Status'], prefix='Marital_Status')


Value counts for 'Marital_Status' column after imputation:
Married         72206
Single          20944
Divorced        15038
Widowed         13494
Separated        1500
Life Partner       79
Name: Marital_Status, dtype: int64


In [12]:
# Preview the first rows and the column summary to verify that the
# 'Marital_Status_*' indicator columns were created

print(data.head())
data.info()

In [13]:
# One-hot encoding produced one 'Marital_Status_<value>' column per status.
# Strip that prefix so each indicator column is named after the status itself.
marital_statuses = ['Divorced', 'Life Partner', 'Married', 'Separated', 'Single', 'Widowed']
new_column_names = {f'Marital_Status_{status}': status for status in marital_statuses}

data = data.rename(columns=new_column_names)

# Verify the changes
print(data.columns)
data.info()

# 6. Mapping and Encoding 'Encounter Type' Column

In [14]:
# Frequency of each raw 'Encounter Type' value
encounter_type_counts = data['Encounter Type'].value_counts()
print("Encounter Type Counts:", encounter_type_counts, sep="\n")

In [15]:
# Expand the comma-separated 'Encounter Type' string into one 0/1 indicator
# column per distinct token.
# NOTE(review): the split is on a bare ',' - if the raw values carry spaces
# around the commas, tokens such as ' INPATIENT' become separate columns; confirm.
split_encounters = data['Encounter Type'].str.get_dummies(sep=',')

# Append the indicators, then remove the source column
data = pd.concat((data, split_encounters), axis='columns')
data = data.drop(columns='Encounter Type')

# Display the updated DataFrame
print("Updated DataFrame with One Hot Encoding for 'Encounter Type':")
print(data.head())


In [16]:
# Show only the indicator columns that came from 'Encounter Type'
encoded_columns = split_encounters.columns
print("One Hot Encoded Columns for 'Encounter Type':")
print(data.loc[:, encoded_columns].head())

data.info()

# Rename the columns 'EMERGENCY' to 'EcType_ED', 'INPATIENT' to 'EcType_IP', and 'OUTPATIENT' to 'EcType_AV'

In [17]:
# Rename the encounter-type indicator columns to their short codes
data = data.rename(columns={
    'EMERGENCY': 'EcType_ED',
    'INPATIENT': 'EcType_IP',
    'OUTPATIENT': 'EcType_AV'
})

# Display the DataFrame summary. info() prints its report itself and returns
# None, so it is called directly (wrapping it in print() emits a stray "None").
data.info()

print(data.head())


In [18]:
# Column summary after the encounter-type columns were renamed
data.info()

# 7. Imputing, Mapping and Encoding for 'Comorbidities' Column

In [19]:
# Count the missing values in each column
missing_values = data.isna().sum()
print(missing_values)

In [20]:
# Inspect 'Comorbidities' before encoding: distinct raw values, then the
# frequency of each value (NaN included)
comorbidities_column = data['Comorbidities']
unique_Comorbidities = comorbidities_column.unique()
print(unique_Comorbidities)

print(comorbidities_column.value_counts(dropna=False))

In [46]:
# Diseases that each get their own 0/1 indicator column
diseases_to_encode = [
    'Diabetes',
    'Type 2 Diabetes Mellitus',
    'Epilepsy',
    'Depression',
    'Obesity',
    'Stroke',
    'Anxiety',
    'Hypertension',
    'Hyperlipidemia',
    'Cardiovascular Disease',
    'Sleep Disorder',
    'Headache',
    'Periodontitis',
    'Concussion',
    'Heart Disease',
    'Sleep Apnea',
    'Insomnia',
    'Kidney Disease',
    'Cholesterol',
    'Vitamin D Deficiency',
    'Enlarge Prostate',
    'Osteoporosis',
    'Bone Disease',
    'Depressive Disorder',
]

# Define the function to create a new column for each disease

def map_comorbidities(comorbidities, disease):
    """Flag whether `disease` is mentioned in a comorbidity string.

    Parameters
    ----------
    comorbidities : str or missing
        Raw value of the 'Comorbidities' column for one row.
    disease : str
        Disease name to look for.

    Returns
    -------
    int
        1 when `disease` occurs as a case-insensitive substring of
        `comorbidities`, otherwise 0. A missing `comorbidities` value gives 0.
    """
    if pd.isna(comorbidities):
        return 0
    mentioned = disease.lower() in comorbidities.lower()
    return int(mentioned)

# Add one 0/1 indicator column per disease; spaces in the disease name become
# underscores in the column name. The disease is passed to map_comorbidities
# as a keyword argument for every row.
for disease in diseases_to_encode:
    indicator_name = disease.replace(' ', '_')
    data[indicator_name] = data['Comorbidities'].apply(map_comorbidities, disease=disease)

In [21]:
# Report how many rows are flagged for each disease indicator column
for disease in diseases_to_encode:
    print(f"Count of {disease}: {data[disease.replace(' ', '_')].sum()}")

In [22]:
# Preview the first rows and the column summary to verify that the disease
# indicator columns were added

print(data.head())
data.info()

In [23]:
# Snapshot before dropping
print(data.head())

# Drop the raw 'Comorbidities' text column and the 'Osteoporosis' indicator.
# NOTE(review): the reason for discarding 'Osteoporosis' right after encoding
# it is not visible here - confirm it is intentional.
columns_to_discard = ['Comorbidities', 'Osteoporosis']
data = data.drop(columns=columns_to_discard)

# Display the dataframe
print(data.head())
data.info()

In [24]:
# Move the 'Target' column to the end: pop() removes it from the frame and
# returns it, and assigning it back appends it as the last column.
data['Target'] = data.pop('Target')

# Summarise the DataFrame to verify the change. info() prints its report
# itself and returns None, so it is called directly (wrapping it in print()
# emits a stray "None").
data.info()
print(data.head())


# Save the Preprocessed DataFrame to a CSV File

In [None]:
# Save the DataFrame to a CSV file for ML analysis.
# index=False leaves the row index out of the file.

data.to_csv('Combined_ML_3Yrs_ML_Analysis.csv', index=False)