In [91]:
import os
import utils
import dotenv
import umap
import pandas as pd
import numpy as np
import dataextraction as db
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN, SpectralClustering, KMeans
import hdbscan

In [92]:
# Load the environment variables from the .env file.
# The call is the cell's last expression, so its boolean return value is shown as the cell output.
dotenv.load_dotenv()

True

In [93]:
# Connect to the database and pull the base dataset that every later cell works on.
conn = db.connect_to_database()
# NOTE(review): `cur` is not used by any cell visible in this part of the notebook — confirm it is needed.
cur = conn.cursor()
# NOTE(review): neither `cur` nor `conn` is closed in the visible cells — confirm they are released later.
df = db.get_base_dataset(conn)

Connected to the PostgreSQL database
PostgreSQL version: PostgreSQL 14.0, compiled by Visual C++ build 1914, 64-bit


In [94]:
# How many rows repeat a subject_id that already appeared earlier in the frame
df.duplicated(subset='subject_id').sum()

11976

In [95]:
# Keep only the rows flagged as a first ICU stay
is_first_icu_stay = df['first_icu_stay'].eq(True)
df = df.loc[is_first_icu_stay]

In [96]:
# Re-count the repeated subject_id values now that only first ICU stays remain
df.duplicated(subset='subject_id').sum()

9321

In [97]:
# Add the mean columns that the dataset lacks. Going by the helper names, the missing
# means are derived as the average of the matching minimum and maximum columns
# (TODO confirm against utils — the helpers are not visible here).
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so the cells
# below have no recorded output; consider timing or caching this step.
missing_mean_columns = utils.detect_missing_mean_columns(df)
df = utils.add_missing_mean_columns(df, missing_mean_columns)

KeyboardInterrupt: 

In [None]:
# Build the aggregation spec: every *_mean column is averaged over a subject's rows
mean_columns = [name for name in df.columns if '_mean' in name]
aggregation_functions = dict.fromkeys(mean_columns, 'mean')
aggregation_functions

In [None]:
# One row per subject_id holding the averaged *_mean measurements; this frame is
# merged with the remaining (non-statistic) columns further down.
df_aggregated = (
    df
    .groupby('subject_id')
    .agg(aggregation_functions)
    .reset_index()
)

In [None]:
# Display the aggregated dataframe (subject_id plus the averaged *_mean columns)
df_aggregated

In [None]:
# Number of duplicate subject_id's in the aggregated dataframe.
# The check targets the subject_id key itself: a bare DataFrame.duplicated() only flags
# rows that are identical in every column, which is not what this cell is meant to
# verify. This also matches the other duplicate checks in the notebook.
df_aggregated['subject_id'].duplicated().sum()

In [None]:
# Columns that are not per-measurement statistics, i.e. whose name contains none of
# '_mean', '_min' or '_max'. The means already live in df_aggregated (one row per
# subject_id), so only these remaining columns have to be carried over for the merge.
stat_markers = ('_mean', '_min', '_max')
other_columns = [
    col for col in df.columns
    if not any(marker in col for marker in stat_markers)
]
other_columns

In [None]:
# Collapse to one row per subject_id, taking the first value of each non-statistic column.
# NOTE(review): GroupBy.first() presumably skips nulls column by column, so a subject's
# row here may combine values that come from different source rows — confirm intended.
df_other_columns = df.groupby('subject_id', as_index=False)[other_columns].first()

In [None]:
# Join the averaged *_mean columns with the per-subject "first" columns. Both frames
# were grouped by subject_id, so subject_id is the join key; the *_min / *_max
# columns are in neither frame and therefore drop out of df here.
df = df_aggregated.merge(df_other_columns, on='subject_id')

In [None]:
# Sanity check after the merge: count of repeated subject_id values
df.duplicated(subset='subject_id').sum()

In [None]:
# Spot-check one aggregated column (glucose_mean) by displaying it
df['glucose_mean']

In [None]:
# Show the record(s) whose subject_id equals 3
df.loc[df['subject_id'].eq(3)]

In [None]:
# Inspect the column dtypes to decide which categorical columns need encoding and
# which columns can be removed because they are irrelevant to the analysis
df.dtypes

In [None]:
# List every column whose dtype name mentions 'date' (the datetime columns)
column_dtypes = df.dtypes.to_dict()
for column_name in column_dtypes:
    column_dtype = column_dtypes[column_name]
    if 'date' not in str(column_dtype):
        continue
    print(column_name, column_dtype)

In [None]:
# Remove the datetime columns from the frame
columns_to_remove = [
    'intime', 'outtime', 'dod', 'admittime',
    'dischtime', 'deathtime', 'edregtime', 'edouttime',
]
df = df.drop(columns=columns_to_remove)

In [None]:
# Display the dataframe to check that the datetime columns were dropped
df

In [None]:
# Detect the columns whose share of null values is above NULL_THRESHOLD_PCT percent
# of the rows. The cut-off actually applied is 50% (an earlier comment claimed 90%);
# it is named here so the comment and the code cannot drift apart again.
NULL_THRESHOLD_PCT = 50

n_rows = df.shape[0]
columns_to_remove = []
for key, value in df.isnull().sum().to_dict().items():
    # `n_rows and ...` guards against a division by zero on an empty frame
    if n_rows and ((value / n_rows) * 100) > NULL_THRESHOLD_PCT:
        print(key, value)
        columns_to_remove.append(key)

In [None]:
# Remove the columns collected in columns_to_remove by the null-share check
# (that check flags columns with more than 50% null values)
df = df.drop(columns=columns_to_remove)

In [None]:
# Display the dataframe to check that the null-heavy columns were dropped
df

In [None]:
# List the categorical candidates: every column whose dtype name mentions 'object'
dtype_by_column = df.dtypes.to_dict()
for column_name in dtype_by_column:
    column_dtype = dtype_by_column[column_name]
    if 'object' not in str(column_dtype):
        continue
    print(column_name, column_dtype)

In [None]:
# Encode the gender column as 0 for 'M' and 1 for 'F', then display it
gender_codes = {'M': 0, 'F': 1}
df['gender'] = df['gender'].map(gender_codes)
df['gender']

In [None]:
# Frequency-encode the diagnosis column, since it has too many unique values for
# one-hot encoding: each diagnosis is replaced by its relative frequency
# (value_counts with normalize=True) and stored in the new diagnosis_encoded column.
frequency_mapping = df['diagnosis'].value_counts(normalize=True)
df['diagnosis_encoded'] = df['diagnosis'].map(frequency_mapping)
df['diagnosis_encoded']

In [None]:
# For the rest of the categorical columns we use one-hot encoding
categorical_cols = ['marital_status', 'ethnicity_grouped',
                    'first_careunit', 'last_careunit', 'admission_type',
                    'admission_location', 'discharge_location', 'insurance']
df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# Remove the remaining unnecessary columns (diagnosis already has its encoded
# counterpart in diagnosis_encoded)
columns_to_remove = [
    'language',
    'religion',
    'diagnosis',
    'ethnicity',
    'dbsource',
]
df = df.drop(columns=columns_to_remove)

In [None]:
for key, val in df.dtypes.to_dict().items():
    if('object' in str(val)):
        print(key, val)

In [None]:
# Display the dataframe after encoding and column removal
df

In [None]:
# Saving the dataset for use in EDA later on (currently disabled: the export line below is commented out)
# df.to_csv(f'{os.getenv("ROOT_DIR")}\\data\\final.csv', index=False)

In [None]:
# Print the number of null values of every remaining column (no threshold is applied here)
null_counts = df.isnull().sum().to_dict()
for column_name, n_missing in null_counts.items():
    print(column_name, n_missing)

In [None]:
# Drop every row that still contains a null value, then display what is left
df = df.dropna()
df

In [None]:
# Standardise every column of df before the dimensionality reduction.
# NOTE(review): df still carries subject_id at this point (and presumably other
# identifier columns), so identifiers are scaled and fed to PCA as if they were
# features — confirm this is intended.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

In [None]:
# Reduce the standardised data to 20 principal components; the result feeds both
# t-SNE and UMAP. random_state is fixed (as for the other stochastic steps in this
# notebook) so that a randomized SVD solver, if scikit-learn selects one, gives the
# same components on every run.
pca = PCA(n_components=20, random_state=42)
pca_result = pca.fit_transform(scaled_data)

In [None]:
# 2-D t-SNE embedding of the PCA components (seeded for reproducibility)
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(pca_result)

In [None]:
# 2-D UMAP embedding of the PCA components (seeded for reproducibility);
# the clustering cells below are fitted on this embedding
umap_reducer = umap.UMAP(n_components=2, random_state=42)
umap_result = umap_reducer.fit_transform(pca_result)

In [None]:
# Scatter plot of the 2-D t-SNE embedding
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(tsne_result[:, 0], tsne_result[:, 1], c='b', marker='o', label='t-SNE')
ax.set_title('t-SNE Visualization')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.legend()
plt.show()


In [None]:
# Scatter plot of the 2-D UMAP embedding
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(umap_result[:, 0], umap_result[:, 1], c='r', marker='s', label='UMAP')
ax.set_title('UMAP Visualization')
ax.set_xlabel('UMAP Component 1')
ax.set_ylabel('UMAP Component 2')
ax.legend()
plt.show()

In [None]:
# Gaussian mixture clustering on the 2-D UMAP embedding; the labels are stored on df
# and used to colour the embedding.
n_clusters = 5

gmm = GaussianMixture(n_components=n_clusters, random_state=42)
gmm_labels = gmm.fit_predict(umap_result)

df['GMM_Cluster_UMAP'] = gmm_labels

plt.scatter(umap_result[:, 0], umap_result[:, 1], c=gmm_labels, cmap='viridis', alpha=0.7)
plt.colorbar()
# Both the fit and the plotted points use umap_result, so the title names UMAP
# (it previously said t-SNE).
plt.title('GMM Clustering using UMAP')
plt.show()

In [None]:
# DBSCAN clustering on the 2-D UMAP embedding; the labels are stored on df and used
# to colour the embedding.
eps = 0.8
min_samples = 5

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan_labels = dbscan.fit_predict(umap_result)

df['DBSCAN_Cluster'] = dbscan_labels

plt.scatter(umap_result[:, 0], umap_result[:, 1], c=dbscan_labels, cmap='viridis', alpha=0.7)
plt.colorbar()
# Both the fit and the plotted points use umap_result, so the title names UMAP
# (it previously said t-SNE).
plt.title('DBSCAN Clustering using UMAP')
plt.show()

In [None]:
# HDBSCAN clustering on the 2-D UMAP embedding; the labels are stored on df and used
# to colour the embedding.
min_cluster_size = 5

# Pass the value under the parameter it is named after. It was previously passed as
# min_samples, which only coincided with the intended setting because 5 is also
# HDBSCAN's default min_cluster_size; with min_samples left unset it follows
# min_cluster_size, so the result for the value 5 is unchanged.
hdb = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
hdb_labels = hdb.fit_predict(umap_result)

df['HDBSCAN_Cluster'] = hdb_labels

plt.scatter(umap_result[:, 0], umap_result[:, 1], c=hdb_labels, cmap='viridis', alpha=0.7)
plt.colorbar()
# Both the fit and the plotted points use umap_result, so the title names UMAP
# (it previously said t-SNE).
plt.title('HDBSCAN Clustering using UMAP')
plt.show()

In [None]:
# Spectral clustering on the 2-D UMAP embedding; the labels are stored on df and used
# to colour the embedding.
n_clusters = 5

spectral_clustering = SpectralClustering(n_clusters=n_clusters, eigen_solver='arpack', random_state=42)
spectral_labels = spectral_clustering.fit_predict(umap_result)

df['Spectral_Cluster'] = spectral_labels

plt.scatter(umap_result[:, 0], umap_result[:, 1], c=spectral_labels, cmap='viridis', alpha=0.7)
plt.colorbar()
# Both the fit and the plotted points use umap_result, so the title names UMAP
# (it previously said t-SNE).
plt.title('Spectral Clustering using UMAP')
plt.show()