### Module and Library Imports

In [None]:

import os  # For interacting with the operating system
import numpy as np  # For numerical operations
import pygal  # For generating interactive charts
import pickle  # For serializing and deserializing Python objects
from prettytable import PrettyTable  # For displaying tabular data in a visually appealing ASCII format
from sklearn.model_selection import train_test_split, GridSearchCV  # For splitting data and performing grid search
from sklearn.preprocessing import StandardScaler  # For standardizing features
from sklearn.linear_model import LogisticRegression  # For logistic regression modeling
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score  # For model evaluation metrics
import pandas as pd  # For data manipulation and analysis
import seaborn as sns  # For statistical data visualization
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
from wtforms import fields  # For creating web forms
import altair as alt  # For declarative statistical visualization library
from wtforms import SelectField
import plotly.graph_objs as go
import plotly.express as px

### Data Loading

In [None]:
# Read the train and test CSV files from the `datasets` folder that sits
# under the current working directory
current_dir = os.getcwd()
datasets_dir = os.path.join(current_dir, 'datasets')

train_data_path = os.path.join(datasets_dir, 'Paitients_Files_Train.csv')
test_data_path = os.path.join(datasets_dir, 'Paitients_Files_Test.csv')

train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)


### Data Overview

In [None]:
# Preview the first ten rows of the training data
train_data.head(10)

In [None]:

# Summarize the training frame: column names, non-null counts and dtypes
train_data.info()

In [None]:
# Report the (rows, columns) tuple of the training data
training_shape = train_data.shape
print("Shape of Training Dataset:", training_shape)

In [None]:
# Print a heading followed by the first five rows (plain-text rendering)
print("Head of Training Dataset:", train_data.head(), sep="\n")


In [None]:
# Data dictionary: maps each raw column name to a human-readable description
column_description = dict(
    ID='Unique number to represent patient ID',
    PRG='Plasma glucose',
    PL='Blood Work Result-1 (mu U/ml)',
    PR='Blood Pressure (mm Hg)',
    SK='Blood Work Result-2 (mm)',
    TS='Blood Work Result-3 (mu U/ml)',
    M11='Body mass index (weight in kg/(height in m)^2)',
    BD2='Blood Work Result-4 (mu U/ml)',
    Age='Patients age (years)',
    Insurance='If a patient holds a valid insurance card',
    Sepssis='Target: Positive if a patient in ICU will develop sepsis, Negative otherwise',
)

# Emit one "name: description" line per column, in insertion order
print("Description of Columns:")
print("\n".join(f"{name}: {meaning}" for name, meaning in column_description.items()))

In [None]:
# Look at the columns in the dataset and their data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Information about Training Dataset:")
train_data.info()

In [None]:
# Descriptive statistics for the training data, printed under a heading
summary_stats = train_data.describe()
print("Description of Training Dataset:", summary_stats, sep="\n")

In [None]:
# Count NaN entries per column
missing_per_column = train_data.isna().sum()
print("Missing Values in Training Dataset:", missing_per_column, sep="\n")

In [None]:
# Flag rows that repeat an earlier row, then report how many and which ones
duplicates = train_data.duplicated()
duplicate_total = duplicates.sum()
duplicate_rows = train_data.loc[duplicates]

print("Number of duplicates:", duplicate_total)
print("Duplicate rows:", duplicate_rows, sep="\n")

In [None]:
# List the distinct values found in every column
for column, column_values in train_data.items():
    unique_values = column_values.unique()
    print(f"Unique values in {column}: {unique_values}")

In [None]:
# Collect values lying beyond the 1.5 * IQR fences for every non-object column
outliers = {}

for col in train_data.columns:
    series = train_data[col]
    if series.dtype == 'object':
        continue  # object-typed columns are skipped

    Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    beyond_fences = (series < lower_bound) | (series > upper_bound)
    outliers[col] = series[beyond_fences].tolist()

# Show the outlying values found for each column
for col, col_outliers in outliers.items():
    print(f"Outliers in {col}: {col_outliers}")


In [None]:
# Box plot of the training frame's columns for a visual check of spread and outliers
train_data.boxplot()

## Here are the issues identified in the dataset and potential solutions:

### Data Types:

- The 'ID' column is of type object, which is suitable for unique identifiers.
- The 'Sepssis' column is also of type object, which suggests it might contain categorical data.
- Several numerical columns contain an excess of zero values.
- The target variable 'Sepssis' may exhibit an imbalanced class distribution.
- Several numerical columns contain numerous outliers.
- Potential correlations among predictor variables may lead to multicollinearity.

### Outliers:

Outliers are detected in several numerical columns like 'PRG', 'PR', 'SK', 'TS', 'M11', 'BD2', and 'Age'. Outliers can significantly affect model performance and need to be addressed. One approach is to cap (clip) the outliers at a chosen threshold; another is to use a technique such as Winsorization to replace them with more reasonable values.

### Missing Values:

No missing values are reported in the dataset, which is a good sign. However, it's essential to verify whether missing values are encoded differently, for example as placeholder values such as 0, -1, or 999.

### Duplicates:

No duplicate rows are reported, which is also a positive finding. Duplicate rows, if present, need to be removed to prevent bias in the analysis.

### Categorical Data Encoding:

The 'Sepssis' column appears to contain categorical data. If it's binary (yes/no), it could be encoded as 0 and 1 for no and yes, respectively, or using one-hot encoding if there are more than two categories.

### Column Names:

The column names are not very descriptive. It's recommended to rename the columns to more meaningful names for better clarity and interpretability.

### Data Distribution:

It's essential to visualize the distribution of each numerical feature to understand the data better and identify any skewness or anomalies that may require further investigation.

### Scaling:

Depending on the algorithms to be used, it might be necessary to scale the numerical features to ensure they contribute equally to the model fitting process.

### Data Interpretation:

Understanding the context of the data is crucial. Domain knowledge can help in interpreting the features correctly and making informed decisions during data preprocessing and analysis.

By addressing these issues, the dataset can be prepared for further analysis and modeling, ensuring better performance and more reliable results.

### Hypothesis:
- Null Hypothesis: There is no association between the presence of comorbidities and the likelihood of developing Sepsis among patients in the dataset.
- Alternate Hypothesis: Patients with underlying comorbidities are more likely to develop Sepsis compared to those without comorbidities within the dataset.

### Analytical Questions:
1. What is the prevalence of comorbidities among patients diagnosed with Sepsis compared to those without Sepsis?
2. Are specific comorbidities more commonly observed among patients diagnosed with Sepsis?
3. How does the distribution of comorbidities vary across different age groups in the dataset?
4. What are the demographic characteristics (e.g., age, insurance status) of patients with and without comorbidities?
5. Is there a correlation between the number of comorbidities a patient has and the likelihood of developing Sepsis within the dataset?
6. Do patients with specific comorbidities exhibit a higher risk of developing severe forms of Sepsis?
7. How does the presence of comorbidities impact the prognosis and outcomes of Sepsis patients within the dataset?

## Data Cleaning

In [None]:
# Drop the patient identifier column from the training frame.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after 'ID' has
# already been removed no longer raises a KeyError.
train_data = train_data.drop(columns=['ID'], errors='ignore')


In [None]:
# Give the terse source columns descriptive names
train_data = train_data.rename(columns={
    "PRG": "Plasma_glucose",
    "PL": "Blood_Work_R1",
    "PR": "Blood_Pressure",
    "SK": "Blood_Work_R2",
    "TS": "Blood_Work_R3",
    "M11": "BMI",
    "BD2": "Blood_Work_R4",
    "Age": "Patient_age",
    "Sepssis": "Target_Sepsis"
})

# Replace zeros in each measurement column with that column's median.
# The median is taken over the NON-zero values only: including the zeros
# being replaced would pull the median down, and if at least half of a column
# is zero the median itself would be 0, so nothing would be imputed at all.
# NOTE(review): this treats 0 as "missing" in every listed column — confirm
# that 0 is never a legitimate reading for any of them.
columns_with_zeros = ['Plasma_glucose', 'Blood_Work_R1', 'Blood_Pressure', 'Blood_Work_R2', 'Blood_Work_R3', 'BMI', 'Blood_Work_R4']
for col in columns_with_zeros:
    non_zero_values = train_data.loc[train_data[col] != 0, col]
    if non_zero_values.empty:
        # No non-zero reading to estimate a median from; leave the column as is
        continue
    median_val = non_zero_values.median()
    train_data[col] = train_data[col].replace(0, median_val)

In [None]:
# Display the training frame after the column renaming and zero replacement
train_data

In [None]:
# Restrict the frame to its numeric columns
numeric_columns = train_data.select_dtypes(include=[np.number]).columns
num_df = train_data[numeric_columns]

# Quartiles and inter-quartile range, computed per column
Q1, Q3 = num_df.quantile(0.25), num_df.quantile(0.75)
IQR = Q3 - Q1

# A column is flagged when at least one value falls outside the 1.5 * IQR fences
below_fence = num_df.lt(Q1 - 1.5 * IQR)
above_fence = num_df.gt(Q3 + 1.5 * IQR)
outliers = (below_fence | above_fence).any()

# Show the per-column flags as a single-row frame
outliers_df = outliers.to_frame().T
print(outliers_df)

All numeric columns except `Insurance` have outliers.

In [None]:
# Per-column quartiles and inter-quartile range of the numeric frame
Q1, Q3 = num_df.quantile(0.25), num_df.quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a value counts as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows in which no column lies outside its fences
outlier_cells = num_df.lt(lower_bound) | num_df.gt(upper_bound)
rows_to_keep = ~outlier_cells.any(axis=1)
cleaned_df = num_df[rows_to_keep]

# Report how many rows/columns remain
print("Shape of cleaned DataFrame:", cleaned_df.shape)

In [None]:
# Box plot of the training frame on an explicitly created 10x6 axes
boxplot_fig, boxplot_ax = plt.subplots(figsize=(10, 6))
train_data.boxplot(ax=boxplot_ax)

# Tilt the column names by 45 degrees so they do not overlap
plt.setp(boxplot_ax.get_xticklabels(), rotation=45)

plt.show()

#### Univariate Analysis

In [None]:
# Seaborn theme used for the plot
sns.set(style="ticks", color_codes=True)

# 3 x 3 grid of panels, addressed below as a flat sequence
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 12))
axes = axes.ravel()

# One histogram (with KDE overlay) per column of the outlier-free frame
for panel_index, column_name in enumerate(cleaned_df.columns):
    panel = axes[panel_index]
    sns.histplot(cleaned_df[column_name], kde=True, ax=panel)
    panel.set_title(column_name)

fig.tight_layout()
plt.show()


In [None]:
# Preview the first ten rows of the training frame at this stage
train_data.head(10)

In [None]:
# Convert the 'Target_Sepsis' column from its text labels to boolean values.
# The dtype guard keeps this cell idempotent: mapping an already-boolean
# column a second time would turn every value into NaN, because True/False
# are not keys of the label mapping.
if train_data['Target_Sepsis'].dtype != bool:
    sepsis_flags = train_data['Target_Sepsis'].map({'Positive': True, 'Negative': False})

    # Labels missing from the mapping come back as NaN; fail loudly here
    # rather than let NaN reach the boolean masks built from this column.
    unmapped = sepsis_flags.isna()
    if unmapped.any():
        unexpected_labels = train_data.loc[unmapped, 'Target_Sepsis'].unique().tolist()
        raise ValueError(f"Unexpected Target_Sepsis labels: {unexpected_labels}")

    train_data['Target_Sepsis'] = sepsis_flags.astype(bool)

In [None]:
# Per patient, count how many of the seven measurement columns hold a value
# greater than zero and store the result as 'Comorbidity_Count'.
# NOTE(review): an earlier cell replaces zeros in these same columns with the
# column median, so this count may be identical for nearly every row —
# confirm that it carries the intended information.
comorbidity_marker_columns = [
    'Plasma_glucose', 'Blood_Work_R1', 'Blood_Pressure',
    'Blood_Work_R2', 'Blood_Work_R3', 'BMI',
    'Blood_Work_R4',
]
marker_is_positive = train_data[comorbidity_marker_columns] > 0
train_data['Comorbidity_Count'] = marker_is_positive.sum(axis=1)

In [None]:
# Re-inspect column names, non-null counts and dtypes of the training frame
train_data.info()

In [None]:

# Split the rows on the 'Target_Sepsis' mask: masked-in rows are the positive
# cases, the complement are the negative cases
sepsis_mask = train_data['Target_Sepsis']
positive_cases = train_data.loc[sepsis_mask]
negative_cases = train_data.loc[~sepsis_mask]


In [None]:
# Print every column of the positive-case rows under a heading
print("Columns for positive cases:", positive_cases, sep="\n")

In [None]:

# Print a five-column subset of the negative-case rows under a heading
negative_preview_columns = [
    'Plasma_glucose',
    'Blood_Work_R1',
    'Blood_Pressure',
    'Blood_Work_R2',
    'Blood_Work_R3',
]
print("\nColumns for negative cases:")
print(negative_cases[negative_preview_columns])


In [None]:
# Distribution of comorbidities among positive cases.
# The columns are selected by name instead of by position (previously
# iloc[:, [1, 2, 3, 4, 5]]) so the selection cannot silently shift when the
# column order changes. These five names are the columns found at positions
# 1-5 once 'ID' has been dropped and the columns renamed.
# NOTE(review): the negative-case printout in this notebook lists
# Plasma_glucose..Blood_Work_R3 instead — confirm which five are intended.
comorbidity_columns = ['Blood_Work_R1', 'Blood_Pressure', 'Blood_Work_R2', 'Blood_Work_R3', 'BMI']
positive_comorbidities = positive_cases[comorbidity_columns]
positive_comorbidities_describe = positive_comorbidities.describe()

# Create a PrettyTable instance
table = PrettyTable()

# Lead with a "Statistic" column: without it the describe() rows are printed
# with no indication of which row is the count, the mean, the std, and so on.
table.field_names = ["Statistic", *positive_comorbidities_describe.columns]

# One table row per describe() statistic, labelled with the statistic name
for statistic, values in positive_comorbidities_describe.iterrows():
    table.add_row([statistic, *values])

# Display the table
print("Distribution of comorbidities among positive cases:")
print(table)

In [None]:
# Distribution of comorbidities among negative cases.
# The columns are selected by name instead of by position (previously
# iloc[:, [1, 2, 3, 4, 5]]) so the selection cannot silently shift when the
# column order changes. These five names are the columns found at positions
# 1-5 once 'ID' has been dropped and the columns renamed.
# NOTE(review): the negative-case printout in this notebook lists
# Plasma_glucose..Blood_Work_R3 instead — confirm which five are intended.
comorbidity_columns = ['Blood_Work_R1', 'Blood_Pressure', 'Blood_Work_R2', 'Blood_Work_R3', 'BMI']
negative_comorbidities = negative_cases[comorbidity_columns]
negative_comorbidities_describe = negative_comorbidities.describe()

# Create a PrettyTable instance
table = PrettyTable()

# Lead with a "Statistic" column: without it the describe() rows are printed
# with no indication of which row is the count, the mean, the std, and so on.
table.field_names = ["Statistic", *negative_comorbidities_describe.columns]

# One table row per describe() statistic, labelled with the statistic name
for statistic, values in negative_comorbidities_describe.iterrows():
    table.add_row([statistic, *values])

# Display the table
print("Distribution of comorbidities among negative cases:")
print(table)

In [None]:
# Distribution of comorbidities by age group.
# The title and x-axis label promise age GROUPS, but plotting the raw
# 'Patient_age' values draws one box per distinct age. Ages are therefore
# binned into bands first. The bands live on a temporary copy created with
# assign(), so train_data itself gains no extra column.
age_bin_edges = [0, 30, 40, 50, 60, np.inf]  # right-inclusive: (0, 30], (30, 40], ...
age_bin_labels = ['30 or younger', '31-40', '41-50', '51-60', 'Over 60']
age_grouped = train_data.assign(
    Age_Group=pd.cut(train_data['Patient_age'], bins=age_bin_edges, labels=age_bin_labels)
)

plt.figure(figsize=(12, 8))
sns.boxplot(x='Age_Group', y='Comorbidity_Count', hue='Target_Sepsis', data=age_grouped)
plt.title('Distribution of Comorbidities by Age Group and Sepsis Diagnosis')
plt.xlabel('Age Group')
plt.ylabel('Comorbidity Count')
plt.legend(title='Sepsis Diagnosis', loc='upper right')
plt.show()

In [None]:
# Count of patients per insurance status, split by comorbidity count
insurance_fig, insurance_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=train_data, x='Insurance', hue='Comorbidity_Count', ax=insurance_ax)
insurance_ax.set_title('Insurance Status of Patients with Different Comorbidity Counts')
insurance_ax.set_xlabel('Insurance Status')
insurance_ax.set_ylabel('Count')
insurance_ax.legend(title='Comorbidity Count', loc='upper right')
plt.show()


#### Bivariate analysis

In [None]:
# Compare prevalence of comorbidities among positive and negative cases.
# The grid width is derived from the number of compared columns instead of a
# hard-coded 5, so the cell keeps working if the column selection changes.
# squeeze=False guarantees a 2-D axes array even for a single column.
compared_columns = list(positive_comorbidities.columns)
n_compared = len(compared_columns)
fig, axes = plt.subplots(2, n_compared, figsize=(4 * n_compared, 8), squeeze=False)

# Top row: sepsis cases; bottom row: non-sepsis cases, one column per feature
for i, col in enumerate(compared_columns):
    sns.histplot(positive_comorbidities[col], kde=True, ax=axes[0, i], color='blue', label='Sepsis')
    sns.histplot(negative_comorbidities[col], kde=True, ax=axes[1, i], color='red', label='No Sepsis')
    axes[0, i].set_title(f'{col} Distribution (Sepsis)')
    axes[1, i].set_title(f'{col} Distribution (No Sepsis)')
    axes[0, i].legend()
    axes[1, i].legend()

plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram grid of the outlier-free numeric frame
sns.pairplot(cleaned_df)
plt.show()

# Annotated heatmap of the pairwise correlations between the same columns
correlation_matrix = cleaned_df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Average 'Comorbidity_Count' within each 'Target_Sepsis' group
sepsis_comorbidity_mean = train_data.groupby('Target_Sepsis')['Comorbidity_Count'].mean()

# Two-column table: group label on the left, its mean on the right
table = PrettyTable()
table.field_names = ["Sepsis Diagnosis", "Mean Comorbidity Count"]
table.align["Sepsis Diagnosis"] = "l"
table.align["Mean Comorbidity Count"] = "r"

# One row per group
for sepsis_status, mean_count in zip(sepsis_comorbidity_mean.index, sepsis_comorbidity_mean.values):
    table.add_row([sepsis_status, mean_count])

print(table)


In [None]:
# Box plot of 'Comorbidity_Count' for each 'Target_Sepsis' group
association_fig, association_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=train_data, x='Target_Sepsis', y='Comorbidity_Count', ax=association_ax)
association_ax.set_title('Association between Comorbidity Count and Sepsis Diagnosis')
association_ax.set_xlabel('Sepsis Diagnosis')
association_ax.set_ylabel('Comorbidity Count')
plt.show()