In [4]:
# Step 1: Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from scipy.stats import ttest_ind

# Step 2: Read the healthcare CSV and tidy its headers in one pass.
# Column names are stripped of surrounding whitespace so later lookups by
# name are not defeated by stray padding in the file's header row.
df = pd.read_csv('Day_15_Healthcare_Data.csv').rename(columns=str.strip)

# Step 3: First look at the data
# Schema overview: column names, non-null counts and dtypes
# (DataFrame.info writes its report directly to stdout)
print("Initial Dataset Information:")
df.info()

# Leading rows, to see what individual records look like
leading_rows = df.head()
print("First Few Rows of Dataset:")
print(leading_rows)

# Missing-data workflow: quantify and visualise gaps, impute them three ways
# (simple statistics, KNN, linear regression) and compare the outcome with the
# untouched data.
#
# Every column the steps below index by name must be present. Names are
# compared case-sensitively against the whitespace-stripped headers, so a
# header such as 'Age' does NOT satisfy 'age'.
required_columns = ['age', 'bmi', 'blood_pressure', 'sex']
absent_columns = [name for name in required_columns if name not in df.columns]

if absent_columns:
    # Report each absent column up front instead of failing later on a KeyError
    for name in absent_columns:
        print(f"Column '{name}' is missing from the dataset")
else:
    # Keep an untouched snapshot: every "before"/"original" statistic, test and
    # plot below must describe the data prior to any imputation.
    df_original = df.copy()

    # Step 4: Identifying missing data
    missing_data = df.isna().sum()  # Count missing values per column
    missing_percentage = df.isna().mean() * 100  # Percentage of missing values per column

    print("Missing Values Count:")
    print(missing_data)

    print("Percentage of Missing Values:")
    print(missing_percentage)

    # Step 5: Visualizing missing data pattern using a heatmap
    plt.figure(figsize=(10, 7))
    sns.heatmap(df.isna(), cbar=False, cmap='viridis')
    plt.title("Missing Data Heatmap")
    plt.show()

    # Step 6: Simple imputation, applied to `df` itself

    ## 6.1: Mean/Median imputation for numerical columns
    df['age'] = df['age'].fillna(df['age'].median())  # Median: robust if 'age' is skewed
    df['bmi'] = df['bmi'].fillna(df['bmi'].mean())  # Mean for 'bmi'
    df['blood_pressure'] = df['blood_pressure'].fillna(df['blood_pressure'].median())

    ## 6.2: Mode imputation for categorical columns
    df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

    # Step 7: K-Nearest Neighbors (KNN) Imputation
    # Start from the snapshot, not from `df`: after Step 6 the numeric columns
    # handled there have no gaps left, so KNN would have nothing to impute.
    df_knn_imputed = df_original.copy()
    df_knn_imputed['sex'] = df['sex']  # KNN is numeric-only; reuse the mode-filled values
    knn_imputer = KNNImputer(n_neighbors=5)
    # Ensure we only apply KNN to numeric columns
    numeric_columns = df_knn_imputed.select_dtypes(include=['float64', 'int64']).columns
    df_knn_imputed[numeric_columns] = knn_imputer.fit_transform(df_knn_imputed[numeric_columns])

    # Step 8: Regression imputation of 'age' from 'bmi' and 'sex'
    # Predictors come from the simply-imputed `df` (so they are gap-free where
    # possible); the target and the "which ages were missing" mask come from
    # the snapshot. Encoding the whole frame once guarantees that training and
    # prediction rows share exactly the same dummy columns.
    predictors = pd.get_dummies(df[['bmi', 'sex']], drop_first=True, dtype=float)
    age_was_missing = df_original['age'].isna()
    usable_rows = predictors.notna().all(axis=1)  # rows whose predictors are complete
    train_rows = usable_rows & ~age_was_missing
    predict_rows = usable_rows & age_was_missing

    # Result lives in its own frame; rows that cannot be predicted keep the
    # median-filled age inherited from `df`.
    df_regression_imputed = df.copy()
    regression_model = LinearRegression()
    # Fitting or predicting on zero rows raises in scikit-learn, so only run
    # the model when there is something to learn from AND something to fill.
    if train_rows.any() and predict_rows.any():
        X_train = predictors[train_rows]
        y_train = df_original.loc[train_rows, 'age']
        regression_model.fit(X_train, y_train)

        X_missing = predictors[predict_rows]
        df_regression_imputed.loc[predict_rows, 'age'] = regression_model.predict(X_missing)

    # Step 9: Compare results of different imputation methods
    print("Summary Statistics Before Imputation:")
    print(df_original.describe())

    # For KNN Imputed Data
    print("Summary Statistics After KNN Imputation:")
    print(df_knn_imputed.describe())

    # For Regression Imputed Data
    print("Summary Statistics After Regression Imputation:")
    print(df_regression_imputed.describe())

    # Step 10: Evaluate the effect of imputation using t-test (example for 'age')
    original_age = df_original['age'].dropna()  # observed ages only
    imputed_age_knn = df_knn_imputed['age'].dropna()

    # Perform t-test to compare original and KNN imputed 'age'
    t_stat, p_value = ttest_ind(original_age, imputed_age_knn)
    print(f"T-statistic: {t_stat}, P-value: {p_value}")

    # Step 11: Visualizing the impact of imputation with boxplots
    plt.figure(figsize=(10, 5))

    # Plotting boxplot for original 'age'
    plt.subplot(1, 2, 1)
    sns.boxplot(x=original_age)
    plt.title("Original Age Distribution")

    # Plotting boxplot for KNN imputed 'age'
    plt.subplot(1, 2, 2)
    sns.boxplot(x=df_knn_imputed['age'])
    plt.title("KNN Imputed Age Distribution")

    plt.tight_layout()
    plt.show()

    # Step 12: Visualizing Histograms to compare distributions
    plt.figure(figsize=(10, 5))

    # Original Age Distribution
    sns.histplot(original_age, kde=True, color='blue', label='Original', stat='density')

    # KNN Imputed Age Distribution
    sns.histplot(df_knn_imputed['age'], kde=True, color='red', label='KNN Imputed', stat='density')

    plt.legend()
    plt.title("Age Distribution Before and After KNN Imputation")
    plt.show()


Initial Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Patient_ID      505 non-null    int64  
 1   Age             505 non-null    int64  
 2   Gender          484 non-null    object 
 3   Blood_Pressure  475 non-null    float64
 4   Cholesterol     485 non-null    float64
 5   Diabetes        505 non-null    object 
 6   Heart_Disease   505 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 27.7+ KB
First Few Rows of Dataset:
   Patient_ID  Age  Gender  Blood_Pressure  Cholesterol Diabetes Heart_Disease
0           1   69    Male            95.0        122.0       No            No
1           2   32    Male           129.0        191.0       No            No
2           3   89  Female           101.0        214.0       No            No
3           4   78  Female           142.0        203.0 