### Statistical Analysis & Understanding Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import (
    encode_features,
    scale_features,
    look_for_outliers,
    handle_outliers_iqr,
    treat_skewness,
)

# from google.colab import drive

In [None]:
# drive.mount('/content/drive')

In [None]:
# Load the training data from the local CSV file and preview the first rows.
df = pd.read_csv("data/train data.csv")
df.head()

In [None]:
# Preview the last rows of the raw data.
df.tail()

In [None]:
# Count the missing values in each column of the raw data.
df.isnull().sum()

In [None]:
# Show column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Report the dataset's dimensions.
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")

In [None]:
# Print the full value-count table of every object-dtype column.
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    counts = df[col].value_counts()
    print(f"Value counts for {col}:\n{counts}\n")

In [None]:
# Report how many distinct (non-null) values each column holds.
for col, num_unique in df.nunique().items():
    print(f"Column '{col}' has {num_unique} unique values.")

In [None]:
# Flag rows that repeat an earlier row, then report their count and content.
duplicate_mask = df.duplicated()
print(f"Number of duplicate rows: {duplicate_mask.sum()}")

duplicated_rows = df[duplicate_mask]
print("\nDuplicated rows:")
print(duplicated_rows)

In [None]:
# 1. Summary statistics (count, mean, std, quartiles, ...) for the numeric columns
numeric_desc = df.describe()
print("Descriptive Statistics (Numeric Columns):", numeric_desc, sep="\n")

In [None]:
# 2. Ten most frequent values of every object-dtype column
categorical_cols = df.select_dtypes(include='object').columns
print("\nFrequency Distribution (Top 10 per Categorical Column):")
for col in categorical_cols:
    top_counts = df[col].value_counts().iloc[:10]
    print(f"\nTop values in '{col}':", top_counts, sep="\n")

In [None]:
# 3. Pairwise correlation of the numerical features (non-numeric columns are skipped)
correlation_matrix = df.corr(numeric_only=True)
print("\nCorrelation Matrix:", correlation_matrix, sep="\n")

In [None]:
# 4. Distribution plots (histograms with a KDE overlay) for numerical columns
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Three plots per row. Keep at least three rows so the layout is unchanged for
# up to nine columns, and add rows rather than fail when there are more.
n_plot_cols = 3
n_plot_rows = max(3, int(np.ceil(len(numeric_cols) / n_plot_cols)))

plt.figure(figsize=(15, 4 * n_plot_rows))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.histplot(df[col], kde=True, bins=30)
    plt.title(f'Histogram: {col}')
plt.tight_layout()
plt.show()

In [None]:
# 5. Boxplots for outlier detection, one per numerical column
# Three plots per row. Keep at least three rows so the layout is unchanged for
# up to nine columns, and add rows rather than fail when there are more.
n_plot_cols = 3
n_plot_rows = max(3, int(np.ceil(len(numeric_cols) / n_plot_cols)))

plt.figure(figsize=(15, 4 * n_plot_rows))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot: {col}')
plt.tight_layout()
plt.show()

In [None]:
# 6. Bar charts of the ten most frequent values per categorical column
plt.figure(figsize=(18, 18))
# Only the first nine columns fit into the 3x3 grid.
for i, col in enumerate(categorical_cols[:9], start=1):
    plt.subplot(3, 3, i)
    top_values = df[col].value_counts().head(10)
    top_values.plot(kind='bar')
    plt.title(f'Bar Chart: {col}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Preprocessing

In [None]:
# Per-column overview of missing data: dtype, missing count and missing share.
column_types = df.dtypes
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

missing_data = pd.DataFrame({
    'Column Type': column_types,
    'Missing Values': missing_values,
    'Percentage (%)': missing_percentage,
})

# Keep only the columns that actually contain missing values.
has_missing = missing_data['Missing Values'].gt(0)
missing_data = missing_data.loc[has_missing]

print(missing_data)

In [None]:
# Show the column names.
print(df.columns)

In [None]:
# Categorical columns: fill with "Unknown" when more than 5% of the values are
# missing, otherwise with the column's most frequent value.
categorical_cols = ['Blood Type', 'Doctor', 'Hospital', 'Insurance Provider', 'Admission Type']
for col in categorical_cols:
    if df[col].isnull().mean() > 0.05:
        fill_value = "Unknown"
    else:
        modes = df[col].mode()
        if modes.empty:
            # No non-null value to take a mode from; nothing sensible to fill with.
            continue
        fill_value = modes.iloc[0]
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: the in-place form acts on an intermediate object and does not
    # reliably update df (it is deprecated and a no-op under Copy-on-Write).
    df[col] = df[col].fillna(fill_value)

# Numerical columns: use the median.
df['Billing Amount'] = df['Billing Amount'].fillna(df['Billing Amount'].median())

In [None]:
# Rebuild the missing-data overview (dtype, missing count, missing share).
missing_values = df.isnull().sum()
missing_percentage = 100 * (missing_values / len(df))
column_types = df.dtypes

# Assemble the overview and keep only columns that still have missing values.
missing_data = pd.DataFrame({
    'Column Type': column_types,
    'Missing Values': missing_values,
    'Percentage (%)': missing_percentage,
}).loc[lambda frame: frame['Missing Values'] > 0]

print(missing_data)

In [None]:
# Re-check the per-column missing counts.
df.isnull().sum()

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Step 2: Standardize the casing of text-based categorical fields

# Title Case (every word capitalized) for these columns.
for col in ('Name', 'Doctor', 'Hospital', 'Insurance Provider'):
    df[col] = df[col].str.title()

# First character upper-case, the rest lower-case, for these columns.
for col in ('Gender', 'Medical Condition', 'Medication', 'Test Results', 'Admission Type'):
    df[col] = df[col].str.capitalize()


In [None]:
# Step 3: Convert date columns to datetime
# Values that cannot be parsed become NaT (errors="coerce"); ambiguous
# day/month strings are read day-first.
for date_col in ("Date of Admission", "Discharge Date"):
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce", dayfirst=True)

In [None]:
# Re-check missing counts; dates that failed to parse were coerced to NaT and
# are counted as missing here.
df.isnull().sum()

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Length of stay in whole days: discharge date minus admission date.
df['Length of Stay in Days'] = (df['Discharge Date'] - df['Date of Admission']).dt.days

# Reorder columns so 'Length of Stay in Days' sits immediately before the
# 'Test Results' column.
cols = [c for c in df.columns if c != 'Length of Stay in Days']
# Look the position up after taking the new column out of the list, so that
# re-running this cell keeps the same order instead of moving the column to
# the other side of 'Test Results'.
target_index = cols.index('Test Results')
cols.insert(target_index, 'Length of Stay in Days')
df = df[cols]

In [None]:
# Preview the frame with the new 'Length of Stay in Days' column in place.
df.head()

In [None]:
# Share of missing values per feature, in percent.
# NOTE: 'display.max_rows' is lifted here and not reset in this cell, so it
# stays in effect for the output of the cells that follow.
pd.set_option('display.max_rows', None)
missing_percentage = df.isnull().sum().div(len(df)).mul(100)
missing_df = pd.DataFrame({
    'Feature': df.columns,
    'Missing Percentage': missing_percentage,
})
print(missing_df)

In [None]:
# Count the rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Inspect outliers with the project helper imported from utils
# (what it reports is defined there, not in this notebook).
look_for_outliers(df)

In [None]:
# Collect the int64/float64 columns and hand them to the IQR outlier helper.
# NOTE(review): 'ID' and 'Room Number' are only dropped in a later cell, so
# they are part of this list if they are numeric; confirm that is intended.
numerical_features = list(
    df.select_dtypes(include=['int64', 'float64']).columns
)

df = handle_outliers_iqr(df, numerical_features)

In [None]:
# Run the outlier check again on the frame returned by handle_outliers_iqr.
look_for_outliers(df)

In [None]:
# List every column together with its dtype.
df_dtypes = pd.DataFrame({"Feature": df.columns, "Data Type": df.dtypes})

# Lift the row limit so the whole table is printed, then restore the default.
pd.set_option('display.max_rows', None)
print(df_dtypes)
pd.reset_option('display.max_rows')

In [None]:
# Drop the 'ID', 'Name' and 'Room Number' columns before encoding.
df = df.drop(columns=['ID', 'Name', 'Room Number'])

# Encode features and target with the project helper, then join them back
# into one frame with the target under its original column name.
X_encoded, y_encoded = encode_features(df, target_col="Test Results")
encoded_target = y_encoded.rename("Test Results")
df = pd.concat([X_encoded, encoded_target], axis=1)

In [None]:
# Column/dtype listing of the encoded frame.
df_dtypes = pd.DataFrame({
    "Feature": df.columns,
    "Data Type": df.dtypes,
})

# Print without row truncation, then restore the default row limit.
pd.set_option('display.max_rows', None)
print(df_dtypes)
pd.reset_option('display.max_rows')

In [None]:
# Preview the encoded frame.
df.head()

In [None]:
# int64/float64 feature columns, leaving out the 'Test Results' target.
numerical_features = [
    name
    for name in df.select_dtypes(include=['int64', 'float64']).columns
    if name != 'Test Results'
]

# Skewness treatment is delegated to the project helper from utils.
df, skewed_features, transformation_details = treat_skewness(df, numerical_features)

In [None]:
# Scale the features with the project helper from utils.
# NOTE(review): presumably applies standard scaling to every column except
# the target; confirm against the scale_features implementation.
df, scaled_cols = scale_features(df, target_col='Test Results', scaler_type='standard')

In [None]:
# Summary statistics of the numeric columns after preprocessing.
numeric_desc = df.describe()
print("Descriptive Statistics (Numeric Columns):", numeric_desc, sep="\n")

In [None]:
# Show dtypes and non-null counts of the preprocessed frame.
df.info()

In [None]:
# Print the column names of the preprocessed frame as a plain list.
print(df.columns.tolist())

### Statistical Analysis After Cleaning

In [None]:
# Histograms for numerical features
# Derive the numeric columns from the current frame so this cell does not
# depend on a variable defined in another cell.
numerical_cols = df.select_dtypes(include=[np.number]).columns

df[numerical_cols].hist(figsize=(16, 12), bins=30, color='skyblue', edgecolor='black')
plt.suptitle('Histograms of Numerical Features', fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Bar charts of the ten most frequent values for the categorical columns.
# `categorical_cols` was built before the encoding step, so plot only the
# names that still exist in the frame (a missing name would raise KeyError).
missing_cols = [col for col in categorical_cols if col not in df.columns]
if missing_cols:
    print(f"Skipping columns no longer present in df: {missing_cols}")
plot_cols = [col for col in categorical_cols if col in df.columns][:9]  # 3x3 grid holds nine

plt.figure(figsize=(18, 18))
for i, col in enumerate(plot_cols, 1):
    plt.subplot(3, 3, i)
    df[col].value_counts().head(10).plot(kind='bar')
    plt.title(f'Bar Chart: {col}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Encode the remaining categorical features as integers, then display a
# correlation heatmap of all features.

# Work on a copy so `df` itself is left untouched.
df_encoded = df.copy()

# Columns that still hold non-numeric labels.
categorical_cols = df_encoded.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Label-encode each column with pandas: values are converted to strings and
# replaced by the position of their label among the sorted unique labels
# (0 .. n_labels - 1). No extra import is needed for this.
for col in categorical_cols:
    labels = pd.Categorical(df_encoded[col].astype(str))
    df_encoded[col] = labels.codes.astype('int64')

# Plot the heatmap; numeric_only=True skips any column that is still
# non-numeric (for example datetimes), which corr() cannot process.
plt.figure(figsize=(14, 10))
sns.heatmap(df_encoded.corr(numeric_only=True),
            annot=True,
            cmap='hsv',
            fmt='.3f',
            linewidths=2)

plt.title('Correlation Heatmap (Including Encoded Categorical Features)', fontsize=16)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
def get_correlation_table(df, target_col='Test Results'):
    """Return the correlation of every numeric column with the target column.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the column to correlate against. Defaults to
            'Test Results'.

    Returns:
        A Series indexed by numeric column name whose values are the
        correlation coefficients with ``target_col`` (the target's own
        entry is included).

    Raises:
        KeyError: If ``target_col`` is not a numeric column of ``df``.
    """
    # numeric_only=True keeps non-numeric columns from making corr() fail.
    correlation_matrix = df.corr(numeric_only=True)

    if target_col not in correlation_matrix.columns:
        raise KeyError(
            f"'{target_col}' is not a numeric column of the DataFrame"
        )

    return correlation_matrix[target_col]


# Show each feature's correlation with 'Test Results' for the encoded frame.
get_correlation_table(df_encoded)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Replace these with actual column names from your dataset
feature_col = 'Age'      # Any categorical or discrete numeric feature
target_col = 'Test Results'    # Your class label column

# Fail with an explicit error (not an assert, which is skipped under -O)
# when either column is missing.
missing_columns = [name for name in (feature_col, target_col) if name not in df.columns]
if missing_columns:
    raise KeyError(f"Check your column names; not found in df: {missing_columns}")

# Create the crosstab and plot. DataFrame.plot opens its own figure, so no
# separate plt.figure() call is made (it would only leave an empty figure).
# NOTE(review): only two colours are listed; if the target has more than two
# classes, confirm how the remaining bars should be coloured.
pd.crosstab(df[feature_col], df[target_col]).plot(kind='bar',
                                                  figsize=(25, 8),
                                                  color=['gold', 'brown'])
plt.title(f'{target_col.capitalize()} Frequency for {feature_col.capitalize()}')
plt.xlabel(feature_col.capitalize())
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Draw a grid of plots showing the pairwise relationships between all numeric
# columns in df.
sns.pairplot(data=df)