In [None]:
import pandas as pd

# Load the raw kidney disease dataset (path is relative to the working directory)
kidney_disease = pd.read_csv(
    filepath_or_buffer=r'Data/kidney_disease/original_dataset.csv',
)

# Exploratory Data Analysis

## Parameters

In [None]:
# Columns considered irrelevant for the analysis; they are dropped from the frame
irrelevant_columns = [
    "id",
]

## Drop irrelevant columns

In [None]:
# errors="ignore" keeps this cell re-runnable: the frame is overwritten under the
# same name, so on a second run the columns are already gone and a plain drop()
# would raise KeyError.
kidney_disease = kidney_disease.drop(columns=irrelevant_columns, errors="ignore")

### Shape

In [None]:
# (number of rows, number of columns) of the current frame
kidney_disease.shape

## Features meaning

**Note: this section is only relevant in the context of chronic kidney disease.**

- Specific_gravity: urine specific gravity results fall within [1.002 , 1.030] if your kidneys are functioning normally (it indicates how well your kidneys are diluting your urine).

    - Higher specific gravity -> dehydrated
    - Lower specific gravity -> hydrated
    - Good health = [1.002 , 1.030]


- Albumin: is a protein found in the blood. A healthy kidney doesn't let albumin pass from the blood into the urine. A damaged kidney lets some albumin pass into the urine. The less albumin in your urine, the better.


    - Good health = 0 (maybe)


- Red_blood_cells: anemia is common in people who have chronic kidney disease. Red blood cells are made by the bone marrow. To get the marrow to make red blood cells, the kidneys make a hormone called erythropoietin, or EPO. When the kidneys are damaged, they may not make enough EPO. Without enough EPO, the bone marrow does not make enough red blood cells, and you have anemia.


    - Normal -> good health
    - Abnormal -> sick
    

- Sugar :


    - 0 -> good health
    - Greater than 0 -> sick (not sure)
    

- Pus cells are a collection of dead white blood cells that accumulate when the body’s immune system activates in response to an infection:


    - Normal -> small quantity in the urine (healthy person)
    - Abnormal -> not a small quantity (maybe sick)


- pus_cell_clumps: presence of pus cell clumps in the urine:


    - notpresent -> not sick 
    - present -> maybe sick
    

- Bacteria: presence of bacteria in the urine:


    - notpresent -> not sick (maybe)
    - present -> maybe sick
    
    

- blood_glucose_random: random blood glucose level (a blood measurement, in mg/dL):


    - the recommended targets for most people are:
        - Before meals: 90-130 mg/dL
        
        
        - Two hours after the start of a meal: below 180 mg/dL
    - 90 – 180 mg/dL -> maybe healthy
    - Otherwise -> maybe sick



- blood urea : Urea nitrogen is a waste product that your kidneys remove from your blood. Higher than normal BUN levels may be a sign that your kidneys aren't working well:


    - 7 - 20 mg/dL is a normal level -> maybe healthy
    - Else maybe sick  
    

- pus_cell_clumps: presence of pus cell clumps in the urine:


    - notpresent -> not sick 
    - present -> maybe sick


## Duplicate rows check

In [None]:
# Boolean mask that flags every row repeating an earlier row
repeated_row_mask = kidney_disease.duplicated()

# Keep only the flagged rows and show them (an empty table means no duplicates)
duplicates = kidney_disease.loc[repeated_row_mask]
display(duplicates)

**Conclusion**
- No duplicates

## Features and target type

In [None]:
# Data type pandas assigned to every column (features and target)
kidney_disease.dtypes

**Conclusion :**
- We need to handle the features with an object type and convert them to number types (int, float)

### Feature types : unique values

For each column that has an object type, we check the unique values

In [None]:
# Names of the columns stored with the generic object dtype
object_typed = [name for name in kidney_disease.columns
                if kidney_disease[name].dtype == object]

# Print each of those columns followed by its distinct values
for name in object_typed:
    print(name + '\t', kidney_disease[name].unique())

**Conclusion:**

- classification , coronary_artery_disease , diabetes_mellitus : contain noisy values that need to be cleaned, e.g. `ckd\t` should become `ckd`
- red_blood_cell_count (rc) , white_blood_cell_count (wc) , packed_cell_volume (pcv) : need to be converted to numbers

## Percentage of null value

In [None]:
# Share of missing values per column, expressed as a percentage of all rows
kidney_disease.isna().sum().div(len(kidney_disease)).mul(100)

In [None]:
# Column dtypes, shown next to the null percentages.
# NOTE(review): this repeats the listing from the "Features and target type"
# section — consider removing it if it is not needed here.
kidney_disease.dtypes

**Missing values must be handled before computing correlations, because some columns have a high percentage of nulls.**

## Data distribution

### Numeric features

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Split the columns by dtype: float/int columns are plotted as histograms here,
# object columns are kept for the bar charts of the categorical section.
continuous_cols = kidney_disease.select_dtypes(include=['float64', 'int64']).columns.tolist()
object_cols = kidney_disease.select_dtypes(include=['object']).columns.tolist()

# The following object-typed columns are not discrete.
# Filtering (instead of list.remove) avoids a ValueError if one of them is absent.
not_discrete = {'pcv', 'wc', 'rc'}
discrete_cols = [col for col in object_cols if col not in not_discrete]
print("Continuous Columns:\n", "  - ".join(continuous_cols))
print("\nDiscrete Columns:\n", "  - ".join(discrete_cols))

# Size the grid from the number of numeric columns instead of hard-coding 6 x 2:
# a fixed grid raises IndexError with more than 12 columns and leaves empty
# panels with fewer.
grid_cols = 2
grid_rows = max(1, -(-len(continuous_cols) // grid_cols))  # ceiling division
# squeeze=False always returns a 2-D array of axes, even for a single row
fig, axs = plt.subplots(grid_rows, grid_cols, figsize=(10, 10), squeeze=False)
flat_axes = axs.flatten()  # row-major order: left to right, then top to bottom

for ax, col in zip(flat_axes, continuous_cols):

    sns.histplot(data=kidney_disease[col], ax=ax, kde=True)

    # set the title and axis labels
    ax.set_title('The distribution of '+ col +' in the dataset ')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Remove the panels that have no column to show
for unused_ax in flat_axes[len(continuous_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

### Categorical features

In [None]:
import math

# Lay the bar charts out two per row; the figure height grows with the row count
num_cols = len(discrete_cols)
num_rows = math.ceil(num_cols / 2)
fig, axes = plt.subplots(nrows=num_rows, ncols=2, figsize=(10, 3*num_rows))
axes = axes.flatten()

# One bar chart of the value counts per discrete column
for ax, col in zip(axes, discrete_cols):
    category_counts = kidney_disease[col].value_counts()
    category_counts.plot(kind='bar', ax=ax)
    ax.set(title=col, xlabel=col, ylabel='Frequency')

# Remove any unused subplots (the slice is empty when every panel is used)
for spare_ax in axes[num_cols:]:
    fig.delaxes(spare_ax)

fig.tight_layout()
plt.show()


## Outliers

In [None]:
import numpy as np

# Calculate the first and third quartiles (numeric columns only)
Q1 = kidney_disease.quantile(0.25, numeric_only=True)
Q3 = kidney_disease.quantile(0.75, numeric_only=True)

# Calculate the IQR
IQR = Q3 - Q1

# Find the outliers: values more than 1.5 * IQR outside the quartiles
lower_bound = Q1 - 1.5*IQR
upper_bound = Q3 + 1.5*IQR

# Compare only the columns the bounds were computed for. Comparing the whole
# frame (object columns included) against bounds indexed by the numeric columns
# raises "ValueError: Operands are not aligned" on pandas 2.x, because the
# column labels and the bounds' index do not match.
numeric_data = kidney_disease[Q1.index]
outliers = ((numeric_data < lower_bound) | (numeric_data > upper_bound)).sum(axis=0)

# Check if there are any outliers
if outliers.sum() == 0:
    print("No columns with outliers.")
else:
    # Calculate the percentage of rows that hold an outlier, for each column
    n_rows = kidney_disease.shape[0]
    outliers_percent = outliers/n_rows*100

    # Print the columns with outliers and their impact
    print("Columns with outliers:")
    outliers_df = outliers_percent.to_frame("Outliers (%)")
    print(outliers_df[outliers_df["Outliers (%)"] > 0])

### Outliers visualization

In [None]:
import matplotlib.pyplot as plt

# Box plots of the frame (wide form) drawn on a single shared axis
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=kidney_disease, ax=ax)

# Vertical tick labels so the column names do not overlap
ax.tick_params(axis='x', labelrotation=90)
ax.set_title("Box Plots of All Columns in the Dataset")
plt.show()