# Data Preprocessing Exploration

This notebook provides an interactive exploration of the data preprocessing steps performed in the project.

## Table of Contents
1. [Loading the Dataset](#loading-the-dataset)
2. [Exploratory Data Analysis](#exploratory-data-analysis)
3. [Handling Missing Values](#handling-missing-values)
4. [Detecting and Handling Outliers](#detecting-and-handling-outliers)
5. [Feature Scaling](#feature-scaling)
6. [Data Splitting](#data-splitting)
7. [Visualization](#visualization)

## Setup

First, let's import the necessary libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.decomposition import PCA

# Plot appearance: whitegrid theme, notebook-sized fonts and an 8-colour viridis palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook", font_scale=1.2)
colors = sns.color_palette("viridis", 8)

# Pandas display: show every column, cap printed rows at 20, widen output to 1000 characters
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 20),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

## 1. Loading the Dataset <a id="loading-the-dataset"></a>

Let's load the dataset and take a look at its structure.

In [None]:
# Load the raw dataset from its relative path into the working DataFrame `df`
df = pd.read_csv('../data/raw/CVD_cleaned.csv')

# Report the (rows, columns) shape and preview the first rows
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Check the dtype of every column; the later steps treat int64/float64 columns
# as numerical and object columns as categorical
df.dtypes

## 2. Exploratory Data Analysis <a id="exploratory-data-analysis"></a>

Let's explore the dataset to understand its characteristics.

In [None]:
# Summary statistics (count, mean, spread and quartiles) via DataFrame.describe()
df.describe()

In [None]:
# Missing-value report: per-column null count and its share of all rows (in percent)
missing_values = df.isna().sum()
missing_percent = missing_values / len(df) * 100
missing_data = pd.concat(
    {'Count': missing_values, 'Percent': missing_percent},
    axis=1,
)
# Only list the columns that actually contain nulls
missing_data.loc[missing_data['Count'] > 0]

In [None]:
# Visualize distributions of numerical features
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Derive the subplot grid from the number of features instead of assuming a
# fixed 3x3 layout. At least 3 rows are kept so the figure for up to 9
# features looks exactly as before; more features simply add rows.
hist_grid_cols = 3
hist_grid_rows = max(3, -(-len(numerical_cols) // hist_grid_cols))  # ceiling division

# Grow the figure height in proportion to the number of rows (10 for 3 rows)
plt.figure(figsize=(15, 10 * hist_grid_rows / 3))
for i, col in enumerate(numerical_cols):
    plt.subplot(hist_grid_rows, hist_grid_cols, i + 1)
    # Cycle through the palette so having more features than palette
    # entries cannot raise an IndexError
    sns.histplot(df[col], kde=True, color=colors[i % len(colors)])
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

In [None]:
# Visualize boxplots to identify outliers
# Derive the subplot grid from the number of features instead of assuming a
# fixed 3x3 layout. At least 3 rows are kept so the figure for up to 9
# features looks exactly as before; more features simply add rows.
box_grid_cols = 3
box_grid_rows = max(3, -(-len(numerical_cols) // box_grid_cols))  # ceiling division

# Grow the figure height in proportion to the number of rows (10 for 3 rows)
plt.figure(figsize=(15, 10 * box_grid_rows / 3))
for i, col in enumerate(numerical_cols):
    plt.subplot(box_grid_rows, box_grid_cols, i + 1)
    # Cycle through the palette so having more features than palette
    # entries cannot raise an IndexError
    sns.boxplot(y=df[col], color=colors[i % len(colors)])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the numerical features
correlation_matrix = df[numerical_cols].corr()
# Hide the upper triangle (diagonal included): it only mirrors the lower one
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Matrix', fontsize=18)
plt.tight_layout()
plt.show()

## 3. Handling Missing Values <a id="handling-missing-values"></a>

Let's handle any missing values in the dataset.

In [None]:
# Function to handle missing values
def handle_missing_values(df, strategy='median'):
    """Return a copy of ``df`` with missing values imputed.

    Columns of dtype ``int64``/``float64`` are imputed with a
    ``SimpleImputer`` configured with ``strategy``; columns of dtype
    ``object`` are imputed with their most frequent value. Columns of any
    other dtype are copied unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute.
    strategy : str, default 'median'
        Strategy passed to ``SimpleImputer`` for the numerical columns.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``df``.
    """
    imputed = df.copy()

    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    object_cols = df.select_dtypes(include=['object']).columns

    # Numerical columns: caller-selected strategy
    if not numeric_cols.empty:
        numeric_imputer = SimpleImputer(strategy=strategy)
        imputed[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

    # Categorical (object) columns: always the most frequent value
    if not object_cols.empty:
        object_imputer = SimpleImputer(strategy='most_frequent')
        imputed[object_cols] = object_imputer.fit_transform(df[object_cols])

    return imputed

# Apply missing value handling: median for numerical columns; object columns
# are filled with their most frequent value inside handle_missing_values
df_no_missing = handle_missing_values(df, strategy='median')

# Verify no missing values remain (total null count across all columns)
print(f"Missing values after imputation: {df_no_missing.isnull().sum().sum()}")

## 4. Detecting and Handling Outliers <a id="detecting-and-handling-outliers"></a>

Let's detect and handle outliers in the dataset.

In [None]:
# Function to handle outliers using IQR method
def handle_outliers_iqr(df):
    """Cap outliers in the numerical columns using the 1.5 * IQR rule.

    For every ``int64``/``float64`` column the bounds ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR`` are computed. Values below the lower bound are
    replaced by the lower bound and values above the upper bound by the
    upper bound; everything else is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    tuple
        ``(df_processed, outliers_summary)``: the capped copy of ``df`` and
        a dict mapping each numerical column name to the number of values
        that fell outside its bounds.
    """
    df_processed = df.copy()
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

    outliers_summary = {}
    for column in numerical_features:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        below = values < lower_bound
        above = values > upper_bound
        outliers_summary[column] = (below | above).sum()

        # Cap both tails in a single expression. Both masks come from the
        # untouched input column, so capping one tail can never undo the
        # capping of the other.
        df_processed[column] = np.where(
            below, lower_bound, np.where(above, upper_bound, values)
        )

    return df_processed, outliers_summary

# Apply outlier handling to the imputed frame; the second return value holds
# the per-column outlier counts
df_no_outliers, outliers_summary = handle_outliers_iqr(df_no_missing)

# Display outliers summary, columns with the most outliers first
pd.Series(outliers_summary).sort_values(ascending=False)

In [None]:
# Compare boxplots before and after outlier handling.
# Layout: one column per feature, "before" in the top row, "after" below it.
n_box_features = len(numerical_cols)

plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols):
    # Cycle through the palette so having more features than palette
    # entries cannot raise an IndexError
    shade = colors[i % len(colors)]

    plt.subplot(2, n_box_features, i + 1)
    sns.boxplot(y=df_no_missing[col], color=shade)
    plt.title(f'{col} (Before)')

    plt.subplot(2, n_box_features, i + 1 + n_box_features)
    sns.boxplot(y=df_no_outliers[col], color=shade)
    plt.title(f'{col} (After)')
plt.tight_layout()
plt.show()

## 5. Feature Scaling <a id="feature-scaling"></a>

Let's scale the numerical features so they share a comparable scale; with standardization each one ends up with zero mean and unit variance.

In [None]:
# Function to scale features
def scale_features(df, method='standard'):
    """Scale the numerical columns of ``df`` and return the fitted scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; only its ``int64``/``float64``
        columns are scaled, all other columns are copied as they are.
    method : str, default 'standard'
        ``'standard'`` uses ``StandardScaler``; ``'minmax'`` uses
        ``MinMaxScaler``.

    Returns
    -------
    tuple
        ``(df_processed, scaler)``: the scaled copy of ``df`` and the scaler
        that was fitted on its numerical columns.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'standard'`` nor ``'minmax'``.
    """
    scaled = df.copy()
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

    # Reject unknown methods before any scaler is built
    if method not in ('standard', 'minmax'):
        raise ValueError("Unsupported scaling method. Use 'standard' or 'minmax'.")
    scaler = StandardScaler() if method == 'standard' else MinMaxScaler()

    # Fit on the numerical columns and write the scaled values into the copy
    scaled[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return scaled, scaler

# Apply feature scaling to the outlier-capped frame with StandardScaler;
# the fitted scaler is kept alongside the scaled frame
df_scaled, scaler = scale_features(df_no_outliers, method='standard')

# Display the first rows of the scaled data
df_scaled.head()

In [None]:
# Compare distributions before and after scaling.
# Layout: one column per feature, unscaled in the top row, scaled below it.
n_hist_features = len(numerical_cols)

plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols):
    # Cycle through the palette so having more features than palette
    # entries cannot raise an IndexError
    shade = colors[i % len(colors)]

    plt.subplot(2, n_hist_features, i + 1)
    sns.histplot(df_no_outliers[col], kde=True, color=shade)
    plt.title(f'{col} (Before Scaling)')

    plt.subplot(2, n_hist_features, i + 1 + n_hist_features)
    sns.histplot(df_scaled[col], kde=True, color=shade)
    plt.title(f'{col} (After Scaling)')
plt.tight_layout()
plt.show()

## 6. Data Splitting <a id="data-splitting"></a>

Let's split the data into training and testing sets.

In [None]:
# Identify the target column for cardiovascular disease prediction
target_column = "Heart_Disease"
print(f"Target column: {target_column}")

# Features: scale_features only transforms int64/float64 columns, so any
# object (categorical) columns are still raw labels here. The PCA and
# random-forest steps need an all-numeric matrix, so one-hot encode them.
# Frames without categorical columns pass through unchanged.
X = pd.get_dummies(df_scaled.drop(target_column, axis=1), drop_first=True, dtype=float)

# Target: turn the labels into integer class codes (0, 1, ...) so they can
# be used both as classifier labels and as numeric colours in plots. This
# works whether the column holds strings or already-scaled numbers.
target_codes, target_classes = pd.factorize(df_scaled[target_column], sort=True)
y = pd.Series(target_codes, index=df_scaled.index, name=target_column)
print(f"Target classes (code = position in list): {list(target_classes)}")

# Split the data: 80% training, 20% testing, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

## 7. Visualization <a id="visualization"></a>

Let's visualize the processed data.

In [None]:
# PCA visualization: project the training features onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

# One coordinate array per component
pc1_coords, pc2_coords = X_pca.T

plt.figure(figsize=(10, 8))
# Colour every point by its target value
scatter = plt.scatter(
    pc1_coords,
    pc2_coords,
    c=y_train,
    cmap='viridis',
    alpha=0.8,
    edgecolors='w',
)
plt.colorbar(scatter, label='Target Variable')
plt.title('PCA: 2D Projection of the Dataset', fontsize=18)
# Axis labels report the share of variance explained by each component
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.tight_layout()
plt.show()

In [None]:
# Feature importance using Random Forest (100 trees, fixed seed), fitted on the training split
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Importances and the feature positions ordered from most to least important
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# One bar per feature, drawn in descending order of importance
bar_positions = range(X_train.shape[1])
plt.figure(figsize=(12, 8))
plt.bar(bar_positions, importances[indices], align='center', color=colors)
plt.xticks(bar_positions, X_train.columns[indices], rotation=90)
plt.title('Feature Importance from Random Forest', fontsize=18)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.tight_layout()
plt.show()

## Conclusion

In this notebook, we've performed a comprehensive data preprocessing pipeline:

1. Loaded and explored the dataset
2. Handled missing values using median imputation for numerical features and most-frequent imputation for categorical features
3. Detected and handled outliers using the IQR method
4. Scaled features using standardization
5. Split the data into training and testing sets
6. Visualized the processed data using PCA and feature importance

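The preprocessed data is now ready for model training and evaluation. Note that the imputation values, outlier bounds, and scaler in this notebook were computed on the full dataset before splitting; for an unbiased evaluation, fit these steps on the training set only and then apply them to the test set.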