# Univariate, Bivariate, and Multivariate Analysis

This notebook presents a comprehensive exploratory analysis of the dataset, organized into three parts:
- Univariate Analysis: Analysis of individual variables
- Bivariate Analysis: Analysis of relationships between two variables
- Multivariate Analysis: Analysis of relationships among multiple variables


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
# NOTE: this silences every warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Plot defaults applied to all figures created below
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the dataset; the path is relative to the notebook's working directory
DATA_PATH = '../../data/Cardiovascular_Disease_Dataset.csv'
df = pd.read_csv(DATA_PATH)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# Last expression of the cell: shows the first rows as rich output
df.head()


## 1. Univariate Analysis

Univariate analysis involves examining each variable individually to understand its distribution, central tendency, dispersion, and other characteristics.


In [None]:
# Column groups used by the analyses in this notebook
numerical_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak']
categorical_cols = ['gender', 'chestpain', 'fastingbloodsugar', 'restingrelectro', 
                    'exerciseangia', 'slope', 'noofmajorvessels', 'target']

# Descriptive statistics for every numerical column present in the frame
section_rule = "=" * 80
print(section_rule)
print("UNIVARIATE ANALYSIS - NUMERICAL VARIABLES")
print(section_rule)

for col in numerical_cols:
    if col not in df.columns:
        continue  # names absent from the dataset are skipped without a message

    # Compute each quantity once and reuse it in the report below
    series = df[col]
    modes = series.mode()
    smallest, largest = series.min(), series.max()
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1

    print(f"\n{col.upper()}:")
    print(f"  Count: {series.count()}")
    print(f"  Mean: {series.mean():.2f}")
    print(f"  Median: {series.median():.2f}")
    print(f"  Mode: {modes[0] if not modes.empty else 'N/A'}")
    print(f"  Std Dev: {series.std():.2f}")
    print(f"  Variance: {series.var():.2f}")
    print(f"  Min: {smallest:.2f}")
    print(f"  Max: {largest:.2f}")
    print(f"  Range: {largest - smallest:.2f}")
    print(f"  Q1: {Q1:.2f}")
    print(f"  Q3: {Q3:.2f}")
    print(f"  IQR: {IQR:.2f}")
    print(f"  Skewness: {series.skew():.4f}")
    print(f"  Kurtosis: {series.kurtosis():.4f}")

    # IQR fences: values more than 1.5 * IQR outside the quartiles are outliers.
    # The percentage is relative to all rows of df, including rows where col is missing.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(series < lower_bound) | (series > upper_bound)]
    print(f"  Outliers (IQR method): {len(outliers)} ({len(outliers)/len(df)*100:.2f}%)")


In [None]:
# Visualizations for Univariate Analysis - Numerical Variables
# One row per numerical column: histogram, box plot and normal Q-Q plot.
# squeeze=False keeps `axes` two-dimensional even when numerical_cols holds a
# single name, so the per-row unpacking below works for any non-empty list.
fig, axes = plt.subplots(len(numerical_cols), 3,
                         figsize=(18, 4*len(numerical_cols)), squeeze=False)

for idx, col in enumerate(numerical_cols):
    hist_ax, box_ax, qq_ax = axes[idx]

    if col not in df.columns:
        # Hide the row instead of leaving three empty frames in the figure
        for unused_ax in (hist_ax, box_ax, qq_ax):
            unused_ax.set_visible(False)
        continue

    values = df[col].dropna()

    # Histogram with mean / median reference lines
    hist_ax.hist(values, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    hist_ax.set_title(f'Histogram: {col}', fontweight='bold')
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Frequency')
    hist_ax.axvline(values.mean(), color='red', linestyle='--', label='Mean')
    hist_ax.axvline(values.median(), color='green', linestyle='--', label='Median')
    hist_ax.legend()

    # Box plot. Vertical orientation is the default; the explicit `vert`
    # argument is omitted because recent Matplotlib releases deprecate it.
    box_ax.boxplot(values)
    box_ax.set_title(f'Box Plot: {col}', fontweight='bold')
    box_ax.set_ylabel(col)
    box_ax.grid(True, alpha=0.3)

    # Q-Q plot against the normal distribution
    stats.probplot(values, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'Q-Q Plot: {col}', fontweight='bold')
    qq_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Frequency tables for the categorical columns
section_rule = "=" * 80
print(section_rule)
print("UNIVARIATE ANALYSIS - CATEGORICAL VARIABLES")
print(section_rule)

total_rows = len(df)  # percentages below are relative to all rows of df

for col in categorical_cols:
    if col not in df.columns:
        continue

    column = df[col]
    value_counts = column.value_counts()
    modes = column.mode()

    print(f"\n{col.upper()}:")
    print(value_counts)

    print("\n  Percentage distribution:")
    for val, count in value_counts.items():
        print(f"    {val}: {count} ({count/total_rows*100:.2f}%)")

    print(f"  Mode: {modes[0] if not modes.empty else 'N/A'}")
    print(f"  Unique values: {column.nunique()}")


In [None]:
# Bar charts for a subset of the categorical columns
categorical_for_vis = ['gender', 'chestpain', 'fastingbloodsugar', 'exerciseangia', 'target']
n_cats = len(categorical_for_vis)

# Two rows; enough columns to hold every chart
fig, grid = plt.subplots(2, (n_cats + 1) // 2, figsize=(18, 10))
axes = grid.ravel()

for ax, col in zip(axes, categorical_for_vis):
    if col not in df.columns:
        continue

    value_counts = df[col].value_counts().sort_index()
    ax.bar(value_counts.index.astype(str), value_counts.values, color='coral', alpha=0.7)
    ax.set_title(f'Bar Chart: {col}', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.3, axis='y')

    # Write each count just above its bar (fixed offset of 5 in data units)
    for position, height in enumerate(value_counts.values):
        ax.text(position, height + 5, str(height), ha='center', fontweight='bold')

# Drop the grid cells that received no chart
for unused_ax in axes[n_cats:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


## 2. Bivariate Analysis

Bivariate analysis involves examining the relationship between two variables.


In [None]:
# Pairwise correlation between the numerical columns
section_rule = "=" * 80
print(section_rule)
print("BIVARIATE ANALYSIS - CORRELATION BETWEEN NUMERICAL VARIABLES")
print(section_rule)

correlation_matrix = df[numerical_cols].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Annotated heatmap of the matrix, colour scale centred on zero
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    ax=corr_ax,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
corr_ax.set_title('Correlation Matrix - Numerical Variables',
                  fontsize=16, fontweight='bold', pad=20)
corr_fig.tight_layout()
plt.show()


In [None]:
# Scatter plots for numerical variables vs target
# Only columns that exist in df get a panel, and the grid is sized from that
# count: three panels per row, as many rows as needed. With five columns this
# yields the same 2x3 layout and (18, 10) figure size as a fixed grid would.
plot_cols = [col for col in numerical_cols if col in df.columns]
n_grid_cols = 3
n_grid_rows = max(1, -(-len(plot_cols) // n_grid_cols))  # ceiling division

fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, plot_cols):
    # Points are coloured by the target value they are plotted against
    scatter = ax.scatter(df[col], df['target'], alpha=0.5, c=df['target'], cmap='viridis')
    ax.set_title(f'{col} vs Target', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Target')
    ax.grid(True, alpha=0.3)

# Remove every unused panel, not just the first one
for unused_ax in axes[len(plot_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# Numerical vs Categorical (Target) - Group comparisons
print("=" * 80)
print("BIVARIATE ANALYSIS - NUMERICAL VARIABLES VS TARGET")
print("=" * 80)

for col in numerical_cols:
    if col not in df.columns:
        continue

    print(f"\n{col.upper()} by Target:")
    grouped = df.groupby('target')[col].agg(['mean', 'median', 'std', 'min', 'max'])
    print(grouped)

    # Split the column by target value (0 vs 1), ignoring missing entries
    group_0 = df.loc[df['target'] == 0, col].dropna()
    group_1 = df.loc[df['target'] == 1, col].dropna()

    # A two-sample t-test is undefined without at least two values per group
    if len(group_0) < 2 or len(group_1) < 2:
        print("  t-test skipped: each target group needs at least 2 observations")
        continue

    # Welch's t-test (equal_var=False): does not assume the two groups share
    # the same variance, which is never checked in this notebook.
    statistic, p_value = stats.ttest_ind(group_0, group_1, equal_var=False)

    # .4g keeps very small p-values readable instead of rounding them to 0.0000
    print(f"  Welch t-test p-value: {p_value:.4g} ({'Significant' if p_value < 0.05 else 'Not significant'})")


In [None]:
# Categorical vs Categorical - Cross-tabulation and Chi-square tests
print("=" * 80)
print("BIVARIATE ANALYSIS - CATEGORICAL VARIABLES VS TARGET")
print("=" * 80)

categorical_vars = ['gender', 'chestpain', 'fastingbloodsugar', 'restingrelectro', 
                    'exerciseangia', 'slope', 'noofmajorvessels']

for var in categorical_vars:
    if var not in df.columns:
        continue

    print(f"\n{var.upper()} vs TARGET:")

    # Counts with row/column totals, for display only
    crosstab = pd.crosstab(df[var], df['target'], margins=True)
    print(crosstab)

    # Chi-square test of independence on the table without margins
    contingency_table = pd.crosstab(df[var], df['target'])
    chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
    print(f"  Chi-square statistic: {chi2:.4f}")
    # .4g keeps very small p-values readable instead of rounding them to 0.0000
    print(f"  p-value: {p_value:.4g}")
    # \u03b1 is the Greek letter alpha; written as an escape so the source
    # stays ASCII and cannot be mangled by a wrong file encoding.
    print(f"  Significant association: {'Yes' if p_value < 0.05 else 'No'} (\u03b1 = 0.05)")

    # Share of each target value within every level of the variable (rows sum to 100)
    print(f"\n  Percentage by Target:")
    crosstab_pct = pd.crosstab(df[var], df['target'], normalize='index') * 100
    print(crosstab_pct.round(2))


In [None]:
# Violin plots for numerical variables by target
# Only columns that exist in df get a panel, and the grid is sized from that
# count: three panels per row, as many rows as needed. With five columns this
# yields the same 2x3 layout and (18, 10) figure size as a fixed grid would.
plot_cols = [col for col in numerical_cols if col in df.columns]
n_grid_cols = 3
n_grid_rows = max(1, -(-len(plot_cols) // n_grid_cols))  # ceiling division

fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, plot_cols):
    # One violin per target value: position 0 for target == 0, 1 for target == 1
    data_to_plot = [df.loc[df['target'] == 0, col].dropna(),
                    df.loc[df['target'] == 1, col].dropna()]
    ax.violinplot(data_to_plot, positions=[0, 1], showmeans=True)
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['No Disease', 'Disease'])
    ax.set_title(f'{col} by Target', fontweight='bold')
    ax.set_ylabel(col)
    ax.grid(True, alpha=0.3, axis='y')

# Remove every unused panel, not just the first one
for unused_ax in axes[len(plot_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


## 3. Multivariate Analysis

Multivariate analysis involves examining relationships among multiple variables simultaneously.


In [None]:
# Pairwise scatter / KDE grid of the numerical columns, coloured by target
print("Creating pairplot (this may take a moment)...")

# Plot at most 500 rows; the fixed seed makes the sample repeatable
pairplot_cols = numerical_cols + ['target']
n_sample = min(500, len(df))
sample_df = df[pairplot_cols].sample(n_sample, random_state=42)

sns.pairplot(
    sample_df,
    hue='target',
    diag_kind='kde',
    palette='Set2',
    height=2.5,
)
plt.suptitle(
    'Pairplot of Numerical Variables by Target',
    y=1.02,
    fontsize=16,
    fontweight='bold',
)
plt.show()


In [None]:
# Principal Component Analysis (PCA)
print("=" * 80)
print("MULTIVARIATE ANALYSIS - PRINCIPAL COMPONENT ANALYSIS (PCA)")
print("=" * 80)

# Prepare data for PCA: keep complete rows only, then standardise each column
X = df[numerical_cols].dropna()
X_scaled = StandardScaler().fit_transform(X)

# Full decomposition (all components); its variance ratios drive the report
# and the axis labels of the 2-D projection further down
pca = PCA()
pca.fit(X_scaled)

# Explained variance per component and its running total
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

print("\nExplained Variance by Component:")
for i, (var, cum_var) in enumerate(zip(explained_variance, cumulative_variance)):
    print(f"  PC{i+1}: {var:.4f} ({var*100:.2f}%) - Cumulative: {cum_var:.4f} ({cum_var*100:.2f}%)")

# Scree plot (left) and cumulative explained variance (right)
component_numbers = range(1, len(explained_variance) + 1)
variance_fig, (scree_ax, cumulative_ax) = plt.subplots(1, 2, figsize=(12, 5))

scree_ax.plot(component_numbers, explained_variance, 'bo-')
scree_ax.set_title('Scree Plot', fontweight='bold')
scree_ax.set_xlabel('Principal Component')
scree_ax.set_ylabel('Explained Variance Ratio')
scree_ax.grid(True, alpha=0.3)

cumulative_ax.plot(component_numbers, cumulative_variance, 'ro-')
cumulative_ax.set_title('Cumulative Explained Variance', fontweight='bold')
cumulative_ax.set_xlabel('Principal Component')
cumulative_ax.set_ylabel('Cumulative Explained Variance Ratio')
cumulative_ax.axhline(y=0.95, color='g', linestyle='--', label='95% variance')
cumulative_ax.legend()
cumulative_ax.grid(True, alpha=0.3)

variance_fig.tight_layout()
plt.show()

# Transform data to first 2 principal components.
# svd_solver='full' pins the exact solver. Under the default 'auto' policy
# some scikit-learn versions pick the randomized solver for a tall matrix with
# few requested components; without a random_state that can make the projection
# differ slightly between runs and from the full fit above, whose variance
# ratios are used to label the axes of this plot.
pca_2d = PCA(n_components=2, svd_solver='full')
X_pca = pca_2d.fit_transform(X_scaled)

# Plot first two principal components, coloured by the target of each kept row
projection_fig, projection_ax = plt.subplots(figsize=(10, 8))
target_labels = df.loc[X.index, 'target']  # X.index = rows that survived dropna()
scatter = projection_ax.scatter(X_pca[:, 0], X_pca[:, 1], c=target_labels, cmap='viridis', alpha=0.6)
projection_fig.colorbar(scatter, ax=projection_ax, label='Target')
projection_ax.set_title('PCA: First Two Principal Components', fontsize=16, fontweight='bold')
projection_ax.set_xlabel(f'PC1 ({explained_variance[0]*100:.2f}% variance)')
projection_ax.set_ylabel(f'PC2 ({explained_variance[1]*100:.2f}% variance)')
projection_ax.grid(True, alpha=0.3)
plt.show()

print(f"\nFirst two principal components explain {cumulative_variance[1]*100:.2f}% of variance")


In [None]:
# Rank numerical columns by the magnitude of their correlation with target
section_rule = "=" * 80
print(section_rule)
print("FEATURE IMPORTANCE - CORRELATION WITH TARGET")
print(section_rule)

# Absolute correlation of each available numerical column with the target
feature_importance = {
    col: abs(df[col].corr(df['target']))
    for col in numerical_cols
    if col in df.columns
}

# Strongest correlation first; ties keep their original column order
ranked_features = sorted(feature_importance, key=feature_importance.get, reverse=True)
sorted_importance = [(name, feature_importance[name]) for name in ranked_features]

print("\nFeature Importance (absolute correlation with target):")
for feature, importance in sorted_importance:
    print(f"  {feature}: {importance:.4f}")

# Horizontal bar chart with the top-ranked feature at the top
features, importances = zip(*sorted_importance)
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(features, importances, color='steelblue')
importance_ax.set_xlabel('Absolute Correlation with Target')
importance_ax.set_title('Feature Importance Based on Correlation with Target',
                        fontsize=14, fontweight='bold')
importance_ax.invert_yaxis()
importance_ax.grid(True, alpha=0.3, axis='x')
importance_fig.tight_layout()
plt.show()


In [None]:
# Correlation heatmap of the numerical columns together with the target
correlation_with_target = df[numerical_cols + ['target']].corr()

target_corr_fig, target_corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_with_target,
    ax=target_corr_ax,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
target_corr_ax.set_title('Multivariate Correlation Matrix (Including Target)',
                         fontsize=16, fontweight='bold', pad=20)
target_corr_fig.tight_layout()
plt.show()


## 4. Key Findings

### Univariate Analysis:
- Distribution characteristics of each variable
- Identification of outliers
- Central tendency and dispersion measures
- Normality assessment

### Bivariate Analysis:
- Correlation between numerical variables
- Relationships between variables and target
- Statistical significance of associations
- Group differences

### Multivariate Analysis:
- Principal component analysis reveals data structure
- Feature importance ranking
- Multivariate relationships and interactions
- Dimensionality reduction insights
