# Exploratory Data Analysis (EDA)

This notebook performs exploratory data analysis (EDA) on the school dropout rates dataset. The goal is to understand the data better and visualize key insights.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style: apply seaborn's 'whitegrid' style to the plots
# drawn in the cells below
sns.set(style='whitegrid')

In [None]:
# Load the dataset from CSV into the DataFrame `df` used by the cells below.
# The path is relative, so it is resolved against the current working directory.
data_path = '../data/processed/dataset.csv'  # Update with the correct path
df = pd.read_csv(data_path)

# Display the first few rows of the dataset as a quick sanity check
df.head()

In [None]:
# Summary statistics for the columns that describe() covers by default
# (non-numeric columns are left out when numeric ones are present)
df.describe()

In [None]:
# Count the missing entries in every column, then keep only the columns
# that actually contain at least one missing value
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [None]:
# Histogram (30 bins) with a KDE overlay for a single column of df.
# NOTE: 'feature_name' is used literally as the column label; it must exist in df.
plt.figure(figsize=(10, 6))
sns.histplot(x=df['feature_name'], kde=True, bins=30)
# Title and axis labels are set on the current axes in one call
plt.gca().set(
    title='Distribution of Feature Name',
    xlabel='Feature Name',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Correlation heatmap
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them instead, so restrict the frame to numeric/boolean columns first.
# select_dtypes is used (rather than numeric_only=True) because it behaves the
# same on older pandas versions that lack that keyword.
plt.figure(figsize=(12, 8))
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
# annot=True writes each coefficient in its cell, formatted to two decimals
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()