In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Step 1: Load the data
data = pd.read_csv('../data/processed/merged_student_data.csv')

# Step 2: Data Overview
print("Data Overview:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print('---')
print('---')
# include='all' summarises non-numeric columns alongside the numeric ones.
print(data.describe(include='all'))

In [14]:
# Missing Values Analysis
# Count the null entries per column, then report only the columns that have any.
missing_values = data.isna().sum()
print("Missing Values:")
print(missing_values.loc[missing_values.gt(0)])

Missing Values:
Reason_for_Absence              16
Parents_Education_Level         12
Library_Access                  20
Teacher_Hours_Assigned          20
Study_Materials_Availability    20
Electricity_Access              20
dtype: int64


In [None]:
# Distribution of Age
# Create the output folder before saving: savefig does not create missing
# directories, so on a fresh run the write would otherwise fail.
os.makedirs('data/images', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Age'], bins=10, kde=True, ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
fig.savefig('data/images/age_distribution.png')
# Close the figure (rather than only clearing it) so it is released once saved.
plt.close(fig)

In [None]:
# Distribution of Monthly Income
# Create the output folder before saving: savefig does not create missing
# directories, so on a fresh run the write would otherwise fail.
os.makedirs('data/images', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Monthly_Income'], bins=20, kde=True, ax=ax)
ax.set_title('Monthly Income Distribution')
ax.set_xlabel('Monthly Income')
ax.set_ylabel('Frequency')
fig.savefig('data/images/monthly_income_distribution.png')
# Close the figure (rather than only clearing it) so it is released once saved.
plt.close(fig)

In [None]:
# Select only numerical columns
numerical_data = data.select_dtypes(include='number')

# Correlation Analysis
correlation_matrix = numerical_data.corr()

# Create the output folder before saving: savefig does not create missing
# directories, so on a fresh run the write would otherwise fail.
os.makedirs('data/images', exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
fig.savefig('data/images/correlation_matrix.png')
# Close the figure (rather than only clearing it) so it is released once saved.
plt.close(fig)

In [None]:
# Gender Distribution
# Create the output folder before saving: savefig does not create missing
# directories, so on a fresh run the write would otherwise fail.
os.makedirs('data/images', exist_ok=True)

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='Gender', ax=ax)
ax.set_title('Gender Distribution')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
fig.savefig('data/images/gender_distribution.png')
# Close the figure (rather than only clearing it) so it is released once saved.
plt.close(fig)

In [None]:
# Attendance Consistency Distribution
# Create the output folder before saving: savefig does not create missing
# directories, so on a fresh run the write would otherwise fail.
os.makedirs('data/images', exist_ok=True)

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='Attendance_Consistency', ax=ax)
ax.set_title('Attendance Consistency Distribution')
ax.set_xlabel('Attendance Consistency')
ax.set_ylabel('Count')
fig.savefig('data/images/attendance_consistency_distribution.png')
# Close the figure (rather than only clearing it) so it is released once saved.
plt.close(fig)

In [None]:
# Impact on Academics Distribution
# Create the output folder before saving: savefig does not create missing
# directories, so on a fresh run the write would otherwise fail.
os.makedirs('data/images', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Impact_on_Academics', ax=ax)
ax.set_title('Impact on Academics')
ax.set_xlabel('Impact Level')
ax.set_ylabel('Count')
fig.savefig('data/images/impact_on_academics_distribution.png')
# Close the figure (rather than only clearing it) so it is released once saved.
plt.close(fig)

In [None]:
# Make sure the figure output folder is present; exist_ok keeps re-runs from failing.
# NOTE(review): the CSV is read from '../data/...' while figures are written under
# 'data/images' relative to the working directory — confirm both roots are intended.
images_dir = 'data/images'
os.makedirs(images_dir, exist_ok=True)

# Closing status line for the notebook run
print("EDA completed. Visualizations saved in the 'data/images' folder.")