# Exploratory Data Analysis (EDA)

This notebook walks through common EDA steps using a small sample dataset of student study hours and grades.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sample dataset: 20 students, each with hours studied and the grade obtained.
# The column mapping is named first so the frame construction reads as one step.
student_records = {
    'student_id': range(1, 21),
    'hours_studied': [
        1, 2, 3, 2, 5, 6, 7, 8, 5, 4,
        3, 6, 7, 8, 2, 1, 9, 10, 4, 5,
    ],
    'grade': [
        55, 60, 65, 63, 72, 75, 78, 85, 70, 68,
        66, 80, 82, 88, 62, 58, 90, 92, 69, 74,
    ],
}
df = pd.DataFrame(student_records)

# Preview the first five rows as the cell output.
df.head()

In [None]:
## Save & reload the sample dataset
# Write the sample dataset to CSV, then read it back to confirm the file
# reproduces the in-memory frame.
csv_path = 'eda_students_sample.csv'
df.to_csv(csv_path, index=False)  # index=False: do not write the row index as a column
print('Saved', csv_path)

# Read the file back and fail loudly if values, column names or dtypes changed
# during the round trip; the preview below is then known to match `df`.
df_reloaded = pd.read_csv(csv_path)
pd.testing.assert_frame_equal(df_reloaded, df)
df_reloaded.head()

In [None]:
# Structure first: info() prints column names, dtypes and non-null counts.
df.info()

# Then summary statistics for the numeric columns, shown as the cell output.
summary_stats = df.describe()
summary_stats

## Missing values & duplicates

In [None]:
# A notebook cell renders only its final expression, so a bare expression on an
# earlier line is evaluated and thrown away. The duplicate-row count is
# therefore printed explicitly, and the per-column missing-value counts are left
# as the last expression so both results are visible.
duplicate_rows = df.duplicated().sum()
print('Duplicate rows:', duplicate_rows)

# One row per column with the number of null entries found in it.
df.isnull().sum().to_frame('missing_values')

## Univariate plots: histograms & boxplots

In [None]:
# One figure with two side-by-side panels, each drawn on its own Axes object
# rather than through pyplot's implicit "current axes".
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: distribution of study hours in 8 bins.
sns.histplot(df['hours_studied'], bins=8, ax=hist_ax)
hist_ax.set_title('Hours Studied')

# Right panel: spread of grades as a horizontal boxplot.
sns.boxplot(x=df['grade'], ax=box_ax)
box_ax.set_title('Grade Boxplot')

plt.show()

## Bivariate analysis: scatter (hours vs grade) and correlation

In [None]:
# Scatter of the raw points with a red fitted regression line on the same Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='hours_studied', y='grade', ax=ax)
# scatter=False: draw only the fitted line, the points are already plotted above.
sns.regplot(data=df, x='hours_studied', y='grade', scatter=False, color='red', ax=ax)
ax.set_title('Hours Studied vs Grade')
plt.show()

# Correlation
# Correlation matrix of the two columns, shown as the cell output.
study_vs_grade = df[['hours_studied', 'grade']]
study_vs_grade.corr()

## Multivariate quick check: pairplot and heatmap

In [None]:
# Restrict both plots to the two measured columns (student_id is only a label).
numeric_cols = df[['hours_studied', 'grade']]

# Pairwise scatter plots with per-column distributions on the diagonal.
sns.pairplot(numeric_cols)
plt.show()

# Correlation heatmap. 'coolwarm' is a diverging colormap, so the colour scale
# is pinned to the full correlation range [-1, 1]. Without vmin/vmax seaborn
# fits the scale to the data, and in a 2x2 matrix the off-diagonal positive
# correlation lands on the "cold" end and reads as a strong negative one.
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(numeric_cols.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation matrix')
plt.show()

## Next steps after EDA
* Form hypotheses (e.g., more study hours → higher grades)
* Prepare features for modeling or deeper analysis
* Segment students by performance groups for targeted interventions