In [None]:
'''MARKDOWN'''
# Student Performance Analytics – Exploratory Data Analysis (EDA)

'''This notebook performs exploratory data analysis on the student performance dataset.
The goal is to understand key features and their relationship with student outcomes
before applying machine learning models.'''


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" look to every plot in the notebook.
# set_theme() is the preferred spelling; sns.set() is only an alias for it.
sns.set_theme(style="whitegrid")


In [None]:
# Load processed dataset.
# The path must be a raw string: in a normal string literal the "\U" of
# "C:\Users" starts a \UXXXXXXXX unicode escape and the cell fails to parse
# with a SyntaxError before anything is read.
# NOTE(review): this is an absolute path on a single machine; consider a path
# relative to the project folder so the notebook runs elsewhere.
DATA_PATH = r"C:\Users\hendr\Desktop\student-performance-analytics\data\StudentsPerformance.csv"
df = pd.read_csv(DATA_PATH)

# Display first few rows
df.head()


In [None]:
# Report the dimensions of the frame as (rows, columns).
print("Shape of dataset:", (len(df.index), len(df.columns)))

# Print pandas' per-column summary of the frame.
df.info()



In [None]:
# Show pandas' descriptive-statistics table for the frame
# (by default describe() summarises the numeric columns).
df.describe()


In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Histogram of the final_score column (20 bins) with a KDE curve overlaid,
# drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df, x="final_score", bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Final Scores")
ax.set_xlabel("Final Score")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Scatter plot of study_hours (x) against final_score (y).
plt.figure(figsize=(6, 4))
scatter_ax = sns.scatterplot(data=df, x="study_hours", y="final_score")
scatter_ax.set(
    title="Study Hours vs Final Score",
    xlabel="Study Hours",
    ylabel="Final Score",
)
plt.show()


In [None]:
# Annotated heatmap of the pairwise correlations between columns.
# DataFrame.corr() raises on non-numeric (e.g. string) columns in pandas >= 2.0,
# so keep only numeric/boolean columns first; select_dtypes behaves the same
# across pandas versions.
numeric_df = df.select_dtypes(include=["number", "bool"])

plt.figure(figsize=(8,6))
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Create pass/fail column for analysis
PASS_MARK = 40  # minimum final_score that counts as a pass

# 1 = pass, 0 = fail. A NaN score compares False against the threshold and is
# therefore labelled 0 (fail).
df["pass_fail"] = np.where(df["final_score"] >= PASS_MARK, 1, 0)

# Bar chart of how many rows fall into each class; the figure is created
# explicitly, like the other plotting cells.
plt.figure(figsize=(6,4))
sns.countplot(x="pass_fail", data=df)
plt.title("Pass / Fail Distribution")
plt.xlabel("Pass (1) / Fail (0)")
plt.ylabel("Count")
plt.show()



In [None]:
## Observations
'''
- Final scores show a reasonable spread across students
- Study hours have a positive relationship with final scores
- Some features are moderately correlated with performance
- The dataset is suitable for regression and classification tasks

This analysis supports the use of supervised machine learning models
for predicting student performance.
'''