**Project 02: Student Performance Analytics Dashboard**

***Installing necessary libraries***

In [None]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install scikit-learn
# %pip install seaborn

***Importing Required Libraries***

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

***Importing Student Data***

In [None]:
# Load the raw student records (path is relative to the working directory).
data = pd.read_csv("student_data1.csv")

# Normalise column names first (trim, lowercase, drop spaces) so the preview
# below shows the exact names that every later cell indexes by.
data.columns = [col.strip().lower().replace(" ", "") for col in data.columns]

# A header such as "Attendance (%)" normalises to "attendance(%)"; standardise
# it to "attendance". rename() ignores keys that are absent, so no membership
# check is required.
data = data.rename(columns={"attendance(%)": "attendance"})

# Fail fast with a clear message instead of a KeyError several cells later.
required_columns = ["marks", "attendance", "logins"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(
        f"student_data1.csv is missing required column(s): {missing_columns}"
    )

# Preview only the first rows rather than dumping the whole frame.
print(data.head())

***Calculation of average marks, attendance, and logins***

In [None]:
# Mean of each numeric indicator, computed column by column.
marks_avg, attendance_avg, logins_avg = (
    data[column].mean() for column in ("marks", "attendance", "logins")
)

# One print call, one line per metric, each rounded to two decimals.
print(
    f"Average Marks: {marks_avg:.2f}",
    f"Average Attendance: {attendance_avg:.2f}",
    f"Average Logins: {logins_avg:.2f}",
    sep="\n",
)

***Correlation Matrix***

In [None]:
# Pairwise correlation between the three numeric indicators.
numeric_view = data[["marks", "attendance", "logins"]]
corr = numeric_view.corr()

# Heading (preceded by a blank line) followed by the matrix itself.
print("\nCorrelation Matrix:", corr, sep="\n")

***Correlation Heatmap***

In [None]:
# Draw the correlation matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Between Features")
fig.tight_layout()
plt.show()

***Defining Risk based on performance and attendance***

In [None]:
# A student is flagged 1 when marks are below 40 or attendance is below 60;
# every other row is labelled 0.
low_marks = data["marks"] < 40
low_attendance = data["attendance"] < 60
data["risk"] = np.where(low_marks | low_attendance, 1, 0)

***Absentee impact (Barplot: risk vs attendance)***

In [None]:
# Attendance per risk group, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(
    x="risk",
    y="attendance",
    data=data,
    hue="risk",
    palette="Reds",
    legend=False,
    ax=ax,
)
ax.set_xlabel("Risk Level (0 = Safe, 1 = At Risk)")
ax.set_ylabel("Attendance")
ax.set_title("Absentee Impact on Student Risk")
fig.tight_layout()
plt.show()

***Top vs Struggling Students (Barplot: risk vs marks)***

In [None]:
# Marks per risk group, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(
    x="risk",
    y="marks",
    data=data,
    hue="risk",
    palette="Blues",
    legend=False,
    ax=ax,
)
ax.set_xlabel("Risk Level (0 = Safe, 1 = At Risk)")
ax.set_ylabel("Marks")
ax.set_title("Performance Comparison of Students")
fig.tight_layout()
plt.show()

***Preparing training and test sets for model***

In [None]:
# Feature matrix and binary target for the classifier.
# NOTE(review): "risk" is computed from marks and attendance, which are also
# features here, so the model can learn the labelling rule directly - confirm
# this is intended.
feature_columns = ["marks", "attendance", "logins"]
X = data[feature_columns]
y = data["risk"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

***Training Random Forest Classifier and Predicting At-Risk Students***

In [None]:
# Fit a random forest on the scaled training features; the fixed seed keeps
# the fitted model reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict the held-out rows and report how often the prediction matches the
# true label, so the prediction step produces a visible result.
y_pred = model.predict(X_test_scaled)
test_accuracy = np.mean(y_pred == y_test.to_numpy())
print(f"Test Accuracy: {test_accuracy:.2%}")

# Headline count: how many students the labelling rule flags, and what share
# of the cohort that is.
num_at_risk = data["risk"].sum()
total_students = len(data)
percent_at_risk = (num_at_risk / total_students) * 100
print(f"\nNumber of At-Risk Students: {num_at_risk} ({percent_at_risk:.1f}%)")

# Top 10 at-risk students, ordered by lowest marks and then lowest attendance.
at_risk_students = data[data["risk"] == 1].sort_values(by=["marks", "attendance"])
print("\nTop 10 At-Risk Students (Lowest Marks & Attendance):")
print(at_risk_students[["marks", "attendance", "logins"]].head(10))

***Distribution of marks and attendance for At-Risk vs Safe Students***

In [None]:
# Side-by-side boxplots: marks on the left, attendance on the right.
fig, (marks_ax, attendance_ax) = plt.subplots(1, 2, figsize=(10, 4))

sns.boxplot(
    x="risk", y="marks", data=data, hue="risk",
    palette="Set2", legend=False, ax=marks_ax,
)
marks_ax.set_title("Marks Distribution by Risk")
marks_ax.set_xlabel("Risk (0=Safe, 1=At Risk)")

sns.boxplot(
    x="risk", y="attendance", data=data, hue="risk",
    palette="Set1", legend=False, ax=attendance_ax,
)
attendance_ax.set_title("Attendance Distribution by Risk")
attendance_ax.set_xlabel("Risk (0=Safe, 1=At Risk)")

fig.tight_layout()
plt.show()

***Bar chart: Count of At-risk vs Safe Students***

In [None]:
# Number of students in each risk group, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x="risk", data=data, hue="risk", palette="pastel", legend=False, ax=ax)
ax.set_title("Count of Safe vs At-Risk Students")
ax.set_xlabel("Risk (0=Safe, 1=At Risk)")
ax.set_ylabel("Number of Students")
fig.tight_layout()
plt.show()

# Repeat the at-risk total (computed in an earlier cell) beneath the chart.
print(f"Number of students At-risk :{num_at_risk}")

***Engagement score: weighted sum of attendance and logins***

In [None]:
# Engagement blends attendance (weight 0.7) with logins expressed as a
# percentage of the highest login count in the data (weight 0.3).
max_logins = data["logins"].max()

# If nobody has logged in, the maximum is 0 and the rescaling would be 0/0,
# turning every score into NaN; use a zero login component in that case.
logins_pct = data["logins"] / max_logins * 100 if max_logins > 0 else 0.0

data["engagement_score"] = 0.7 * data["attendance"] + 0.3 * logins_pct
print("\nEngagement Scores:")
print(data[["marks", "attendance", "logins", "engagement_score"]].head())

***Distribution of Engagement score***

In [None]:
# Histogram (20 bins) of engagement scores with a KDE overlay.
fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(data["engagement_score"], bins=20, kde=True, color="purple", ax=ax)
ax.set_title("Distribution of Engagement Scores")
ax.set_xlabel("Engagement Score")
ax.set_ylabel("Number of Students")
fig.tight_layout()
plt.show()

***Actions for At-Risk students***

In [None]:
def suggest_intervention(row, marks_threshold=40, attendance_threshold=60, engagement_threshold=60):
    """Return the interventions recommended for one student record.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by "marks", "attendance" and "engagement_score"
        (for example a DataFrame row passed by ``apply(..., axis=1)``).
    marks_threshold : number, default 40
        Marks strictly below this value trigger "Academic coaching".
    attendance_threshold : number, default 60
        Attendance strictly below this value triggers "Attendance counseling".
    engagement_threshold : number, default 60
        Engagement score strictly below this value triggers "Mentorship program".

    Returns
    -------
    str
        The triggered interventions joined by ", " in the order listed above,
        or "No intervention needed" when none is triggered.
    """
    # (field, cut-off, intervention) in the order the suggestions are reported.
    checks = (
        ("marks", marks_threshold, "Academic coaching"),
        ("attendance", attendance_threshold, "Attendance counseling"),
        ("engagement_score", engagement_threshold, "Mentorship program"),
    )
    suggestions = [label for field, cutoff, label in checks if row[field] < cutoff]
    return ", ".join(suggestions) if suggestions else "No intervention needed"

# Work on a copy of the flagged rows so the new column does not touch `data`.
flagged = data["risk"] == 1
at_risk_students = data[flagged].copy()
at_risk_students["suggestion"] = at_risk_students.apply(suggest_intervention, axis=1)

# Ten flagged students with the lowest marks (ties broken by attendance).
report_columns = ["marks", "attendance", "logins", "engagement_score", "suggestion"]
report = at_risk_students[report_columns].sort_values(by=["marks", "attendance"])
print("\nIntervention Suggestions for Top 10 At-Risk Students:")
print(report.head(10))