In [None]:
import pandas as pd

# Read the student behaviour records from the CSV file into a DataFrame.
DATA_PATH = "student_behavior_dataset_v1.csv"
df = pd.read_csv(DATA_PATH)

# Print the full list of column names so later cells can be checked against it.
column_names = df.columns.tolist()
print("Columns in dataset:", column_names)

# Show the first rows as the cell's rich output.
df.head()


Columns in dataset: ['avg_logins_per_week', 'avg_session_minutes', 'active_days_per_week', 'resource_views', 'assignment_late_ratio', 'mean_submission_delay_hours', 'last_minute_ratio', 'missing_assignments', 'pre_deadline_activity_ratio', 'deadline_cramming_score', 'forum_posts', 'messages_sent', 'help_seeking_score', 'night_study_ratio', 'weekend_activity_ratio', 'engagement_trend', 'course_load', 'course_difficulty', 'pass_fail', 'performance_level', 'risk_level']


Unnamed: 0,avg_logins_per_week,avg_session_minutes,active_days_per_week,resource_views,assignment_late_ratio,mean_submission_delay_hours,last_minute_ratio,missing_assignments,pre_deadline_activity_ratio,deadline_cramming_score,...,messages_sent,help_seeking_score,night_study_ratio,weekend_activity_ratio,engagement_trend,course_load,course_difficulty,pass_fail,performance_level,risk_level
0,5.993428,32.926035,4.971533,40,0.248558,17.792128,0.518414,0,0.927201,0.331314,...,0,0.6,0.653481,0.396384,-0.300319,4,3,1,1,0
1,3.796587,62.784173,3.979754,14,0.543467,-11.28186,0.320434,0,0.81806,0.409669,...,0,1.2,0.169841,0.459394,-0.103583,4,1,0,0,1
2,1.983693,51.494705,3.733402,20,0.553705,-8.378299,0.262931,1,0.832269,0.410298,...,1,1.6,0.299795,0.579098,0.048539,5,1,0,0,1
3,3.595894,30.085068,3.411838,10,0.28352,-12.153707,0.204069,1,0.724351,0.0,...,1,1.0,0.309591,0.549996,0.701397,3,3,0,0,1
4,6.173714,67.856834,2.514196,19,0.329424,-13.506634,0.268236,1,0.703302,0.666837,...,0,1.8,0.074641,0.423248,0.113144,5,2,0,0,1


In [None]:
from sklearn.preprocessing import StandardScaler

# Behavioural features requested as inputs to the risk clustering.
requested_features = [
    "avg_logins_per_week",
    "avg_session_minutes",
    "late_submissions",
    "missed_deadlines",
    "study_time_hours"
]

# Keep only the requested features that actually exist in df, but say so
# explicitly: silently dropping names hides the fact that the model ends up
# being fitted on fewer features than the list above suggests.
risk_features = [col for col in requested_features if col in df.columns]
missing_features = [col for col in requested_features if col not in df.columns]
if missing_features:
    print("Warning: requested risk features not found in dataset (skipped):", missing_features)
if not risk_features:
    # Fail here with a readable message instead of letting the scaler choke
    # on an empty feature matrix.
    raise ValueError("None of the requested risk features exist in the dataset.")
print("Risk features used:", risk_features)

X_risk = df[risk_features]

# Standardise each feature (zero mean, unit variance) so that no single
# feature dominates the distance computations used by the clustering step.
scaler = StandardScaler()
X_risk_scaled = scaler.fit_transform(X_risk)


In [None]:
from sklearn.cluster import KMeans

# Cluster students into 3 behavioural groups.
# n_init is set explicitly because its default differs between scikit-learn
# versions; pinning it keeps the result reproducible across environments.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
raw_labels = pd.Series(kmeans.fit_predict(X_risk_scaled), index=df.index)

# KMeans label ids are arbitrary: nothing guarantees that id 2 is the riskiest
# group. Re-number the clusters so that 0=Low, 1=Medium, 2=High, ordered by the
# mean of the dataset's own `risk_level` column within each cluster.
# NOTE(review): assumes a larger `risk_level` value means higher risk - confirm
# against the dataset documentation.
# After this step `kmeans.labels_` / `kmeans.cluster_centers_` still use the raw
# ids, not the re-numbered ones stored in df["risk_cluster"].
if "risk_level" in df.columns:
    mean_risk_by_cluster = (
        df["risk_level"].groupby(raw_labels).mean().sort_values(kind="mergesort")
    )
    relabel = {raw_id: rank for rank, raw_id in enumerate(mean_risk_by_cluster.index)}
    df["risk_cluster"] = raw_labels.map(relabel)
else:
    print("Warning: 'risk_level' column not found; cluster ids are arbitrary and NOT ordered by risk.")
    df["risk_cluster"] = raw_labels

# Check cluster distribution
print(df["risk_cluster"].value_counts())


risk_cluster
2    126
1     93
0     81
Name: count, dtype: int64


In [None]:
def assign_recommendation(row):
    """Return the recommendation text for one student record.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) that provides a
            "risk_cluster" entry.

    Returns:
        The "High Risk" message when the cluster equals 2, the "Medium Risk"
        message when it equals 1, and the "Low Risk" message for any other
        value.
    """
    cluster = row["risk_cluster"]
    # Checked in the same order as the cluster ids are prioritised: 2, then 1.
    for cluster_id, message in (
        (2, "High Risk: Focus on improving engagement and submissions"),
        (1, "Medium Risk: Maintain consistency in study habits"),
    ):
        if cluster == cluster_id:
            return message
    # Every remaining value falls through to the low-risk message.
    return "Low Risk: Keep up the good performance"

# Compute the recommendation text row by row, then store it as a new column.
recommendation_by_student = df.apply(assign_recommendation, axis=1)
df["recommendation"] = recommendation_by_student


In [None]:
import plotly.express as px

# Simple scatter plot: logins vs session minutes colored by risk.
# Plotly Express maps a numeric color column to a continuous color scale, but
# cluster ids are categories. Plot from a copy whose cluster ids are strings so
# each cluster gets its own discrete color and legend entry; df is not modified.
plot_df = df.assign(risk_cluster=df["risk_cluster"].astype(str))

fig = px.scatter(
    plot_df,
    x="avg_logins_per_week",
    y="avg_session_minutes",
    color="risk_cluster",
    # Fix the legend/color order so it does not depend on row order in the data.
    category_orders={"risk_cluster": sorted(plot_df["risk_cluster"].unique())},
    hover_data=["recommendation"],
    title="Student Behavior Dashboard",
    labels={
        "avg_logins_per_week":"Avg Logins per Week",
        "avg_session_minutes":"Avg Session Minutes",
        "risk_cluster":"Risk Cluster"
    }
)
fig.show()
