In [1]:
pip install pandas numpy faker


Collecting faker
  Downloading faker-40.1.2-py3-none-any.whl.metadata (16 kB)
Downloading faker-40.1.2-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-40.1.2


In [4]:
import random
import pandas as pd
from faker import Faker
from datetime import datetime, timedelta
import numpy as np

# Seed every random source so repeated runs write identical CSVs.
# Faker keeps its own random generator: seeding `random` and NumPy alone does
# not make `fake.uuid4()` repeatable, so it is seeded explicitly as well.
SEED = 42
fake = Faker()
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# --------------------
# CONFIG
# --------------------
NUM_STUDENTS = 150
NUM_COURSES = 6
ACTIVITIES_PER_COURSE = 10
START_DATE = datetime(2025, 9, 1)  # all generated dates are offsets from this day

# --------------------
# STUDENTS
# --------------------
# One record per student. Only `level` is random (one draw per student, in id
# order); the remaining attributes are the same for the whole cohort.
students = [
    {
        "student_id": f"S{idx + 1:03}",
        "enrollment_year": 2022,
        "program": "Computer Science",
        "level": random.choice([3, 4]),
    }
    for idx in range(NUM_STUDENTS)
]
students_df = pd.DataFrame(students)

# --------------------
# COURSES
# --------------------
# Courses are numbered C01, C02, ... and all belong to the same semester.
courses = [
    {
        "course_id": f"C{idx + 1:02}",
        "course_name": f"Course {idx + 1}",
        "semester": "Fall 2025",
    }
    for idx in range(NUM_COURSES)
]
courses_df = pd.DataFrame(courses)

# --------------------
# ENROLLMENTS
# --------------------
# Every student is enrolled in a random subset of the courses. The subset size
# is capped at the number of available courses because `random.sample` raises
# ValueError when asked for more items than the population contains (which
# would happen as soon as NUM_COURSES is set below 4).
COURSES_PER_STUDENT = 4
course_ids = list(courses_df.course_id)
courses_per_student = min(COURSES_PER_STUDENT, len(course_ids))

enrollments = []
for s in students_df.student_id:
    for c in random.sample(course_ids, k=courses_per_student):
        enrollments.append({"student_id": s, "course_id": c})
enrollments_df = pd.DataFrame(enrollments)

# --------------------
# ACTIVITIES
# --------------------
# Each course gets ACTIVITIES_PER_COURSE activities of a random kind.
# Resources are ungraded: they carry neither a due date nor a maximum grade.
activities = []
for course_id in courses_df["course_id"]:
    for position in range(1, ACTIVITIES_PER_COURSE + 1):
        kind = random.choice(["quiz", "assignment", "resource"])
        if kind == "resource":
            due_date = None
            max_grade = None
        else:
            # The due date is drawn only for graded activities.
            due_date = START_DATE + timedelta(days=random.randint(10, 90))
            max_grade = 10
        activities.append({
            "activity_id": f"{course_id}_A{position}",
            "course_id": course_id,
            "type": kind,
            "title": f"{kind.capitalize()} {position}",
            "due_date": due_date,
            "max_grade": max_grade,
        })
activities_df = pd.DataFrame(activities)

# --------------------
# ACTIVITY LOGS
# --------------------
# For every (student, course) enrollment, pick an engagement level and emit
# that many log events, each attached to a random activity of the course.
events_per_level = {"high": 120, "medium": 70, "low": 30}

activity_logs = []
for enrollment in enrollments_df.itertuples(index=False):
    acts_in_course = activities_df[activities_df["course_id"] == enrollment.course_id]
    level = random.choice(["high", "medium", "low"])

    for _ in range(events_per_level[level]):
        picked = acts_in_course.sample(1).iloc[0]
        activity_logs.append({
            "student_id": enrollment.student_id,
            "course_id": enrollment.course_id,
            "activity_id": picked.activity_id,
            "event_type": random.choice(["view", "attempt", "submit"]),
            "time_spent_sec": random.randint(30, 900),
            "timestamp": START_DATE + timedelta(days=random.randint(1, 120)),
        })

activity_logs_df = pd.DataFrame(activity_logs)

# --------------------
# QUIZ ATTEMPTS
# --------------------
# Every log event that targets a quiz activity becomes one quiz attempt.
# The quiz ids are collected once into a set so each log row costs a hash
# lookup instead of a boolean-mask scan over `activities_df`.
quiz_activity_ids = set(
    activities_df.loc[activities_df["type"] == "quiz", "activity_id"]
)

quiz_attempts = []
for log in activity_logs_df.itertuples(index=False):
    if log.activity_id not in quiz_activity_ids:
        continue
    quiz_attempts.append({
        "attempt_id": fake.uuid4(),
        "student_id": log.student_id,
        "activity_id": log.activity_id,
        "score": random.uniform(3, 10),
        "max_score": 10,
        "time_spent_sec": log.time_spent_sec,
        "attempt_number": random.randint(1, 2),
        "submitted_at": log.timestamp,
    })
quiz_attempts_df = pd.DataFrame(quiz_attempts)

# --------------------
# ASSIGNMENT SUBMISSIONS
# --------------------
# For each assignment, every enrolled student submits with ~85% probability.
# The late flag is drawn first (~20% of submissions) and the submission date is
# then placed on the matching side of the due date, so `late_submission` and
# `submitted_at` can never contradict each other.
assignment_subs = []
assignment_acts = activities_df[activities_df["type"] == "assignment"]
for activity in assignment_acts.itertuples(index=False):
    enrolled_students = enrollments_df.loc[
        enrollments_df["course_id"] == activity.course_id, "student_id"
    ]
    for s in enrolled_students:
        if random.random() <= 0.15:
            continue  # this student never submitted

        late = random.random() < 0.2
        # Late: 1-3 days after the due date; on time: up to 2 days early or on the day.
        offset_days = random.randint(1, 3) if late else random.randint(-2, 0)
        assignment_subs.append({
            "submission_id": fake.uuid4(),
            "student_id": s,
            "activity_id": activity.activity_id,
            "score": random.uniform(4, 10),
            "max_score": 10,
            "submitted_at": activity.due_date + timedelta(days=offset_days),
            "late_submission": late,
        })

assignment_subs_df = pd.DataFrame(assignment_subs)

# --------------------
# COURSE RESULTS
# --------------------
# One final grade per enrollment, drawn uniformly from [4, 9]; a grade of 5 or
# more is a pass.
course_results = []
for enrollment in enrollments_df.itertuples(index=False):
    final_grade = random.uniform(4, 9)
    if final_grade >= 5:
        status = "PASS"
    else:
        status = "FAIL"
    course_results.append({
        "student_id": enrollment.student_id,
        "course_id": enrollment.course_id,
        "final_grade": final_grade,
        "status": status,
    })
course_results_df = pd.DataFrame(course_results)

# --------------------
# SAVE FILES
# --------------------
# Write each table to its own CSV in the working directory, without the index.
csv_outputs = {
    "students.csv": students_df,
    "courses.csv": courses_df,
    "enrollments.csv": enrollments_df,
    "activities.csv": activities_df,
    "activity_logs.csv": activity_logs_df,
    "quiz_attempts.csv": quiz_attempts_df,
    "assignment_submissions.csv": assignment_subs_df,
    "course_results.csv": course_results_df,
}
for filename, table in csv_outputs.items():
    table.to_csv(filename, index=False)

print("Synthetic Moodle-style dataset generated successfully.")


Synthetic Moodle-style dataset generated successfully.


In [5]:
import pandas as pd

# Reload the generated tables from disk. The date columns are parsed on load so
# they arrive as datetimes instead of plain strings.
logs = pd.read_csv("activity_logs.csv", parse_dates=["timestamp"])
quizzes = pd.read_csv("quiz_attempts.csv", parse_dates=["submitted_at"])
assignments = pd.read_csv("assignment_submissions.csv", parse_dates=["submitted_at"])
results = pd.read_csv("course_results.csv")


A. Inspect columns & dtypes. B. Check missing values. C. Basic distributions (key behavioral signals).



In [None]:
# A notebook cell only renders its *last* expression, so every intermediate
# result is displayed explicitly; otherwise the missing-value counts and most
# of the describe() tables would be computed and thrown away.
# `display` is the built-in that IPython/Jupyter provides in notebook cells.
tables = {
    "logs": logs,
    "quizzes": quizzes,
    "assignments": assignments,
    "results": results,
}

# A. Columns, dtypes and non-null counts (info() prints directly).
for name, table in tables.items():
    print(f"--- {name} ---")
    table.info()

# B. Missing values per column.
for name, table in tables.items():
    print(f"--- {name}: missing values ---")
    display(table.isna().sum())

# C. Basic distributions of the key behavioural signals.
display(
    logs["time_spent_sec"].describe(),
    quizzes["score"].describe(),
    assignments["score"].describe(),
)



Scaling Features

In [24]:
from sklearn.preprocessing import StandardScaler

# Select the engagement columns first, then standardise them. The selection
# has to come before the scaler call, which reads `clustering_features`.
# NOTE: `features` is built in the "Merge Everything" section further down, so
# on a fresh kernel this cell only works after that section has been run.
clustering_features = features[
    [
        "total_time_spent",
        "total_events",
        "active_days",
        "avg_time_per_event",
        "access_frequency"
    ]
]

scaler = StandardScaler()
X_cluster = scaler.fit_transform(clustering_features)




Engagement Features

In [6]:
# Per (student, course) engagement summary built from the activity logs.
# `active_days` counts distinct calendar days: timestamps are truncated to
# midnight first, so several events on the same day count once even when the
# log carries a time of day. This works whether `timestamp` was loaded as text
# or as datetimes.
event_day = pd.to_datetime(logs["timestamp"]).dt.normalize()

engagement = (
    logs.assign(_event_day=event_day)
    .groupby(["student_id", "course_id"])
    .agg(
        total_time_spent=("time_spent_sec", "sum"),
        total_events=("event_type", "count"),
        avg_time_per_event=("time_spent_sec", "mean"),
        active_days=("_event_day", "nunique"),
    )
    .reset_index()
)

# Average number of logged events per active day.
engagement["access_frequency"] = engagement["total_events"] / engagement["active_days"]


Quiz Features

In [7]:
# Per (student, course) quiz summary. The quiz table has no course column, so
# the course id is recovered from the activity id, which the generator builds
# as "<course_id>_A<n>". Splitting on the underscore keeps working when course
# ids are not exactly three characters long, unlike a fixed-width slice.
quiz_course_id = (
    quizzes["activity_id"].str.split("_", n=1).str[0].rename("course_id")
)

quiz_features = (
    quizzes.groupby(["student_id", quiz_course_id])
    .agg(
        avg_quiz_score=("score", "mean"),
        quiz_attempt_count=("attempt_id", "count"),
        quiz_score_std=("score", "std"),  # NaN when a pair has a single attempt
        avg_quiz_time=("time_spent_sec", "mean"),
    )
    .reset_index()
)


Assignment Features

In [8]:
# Per (student, course) assignment summary. The course id is recovered from the
# activity id ("<course_id>_A<n>") by splitting on the underscore, which keeps
# working when course ids are not exactly three characters long.
assignment_course_id = (
    assignments["activity_id"].str.split("_", n=1).str[0].rename("course_id")
)

assignment_features = (
    assignments.groupby(["student_id", assignment_course_id])
    .agg(
        avg_assignment_score=("score", "mean"),
        late_submission_ratio=("late_submission", "mean"),  # share of late submissions
        assignment_count=("submission_id", "count"),
    )
    .reset_index()
)


In [None]:
# Merge Everything & Fill missing values

In [11]:
# Start from the engagement table and left-join the other per-(student, course)
# tables onto it, so every pair that has log activity is kept. Values missing
# after the joins are replaced with 0.
join_keys = ["student_id", "course_id"]

features = engagement
for extra_table in (quiz_features, assignment_features, results):
    features = features.merge(extra_table, on=join_keys, how="left")

features = features.fillna(0)


Behaviour Flags

In [12]:
# Thresholds for the behaviour flags (seconds of logged time / quiz score).
# NOTE(review): the generator emits at least 30 events of 30-900 s per
# enrollment, so totals under 3000 s should be rare. Confirm these cut-offs
# against the real distribution.
LOW_ENGAGEMENT_MAX_SEC = 3000
HIGH_TIME_MIN_SEC = 10000
LOW_QUIZ_SCORE = 5

features["low_engagement_flag"] = (
    features["total_time_spent"] < LOW_ENGAGEMENT_MAX_SEC
).astype(int)

# Missing quiz data was filled with 0 during the merge, so a pair with no quiz
# attempts has avg_quiz_score == 0. Requiring at least one attempt keeps those
# pairs from being reported as "high time, low score".
features["high_time_low_score_flag"] = (
    (features["total_time_spent"] > HIGH_TIME_MIN_SEC)
    & (features["quiz_attempt_count"] > 0)
    & (features["avg_quiz_score"] < LOW_QUIZ_SCORE)
).astype(int)


In [13]:
# Persist the final per-(student, course) feature table, without the index.
features_csv = "student_course_features.csv"
features.to_csv(features_csv, index=False)
