# In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the synthetic data is identical on every run.
np.random.seed(42)

# Number of synthetic students generated per class (non-dropout / dropout).
COHORT_SIZE = 500


def _pct_column(mean, std, lower, upper):
    """Draw COHORT_SIZE normal samples, clamp to [lower, upper], truncate to int."""
    draws = np.random.normal(mean, std, COHORT_SIZE)
    return np.clip(draws, lower, upper).astype(int)


def _flag_column(probability):
    """Draw COHORT_SIZE 0/1 flags, each equal to 1 with the given probability."""
    return np.random.binomial(1, probability, COHORT_SIZE)


# Students labelled dropout=0: engagement metrics are centred high.
# NOTE: columns are sampled in dict order; reordering them would change the
# random stream and therefore the generated values.
non_dropouts = pd.DataFrame({
    'student_id': [f"S{n:03d}_D0" for n in range(1, COHORT_SIZE + 1)],
    'lecture_watch_pct': _pct_column(85, 10, 70, 100),
    'checklist_pct': _pct_column(80, 12, 65, 100),
    'attended_live_class': _flag_column(0.78),
    'attended_group_discussion': _flag_column(0.72),
    'qa_participation_pct': _pct_column(75, 15, 50, 100),
    'dropout': 0
})

# Students labelled dropout=1: engagement metrics are centred low.
dropouts = pd.DataFrame({
    'student_id': [f"S{n:03d}_D1" for n in range(1, COHORT_SIZE + 1)],
    'lecture_watch_pct': _pct_column(50, 15, 20, 70),
    'checklist_pct': _pct_column(45, 12, 15, 65),
    'attended_live_class': _flag_column(0.42),
    'attended_group_discussion': _flag_column(0.38),
    'qa_participation_pct': _pct_column(35, 12, 5, 60),
    'dropout': 1
})

# Stack both cohorts, shuffle the rows reproducibly, and renumber the index.
stacked = pd.concat([non_dropouts, dropouts])
shuffled = stacked.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)

# Generate recommendations (same logic as before)
def generate_recommendations(row):
    """Build the advice text for one student record.

    Args:
        row: Any object indexable by column name (e.g. a pandas Series or a
            dict) providing 'lecture_watch_pct', 'checklist_pct',
            'attended_live_class', 'attended_group_discussion' and
            'qa_participation_pct'.

    Returns:
        The triggered advice messages joined with ';', or a fixed
        congratulation message when no rule is triggered.
    """
    # Each rule: (column to inspect, test that flags a problem, advice text).
    # Rules are evaluated in this order, which is also the output order.
    advice_rules = (
        ('lecture_watch_pct', lambda pct: pct < 70,
         "Increase your lecture video completion to at least 80% to strengthen understanding."),
        ('checklist_pct', lambda pct: pct < 70,
         "Make sure to complete all checklist items to stay on track."),
        ('attended_live_class', lambda flag: flag == 0,
         "Attend live classes to get real-time support and stay engaged."),
        ('attended_group_discussion', lambda flag: flag == 0,
         "Join group discussions to improve collaboration and communication skills."),
        ('qa_participation_pct', lambda pct: pct < 60,
         "Participate more in Q&A to clarify doubts and boost retention."),
    )

    triggered = [
        message
        for column, needs_advice, message in advice_rules
        if needs_advice(row[column])
    ]

    if not triggered:
        return "Good Job, You did Well in the Week. Keep the momentum"
    return ";".join(triggered)

# Compute the advice text for every student, one row at a time.
recommendations = df.apply(generate_recommendations, axis="columns")
df['recommended_activities'] = recommendations

# Write the finished table to disk, omitting the positional index column.
output_path = "balanced_student_dataset.csv"
df.to_csv(output_path, index=False)