### Importing required libraries ###

In [2]:
import pandas as pd
import faker as f
import random
from datetime import timedelta,datetime
from collections import defaultdict

###  Defining Constants 

In [3]:
# Number of student users to generate (they receive user_ids 1..NO_STUDENTS).
NO_STUDENTS=120
# Number of professor users to generate (their user_ids follow the students').
NO_PROFESSORS=12
# Number of distinct courses to create.
NO_COURSES=15
# Total number of assignments; each one is attached to a randomly chosen course.
NO_ASSIGNMENTS=30

In [4]:
# Seed both random sources so that Restart & Run All regenerates the same
# dataset. Dates requested relative to the current day (e.g. '-90d') still
# shift with the day the notebook is run.
SEED=42
random.seed(SEED)
f.Faker.seed(SEED)
fake=f.Faker()

### User Table

In [5]:
# Build the user table one row at a time: the first NO_STUDENTS ids are
# students, every id after that is a professor.
users=[]
for uid in range(1,NO_STUDENTS+NO_PROFESSORS+1):
    user_role='student' if uid<=NO_STUDENTS else 'professor'
    users.append({
        'user_id':uid,
        'name':fake.name(),
        'email':fake.email(),
        'role':user_role
    })

### Courses


### Creating the Courses Table

In [6]:
base_subjects=['Math', 'Physics', 'History', 'Biology', 'Chemistry', 'Philosophy', 'Economics', 'Art']

def generate_courses(count=None, subjects=None):
    """Return `count` distinct course names chosen uniformly at random.

    A name combines a subject with a level: numeric levels follow the subject
    ("Math 101"), descriptive levels precede it ("Intro to Math").

    Args:
        count: How many names to return. Defaults to NO_COURSES.
        subjects: Subjects to combine with the levels. Defaults to base_subjects.

    Returns:
        A list of `count` unique course-name strings (empty when count <= 0).

    Raises:
        ValueError: if `count` exceeds the number of distinct names that can be
            formed. Drawing until enough unique names turn up would never
            terminate in that case.
    """
    if count is None:
        count=NO_COURSES
    if subjects is None:
        subjects=base_subjects
    if count<=0:
        return []

    levels=['101', '201', '301', 'Advanced', 'Intro to', 'Foundations of']
    candidates=[
        f"{subject} {level}" if level.isdigit() else f"{level} {subject}"
        for subject in subjects
        for level in levels
    ]
    # Drop repeats (e.g. a duplicated subject) while keeping a stable order, so
    # the sample below depends only on the random seed and not on set ordering.
    candidates=list(dict.fromkeys(candidates))

    if count>len(candidates):
        raise ValueError(
            f"Cannot generate {count} unique course names; only {len(candidates)} combinations exist"
        )
    return random.sample(candidates,count)

# Generate the pool of unique course names once; the courses table indexes into this list.
fake_courses=generate_courses()

In [7]:
# Give every course a professor and a start date. A professor is eligible only
# while teaching fewer than two courses.
courses=[]
professor_load=defaultdict(int)
professor_pool=[u['user_id'] for u in users if u['role']=='professor']
professor_ids=random.sample(professor_pool,NO_PROFESSORS)
for idx in range(NO_COURSES):
    eligible=list(filter(lambda pid: professor_load[pid]<2,professor_ids))
    chosen_prof=random.choice(eligible)
    course_start=fake.date_between(start_date='-90d',end_date='-80d')
    course_row={
        'course_id':idx+1,
        'course_name':fake_courses[idx],
        'professor_id':chosen_prof,
        'start_date':course_start
    }
    courses.append(course_row)
    professor_load[chosen_prof]+=1



### Enrollment Table


In [8]:
# Enroll each student in one to three distinct courses; enrollment ids are
# assigned sequentially starting at 1.
course_ids=[c['course_id'] for c in courses]
student_pool=[u['user_id'] for u in users if u['role']=='student']
st_ids=random.sample(student_pool,NO_STUDENTS)
enrollments=[]
for st_id in st_ids:
    how_many=random.randint(1,3)
    for course_id in random.sample(course_ids,how_many):
        enrollments.append({
            'enrollment_id':len(enrollments)+1,
            'student_id':st_id,
            'course_id':course_id
        })


### Assignments Table

In [9]:
# Each assignment belongs to a random course and is due twelve weeks after that
# course starts, shifted by up to eight days in either direction.
start_by_course={}
for c in courses:
    # Keep the first start date seen for a course id.
    start_by_course.setdefault(c['course_id'],c['start_date'])

assignments=[]
for number in range(1,NO_ASSIGNMENTS+1):
    course=random.choice(course_ids)
    jitter=timedelta(days=random.randint(-8,8))
    due_date=start_by_course.get(course)+timedelta(weeks=12)+jitter
    assignments.append({
        'assignment_id':number,
        'course_id':course,
        'title':f"Assignment{number}",
        'due_date':due_date
    })
    


### Deadlines Table


In [22]:

# Derive the timeline for every assignment. Feedback and marks-submission dates
# are counted from the close of the submission window (seven days after the due
# date); marks approval is counted from the marks-submission date.
deadlines = []
for a in assignments:
    due = a['due_date']
    window_close = due + timedelta(days=7)

    # The random draws are kept in this order: feedback, marks submission,
    # approval, then the hand-in date.
    feedback_gap = timedelta(days=random.randint(5, 10))
    marks_gap = timedelta(days=random.randint(7, 12))
    approval_gap = timedelta(days=random.randint(2, 5))
    # NOTE(review): hand_in_date is sampled independently of this assignment's
    # due date and course - confirm that is intended.
    hand_in_date = fake.date_between(start_date='-90d', end_date='-80d')

    marks_submission_date = window_close + marks_gap
    deadlines.append({
        'assignment_id': a['assignment_id'],
        'hand_in_date': hand_in_date,
        'due_date': due,
        'reminder_date': due - timedelta(days=3),
        'feedback_date': window_close + feedback_gap,
        'marks_submission_date': marks_submission_date,
        'marks_approval_date': marks_submission_date + approval_gap
    })

### Submissions

In [20]:
# Simulate submissions: for each assignment, between 70% and 100% of the
# enrolled students submit at some point from a week before to a week after the
# due date. Each day late costs 5 points; more than 5 days late scores 0.
submissions = []
sub_id = 0

# course_id -> set of enrolled student ids
enrollment_lookup = {}
for e in enrollments:
    if e['course_id'] not in enrollment_lookup:
        enrollment_lookup[e['course_id']] = set()
    enrollment_lookup[e['course_id']].add(e['student_id'])

for a in assignments:
    due_date = a['due_date']
    enrolled_students = enrollment_lookup.get(a['course_id'], set())

    if enrolled_students:
        head_count = len(enrolled_students)
        num_submitters = random.randint(int(head_count * 0.7), head_count)
        submitters = random.sample(list(enrolled_students), num_submitters)

        # The submission window is the same for every student of this assignment.
        window_open = due_date - timedelta(days=7)
        window_close = due_date + timedelta(days=7)

        for user_id in submitters:
            submission_date = fake.date_between(start_date=window_open, end_date=window_close)
            raw_grade = random.randint(40, 100)

            days_late = max(0, (submission_date - due_date).days)
            final_grade = 0 if days_late > 5 else max(0, raw_grade - 5 * days_late)

            sub_id += 1
            submissions.append({
                'submission_id': sub_id,
                'user_id': user_id,
                'assignment_id': a['assignment_id'],
                'submission_date': submission_date,
                "receipt_date": submission_date + timedelta(days=random.randint(0, 1)),
                'grade': final_grade
            })

### Converting to DataFrames

In [24]:
# Turn each in-memory table into a DataFrame, in the same order as before.
(
    df_users,
    df_courses,
    df_assignments,
    df_deadlines,
    df_submissions,
    df_enrollments,
)=map(
    pd.DataFrame,
    (users,courses,assignments,deadlines,submissions,enrollments),
)

### Exporting to CSV

In [25]:
# Write every table to its own CSV file in the working directory, without the
# DataFrame index column.
for csv_name,frame in (
    ('users.csv',df_users),
    ('courses.csv',df_courses),
    ('assignments.csv',df_assignments),
    ('deadlines.csv',df_deadlines),
    ('submissions.csv',df_submissions),
    ('enrollments.csv',df_enrollments),
):
    frame.to_csv(csv_name,index=False)

In [39]:
import pandas as pd

# Load CSVs
submissions = pd.read_csv("submissions.csv")
users = pd.read_csv("users.csv")
assignments = pd.read_csv("assignments.csv")
courses = pd.read_csv("courses.csv")
deadlines = pd.read_csv("deadlines.csv")
enrollments = pd.read_csv("enrollments.csv")

# Merge steps
# Every lookup table is expected to hold one row per key, so each join is
# many-to-one. `validate` makes pandas raise MergeError instead of silently
# duplicating submission rows if a lookup table ever contains a repeated key.
df = submissions.merge(users, on='user_id', suffixes=('', '_student'), validate='many_to_one')
df = df.merge(assignments, on='assignment_id', validate='many_to_one')

# `due_date` already arrives with the assignments table. Drop the copy held in
# deadlines so the output has a single `due_date` column rather than the
# auto-suffixed `due_date_x` / `due_date_y` pair.
df = df.merge(
    deadlines.drop(columns='due_date', errors='ignore'),
    on='assignment_id',
    how='left',
    validate='many_to_one',
)
df = df.merge(courses, on='course_id', validate='many_to_one')

# Join with professor info (self join on users table); the professor's columns
# get the `_professor` suffix, the student's keep their plain names.
df = df.merge(
    users,
    left_on='professor_id',
    right_on='user_id',
    suffixes=('', '_professor'),
    validate='many_to_one',
)

# Attach the enrollment record linking the submitting student to the course.
df = df.merge(
    enrollments,
    left_on=['course_id', 'user_id'],
    right_on=['course_id', 'student_id'],
    how='left',
    validate='many_to_one',
)

# Save to a single CSV
df.to_csv("full_combined_data.csv", index=False)
