In [4]:
import pandas as pd
import numpy as np
from datetime import datetime

# Run banner: a 70-character rule above and below the pipeline title,
# followed by the wall-clock time at which this run started.
banner_rule = "=" * 70

print(banner_rule)
print("OULAD ETL PIPELINE - ONLINE EDUCATION ANALYTICS")
print(banner_rule)

run_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Run time: {run_timestamp}")


OULAD ETL PIPELINE - ONLINE EDUCATION ANALYTICS
Run time: 2025-12-20 23:47:38


In [5]:
# EXTRACT stage: read every OULAD source table from the working directory.
print("\n[EXTRACT] Loading OULAD datasets...")

# The files are read in the order listed; each frame is bound to the
# module-level name in the matching position on the left-hand side.
students, courses, assessments, student_assessment, vle, student_registration = (
    pd.read_csv(csv_name)
    for csv_name in (
        "studentInfo.csv",
        "courses.csv",
        "assessments.csv",
        "studentAssessment.csv",
        "vle.csv",  # metadata only (safe)
        "studentRegistration.csv",
    )
)

print("✓ All required files loaded successfully")

# Report the row count of each loaded table.
for loaded_label, loaded_table in (
    ("Students", students),
    ("Courses", courses),
    ("Assessments", assessments),
    ("Student Assessments", student_assessment),
    ("VLE Resources", vle),
    ("Registrations", student_registration),
):
    print(f"   {loaded_label}: {len(loaded_table)}")



[EXTRACT] Loading OULAD datasets...
✓ All required files loaded successfully
   Students: 32593
   Courses: 22
   Assessments: 206
   Student Assessments: 173912
   VLE Resources: 6364
   Registrations: 32593


In [6]:
# TRANSFORM stage, step 1: basic data-quality pass over the raw tables.
print("\n[TRANSFORM] Data quality checks...")

# Remove fully duplicated rows, mutating each raw frame in place so that the
# later cells see the de-duplicated data under the same names.
for raw_table in (students, student_assessment, courses):
    raw_table.drop_duplicates(inplace=True)

print("✓ Duplicates removed")

print("\nMissing values check:")
# Count nulls per column of `students` once, then show only the columns
# that actually contain missing values.
null_counts = students.isnull().sum()
print(null_counts[null_counts > 0])




[TRANSFORM] Data quality checks...
✓ Duplicates removed

Missing values check:
imd_band    1111
dtype: int64


In [7]:
# Student dimension: one row per id_student with descriptive attributes and
# a sequential surrogate key.
print("\n[TRANSFORM] Creating dim_student...")

student_attribute_cols = [
    'id_student', 'gender', 'region', 'highest_education',
    'imd_band', 'age_band', 'num_of_prev_attempts', 'disability'
]

# Replacement values for missing attributes, keyed by column.
student_missing_defaults = {
    'gender': 'Unknown',
    'region': 'Unknown',
    'highest_education': 'Unknown',
    'imd_band': 'Unknown',
    'age_band': 'Unknown',
    'disability': 'N',
    'num_of_prev_attempts': 0
}

# `students` can hold several rows for the same id_student; only the first
# row encountered for each id is kept here.
dim_student = (
    students[student_attribute_cols]
    .drop_duplicates(subset=['id_student'])
    .fillna(student_missing_defaults)
)

# Surrogate key 1..N placed as the leading column.
student_count = len(dim_student)
dim_student.insert(0, 'StudentSK', range(1, student_count + 1))

print(f"✓ dim_student created: {student_count} rows")



[TRANSFORM] Creating dim_student...
✓ dim_student created: 28785 rows


In [8]:
# Course dimension: one row per row of `courses`, with a combined
# "<code_module>_<code_presentation>" business key and a sequential surrogate key.
print("\n[TRANSFORM] Creating dim_course...")

dim_course_columns = [
    'CourseSK', 'course_presentation_id',
    'code_module', 'code_presentation',
    'module_presentation_length'
]

# assign() returns a new frame, so `courses` itself is left untouched; the
# column list above fixes the final column order.
dim_course = courses.assign(
    course_presentation_id=courses['code_module'] + "_" + courses['code_presentation'],
    CourseSK=range(1, len(courses) + 1),
)[dim_course_columns]

print(f"✓ dim_course created: {len(dim_course)} rows")



[TRANSFORM] Creating dim_course...
✓ dim_course created: 22 rows


In [9]:
# Assessment dimension: every column of `assessments`, plus a combined
# "<code_module>_<code_presentation>_<id_assessment>" key and a sequential
# surrogate key.
print("\n[TRANSFORM] Creating dim_assessment...")

module_code = assessments['code_module']
presentation_code = assessments['code_presentation']
assessment_id_text = assessments['id_assessment'].astype(str)

# assign() works on a copy: assessment_type is replaced where it already sits
# and assessment_full_id is appended as the last column.
dim_assessment = assessments.assign(
    assessment_full_id=module_code + "_" + presentation_code + "_" + assessment_id_text,
    assessment_type=assessments['assessment_type'].fillna('Unknown'),
)

# Surrogate key 1..N placed as the leading column.
assessment_count = len(dim_assessment)
dim_assessment.insert(0, 'AssessmentSK', range(1, assessment_count + 1))

print(f"✓ dim_assessment created: {assessment_count} rows")



[TRANSFORM] Creating dim_assessment...
✓ dim_assessment created: 206 rows


In [10]:
# Assessment fact table: one row per submitted assessment, carrying the
# surrogate keys of the student, course and assessment dimensions plus the
# score, the assessment weight and a pass flag.
print("\n[TRANSFORM] Creating fact_student_assessment...")

# Minimum score that counts as a pass.
PASS_MARK = 40

# Resolve the module/presentation of each submission through the assessment
# it belongs to. Do NOT look it up in `students` by id_student alone:
# `students` holds one row per course registration, so that join multiplies
# every submission by the number of courses the student ever took and pairs
# it with courses the assessment does not belong to.
# validate='many_to_one' makes pandas raise if the lookup side ever contains
# duplicate keys, instead of silently fanning the fact rows out.
fact_assess = student_assessment.merge(
    dim_assessment[[
        'id_assessment', 'code_module', 'code_presentation',
        'assessment_full_id', 'AssessmentSK', 'weight'
    ]],
    on='id_assessment',
    how='left',
    validate='many_to_one'
)

# Student surrogate key.
fact_assess = fact_assess.merge(
    dim_student[['id_student', 'StudentSK']],
    on='id_student',
    how='left',
    validate='many_to_one'
)

# Course surrogate key, via the same combined key used to build dim_course.
fact_assess['course_presentation_id'] = (
    fact_assess['code_module'] + "_" + fact_assess['code_presentation']
)

fact_assess = fact_assess.merge(
    dim_course[['course_presentation_id', 'CourseSK']],
    on='course_presentation_id',
    how='left',
    validate='many_to_one'
)

# NOTE(review): missing or non-numeric scores are stored as 0 and therefore
# count as fails — confirm this is the intended treatment.
fact_assess['score'] = pd.to_numeric(fact_assess['score'], errors='coerce').fillna(0)
fact_assess['is_passed'] = (fact_assess['score'] >= PASS_MARK).astype(int)

# Rows whose keys (or weight) could not be resolved are dropped.
fact_student_assessment = fact_assess[[
    'StudentSK', 'CourseSK', 'AssessmentSK',
    'score', 'weight', 'is_passed'
]].dropna()

# A left join that misses any row turns the key column into float; once the
# unresolved rows are gone, store the surrogate keys as integers so they are
# exported as 1, 2, 3 rather than 1.0, 2.0, 3.0.
fact_student_assessment = fact_student_assessment.astype({
    'StudentSK': 'int64',
    'CourseSK': 'int64',
    'AssessmentSK': 'int64'
})

print(f"✓ fact_student_assessment created: {len(fact_student_assessment)} rows")



[TRANSFORM] Creating fact_student_assessment...
✓ fact_student_assessment created: 173912 rows


In [11]:
# Course fact table: one row per row of `students` (a student's record on one
# module presentation) with the dimension surrogate keys and the outcome.
print("\n[TRANSFORM] Creating fact_student_course...")

student_key_lookup = dim_student[['id_student', 'StudentSK']]
course_key_lookup = dim_course[['course_presentation_id', 'CourseSK']]

# Final results that count as completing the course.
completed_results = ['Pass', 'Distinction']

fact_course = (
    students
    .merge(student_key_lookup, on='id_student', how='left')
    # Same combined key that dim_course was built with.
    .assign(
        course_presentation_id=lambda reg: reg['code_module'] + "_" + reg['code_presentation']
    )
    .merge(course_key_lookup, on='course_presentation_id', how='left')
    .assign(
        is_completed=lambda reg: reg['final_result'].isin(completed_results).astype(int)
    )
)

fact_course_columns = [
    'StudentSK', 'CourseSK',
    'studied_credits', 'num_of_prev_attempts',
    'final_result', 'is_completed'
]

# Rows with a missing value in any of the selected columns are dropped.
fact_student_course = fact_course[fact_course_columns].dropna()

print(f"✓ fact_student_course created: {len(fact_student_course)} rows")



[TRANSFORM] Creating fact_student_course...
✓ fact_student_course created: 32593 rows


In [12]:
# LOAD stage: write each dimension and fact table to a CSV file in the
# working directory, without the pandas index column.
print("\n[LOAD] Saving cleaned datasets...")

export_plan = [
    (dim_student, "dim_student.csv"),
    (dim_course, "dim_course.csv"),
    (dim_assessment, "dim_assessment.csv"),
    (fact_student_assessment, "fact_student_assessment.csv"),
    (fact_student_course, "fact_student_course.csv"),
]

for export_table, export_path in export_plan:
    export_table.to_csv(export_path, index=False)

print("✓ All files saved successfully")



[LOAD] Saving cleaned datasets...
✓ All files saved successfully


In [14]:
# Closing summary: completion banner followed by the row count of every
# table produced by the pipeline.
closing_rule = "=" * 70

print("\n" + closing_rule)
print("ETL PIPELINE COMPLETED SUCCESSFULLY")
print(closing_rule)

for summary_label, summary_table in (
    ("Students", dim_student),
    ("Courses", dim_course),
    ("Assessments", dim_assessment),
    ("Assessment Facts", fact_student_assessment),
    ("Course Facts", fact_student_course),
):
    print(f"{summary_label}: {len(summary_table)}")

print("\n✅ READY FOR POWER BI ")



ETL PIPELINE COMPLETED SUCCESSFULLY
Students: 28785
Courses: 22
Assessments: 206
Assessment Facts: 173912
Course Facts: 32593

✅ READY FOR POWER BI 
