In [1]:
import pandas as pd

# =============================================================================
# 1. EXTRACT
# Load the raw data from the CSV file.
# =============================================================================
# Read the raw engagement export from the current working directory.
# Only the read itself sits inside the `try`, so an unrelated failure cannot be
# mistaken for a missing file.
try:
    df = pd.read_csv('online_course_engagement_data.csv')
except FileNotFoundError:
    print("Error: 'online_course_engagement_data.csv' not found.")
    # Stop with a non-zero status so callers (shell, scheduler) can see the
    # failure. `exit()` is a helper injected by the `site` module for
    # interactive use and is not guaranteed to exist; SystemExit is a builtin.
    raise SystemExit(1) from None
else:
    print("Extract: Raw data loaded successfully.")

# =============================================================================
# 2. TRANSFORM
# Clean, enrich, and structure the data for analysis.
# =============================================================================

# --- 2.1. Data Cleaning ---
# Discard rows that exactly repeat an earlier row (the first occurrence is
# kept) and report how many rows were removed.
initial_rows = df.shape[0]
df.drop_duplicates(keep='first', inplace=True)
final_rows = df.shape[0]
print(f"Transform: Removed {initial_rows - final_rows} duplicate rows.")

# --- 2.2. Feature Engineering ---
# Derive the composite engagement score, its Low/Medium/High bucket, and
# human-readable label columns.


def _min_max_normalize(series):
    """Rescale *series* linearly so its minimum maps to 0 and its maximum to 1.

    A column whose values are all identical has a zero range; dividing by it
    would turn every row into NaN, so such a column is mapped to 0.0 instead.
    Missing values stay NaN in both cases.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Multiplying (rather than building a fresh Series of zeros) keeps the
        # original index and leaves NaN entries as NaN.
        return (series - lowest) * 0.0
    return (series - lowest) / value_range


# 'EngagementScore' is the unweighted mean of the three normalized metrics.
# The normalized values are held in local Series, so no temporary columns have
# to be added to `df` and dropped again afterwards.
time_spent_norm = _min_max_normalize(df['TimeSpentOnCourse'])
videos_norm = _min_max_normalize(df['NumberOfVideosWatched'])
quizzes_norm = _min_max_normalize(df['NumberOfQuizzesTaken'])
df['EngagementScore'] = (time_spent_norm + videos_norm + quizzes_norm) / 3

# Bucket the score into three quantile-based bins labelled Low / Medium / High.
df['EngagementLevel'] = pd.qcut(df['EngagementScore'], q=3, labels=['Low', 'Medium', 'High'])

# Readable labels for the coded columns; codes missing from a mapping
# become NaN.
df['DeviceTypeDesc'] = df['DeviceType'].map({0: 'Desktop', 1: 'Mobile'})
df['CourseCompletionDesc'] = df['CourseCompletion'].map({0: 'Not Completed', 1: 'Completed'})

print("Transform: Feature engineering complete.")

# --- 2.3. Structure for BI (Star Schema) ---
# Build one dimension table per descriptive attribute. Each table holds the
# distinct values of its natural key plus a surrogate key numbered from 1 in
# order of first appearance in `df`.

# DIM_USER: one row per distinct UserID.
dim_user = (
    df[['UserID']]
    .drop_duplicates()
    .assign(UserSK=lambda users: range(1, len(users) + 1))
)

# DIM_COURSE: one row per distinct course category, exposed as 'CategoryName'.
dim_course = (
    df[['CourseCategory']]
    .drop_duplicates()
    .assign(CourseSK=lambda courses: range(1, len(courses) + 1))
    .rename(columns={'CourseCategory': 'CategoryName'})
)

# DIM_DEVICE: one row per distinct device code, with a readable description.
dim_device = (
    df[['DeviceType']]
    .drop_duplicates()
    .assign(
        DeviceSK=lambda devices: range(1, len(devices) + 1),
        DeviceTypeDesc=lambda devices: devices['DeviceType'].map({0: 'Desktop', 1: 'Mobile'}),
    )
)

# FACT_ENGAGEMENT: the central metrics table.
# Join each dimension onto the row-level data so that every row picks up the
# surrogate key matching its natural key, then keep only keys and measures.
# NOTE(review): 'EngagementLevel' and 'CourseCompletionDesc' are computed on
# `df` but are not among the columns kept here - confirm that is intended.
fact_columns = [
    'UserSK', 'CourseSK', 'DeviceSK',
    'TimeSpentOnCourse',
    'NumberOfVideosWatched',
    'NumberOfQuizzesTaken',
    'QuizScores',
    'CompletionRate',
    'EngagementScore',
    'CourseCompletion',
]

keyed_rows = (
    df.merge(dim_user, on='UserID')
    .merge(dim_course, left_on='CourseCategory', right_on='CategoryName')
    .merge(dim_device, on='DeviceType')
)
fact_engagement = keyed_rows[fact_columns]

print("Transform: Star Schema created (Fact and Dimension tables).")

# =============================================================================
# 3. LOAD
# Save the transformed data into new, clean CSV files.
# =============================================================================

# Map each destination file name to the table written into it.
output_files = dict(
    [
        ('fact_engagement.csv', fact_engagement),
        ('dim_user.csv', dim_user),
        ('dim_course.csv', dim_course),
        ('dim_device.csv', dim_device),
    ]
)

# Write every table without its DataFrame index, so the CSV holds only the
# data columns.
for csv_name, table in output_files.items():
    table.to_csv(csv_name, index=False)

print("Load: Transformed data saved to CSV files.")
print("\nETL Process Complete. Ready for BI connection.")

Extract: Raw data loaded successfully.
Transform: Removed 877 duplicate rows.
Transform: Feature engineering complete.
Transform: Star Schema created (Fact and Dimension tables).
Load: Transformed data saved to CSV files.

ETL Process Complete. Ready for BI connection.
