In [41]:
import pandas as pd

In [42]:
# Read the four raw exports in a fixed order and bind each one to its own frame.
courses_df, engagement_df, feedback_df, users_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../raw/raw_courses.csv',
        '../raw/raw_engagements.csv',
        '../raw/raw_feedback.csv',
        '../raw/raw_users.csv',
    )
)

# Preview the courses table; call .head() on feedback_df, users_df or
# engagement_df instead to inspect one of the other frames.
courses_df.head()

Unnamed: 0,courseId,name,est_duration,postedby,stack,prerequisites
0,1,Likely sort address cost meet.,162 hours,1996-04-23,web development,Basic SQL
1,2,Couple capital.,110 hours,2013-01-06,gen AI,Basic SQL
2,3,Big interest stuff front.,25 hours,2003-02-09,data science,
3,4,Thus arrive fill.,124 hours,2007-10-31,gen AI,HTML
4,5,Five able.,139 hours,1973-03-28,web development,Intermediate Python


In [43]:
# Function to clean the courses dataset
def clean_courses(df):
    """Return a cleaned copy of the raw courses table.

    Steps, in order:
      1. Keep only the first row for each ``courseId``.
      2. Replace missing values with per-column placeholders.
      3. Turn ``est_duration`` strings such as ``'162 hours'`` into numbers.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw courses data. Must contain ``courseId`` and ``est_duration``;
        the other columns named in the fill table are filled only when
        they are present.

    Returns
    -------
    pandas.DataFrame
        De-duplicated frame with placeholders filled in and a numeric
        ``est_duration`` column. Values that still cannot be parsed as a
        number once the unit suffix is removed become NaN.

    Raises
    ------
    KeyError
        If ``courseId`` or ``est_duration`` is missing from ``df``.
    """
    # Non-inplace calls return new frames, so the caller's raw data survives.
    cleaned = df.drop_duplicates(subset='courseId', keep='first')

    # Missing posting dates default to today's date. The placeholder is written
    # in the same 'YYYY-MM-DD' text form as the existing values unless the
    # column already holds datetimes, so the column keeps a single type.
    posted_default = pd.Timestamp.now().normalize()
    if 'postedby' in cleaned.columns:
        if not pd.api.types.is_datetime64_any_dtype(cleaned['postedby']):
            posted_default = posted_default.strftime('%Y-%m-%d')

    # Keys that do not match a column are ignored by fillna.
    # NOTE(review): the literal 'None' placeholder appears to be parsed back as
    # a missing value by pandas.read_csv defaults in recent versions -- confirm
    # before relying on it after a CSV round trip.
    cleaned = cleaned.fillna({
        'name': 'Unknown Course',
        'est_duration': '0 hours',
        'postedby': posted_default,
        'stack': 'General',
        'prerequisites': 'None',
        'blogContent': ''
    })

    # Normalise est_duration to text, strip the unit suffix (' hours' first,
    # then ' hour') and parse what is left as a number.
    duration_text = cleaned['est_duration'].astype(str)
    duration_text = (
        duration_text
        .str.replace(' hours', '', regex=False)
        .str.replace(' hour', '', regex=False)
    )
    cleaned['est_duration'] = pd.to_numeric(duration_text, errors='coerce')
    return cleaned

# Function to clean the feedback dataset
def clean_feedback(df):
    """Return a cleaned copy of the raw feedback table.

    Keeps the first row for every ``(userId, courseId)`` pair and replaces
    missing values with placeholders. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feedback data. Must contain ``userId`` and ``courseId``; the
        columns named in the fill table are filled only when present.

    Returns
    -------
    pandas.DataFrame
        New frame with duplicates removed and placeholders filled in.

    Raises
    ------
    KeyError
        If ``userId`` or ``courseId`` is missing from ``df``.
    """
    # drop_duplicates / fillna without inplace return new frames, so the
    # caller's raw data is left as loaded.
    deduped = df.drop_duplicates(subset=['userId', 'courseId'], keep='first')

    # A missing rating is stored as 0, so it is indistinguishable from a real
    # rating of 0 afterwards.
    return deduped.fillna({
        'rating': 0,
        'difficulty': 'Unknown',
        'comments': 'No Comments',
        'interactive': 'No'
    })

# Function to clean the users dataset
def clean_users(df):
    """Return a cleaned copy of the raw users table.

    Removes the ``password`` column when it exists, keeps the first row for
    each ``userId`` and replaces missing values with placeholders. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw users data. Must contain ``userId``; the columns named in the
        fill table are filled only when present.

    Returns
    -------
    pandas.DataFrame
        New frame without a ``password`` column, with duplicates removed and
        placeholders filled in.

    Raises
    ------
    KeyError
        If ``userId`` is missing from ``df``.
    """
    # The password column is not needed for analysis; errors='ignore' makes
    # the drop a no-op when the column is absent.
    cleaned = df.drop(columns=['password'], errors='ignore')
    cleaned = cleaned.drop_duplicates(subset='userId', keep='first')

    # No 'password' entry here: that column has just been removed, so a fill
    # value for it could never apply.
    return cleaned.fillna({
        'name': 'Unknown User',
        'email': 'unknown@example.com',
        'role': 'user',
        'department': 'General',
        'designation': 'Employee'
    })

# Function to clean the course engagements dataset
def clean_engagements(df):
    """Return a cleaned copy of the raw course-engagement table.

    Keeps the first row for every ``(userId, courseId)`` pair and sets missing
    ``timeSpent`` / ``score`` values to 0. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw engagement data. Must contain ``userId`` and ``courseId``;
        ``timeSpent`` and ``score`` are filled only when present.

    Returns
    -------
    pandas.DataFrame
        New frame with duplicates removed and missing values set to 0.

    Raises
    ------
    KeyError
        If ``userId`` or ``courseId`` is missing from ``df``.
    """
    # Non-inplace calls return new frames, so the caller's raw data is kept.
    deduped = df.drop_duplicates(subset=['userId', 'courseId'], keep='first')

    # Missing values become 0, so they cannot be told apart from a recorded 0
    # afterwards.
    return deduped.fillna({
        'timeSpent': 0,
        'score': 0
    })


In [44]:
# Run each cleaner on its matching raw frame, in the order
# courses -> feedback -> users -> engagements.
(
    cleaned_courses_df,
    cleaned_feedback_df,
    cleaned_users_df,
    cleaned_engagements_df,
) = (
    cleaner(raw_frame)
    for cleaner, raw_frame in (
        (clean_courses, courses_df),
        (clean_feedback, feedback_df),
        (clean_users, users_df),
        (clean_engagements, engagement_df),
    )
)

In [45]:
# Write every cleaned frame to the working directory, leaving out the
# DataFrame index.
for cleaned_frame, out_name in (
    (cleaned_courses_df, 'prep_courses.csv'),
    (cleaned_feedback_df, 'prep_feedback.csv'),
    (cleaned_users_df, 'prep_users.csv'),
    (cleaned_engagements_df, 'prep_engagements.csv'),
):
    cleaned_frame.to_csv(out_name, index=False)