In [None]:
# EMPLOYEE - TABLE
# Builds the cleaned employee table: one row per emp_id with the designation
# one-hot encoded as 0/1 integer columns, then writes it to ./prep/.

# Step 1: Load the data from the CSV file (assumed to be already extracted)
employee_data = pd.read_csv('./staging/Employee.csv')  # TODO: path was flagged for change by the original author

# Step 2 + 3: Keep only the relevant columns and one row per employee
# (drop_duplicates keeps the first occurrence of each emp_id).
cleaned_employee_data = employee_data[['emp_id', 'designation']].drop_duplicates(subset='emp_id')

# Step 4 + 5: One-hot encode designation directly as integers (0, 1).
# dtype=int replaces the separate astype(int) pass over columns[1:], which
# relied on emp_id being the first column of the encoded frame.
# drop_first=True omits the first designation level, which therefore acts
# as the all-zeros baseline.
cleaned_employee_data = pd.get_dummies(
    cleaned_employee_data,
    columns=['designation'],
    prefix='designation',
    drop_first=True,
    dtype=int,
)

# Step 6: Provide information about the cleaned table.
# info() prints its report itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None" line).
cleaned_employee_data.info()
print(cleaned_employee_data.head())  # Show the first few rows of the cleaned data

# Save the cleaned data to a new CSV file
cleaned_employee_data.to_csv('./prep/cleaned_employee_data.csv', index=False)

In [None]:
# COURSE - TABLE
# Loads the course extract and ordinal-encodes difficulty_level.

# Step 1: Load the data from the CSV file
courses_data = pd.read_csv('./staging/Course.csv')

# Step 2: Extract relevant columns.
# .copy() makes this an independent frame, so the column assignments that
# follow do not write into a slice of courses_data (SettingWithCopyWarning).
cleaned_courses_data = courses_data[['course_id', 'duration', 'difficulty_level']].copy()

# Step 3: Perform ordinal encoding for difficulty_level
difficulty_order = ['BASIC', 'BEGINNER', 'INTERMEDIATE', 'EXPERT']  # Define the order of difficulty levels

# Codes follow the position in difficulty_order (BASIC=0 ... EXPERT=3).
# Any value not in the list, including missing values, is encoded as -1.
cleaned_courses_data['difficulty_level'] = pd.Categorical(
    cleaned_courses_data['difficulty_level'],
    categories=difficulty_order,
    ordered=True,
).codes

# Step 4: Convert duration to weeks
def duration_to_weeks(duration):
    """Convert a free-text duration such as ``'6 months'`` to a number of weeks.

    Recognises weeks, months and years, singular or plural, in any letter
    case. Months count as 4 weeks and years as 52 weeks. Decimal amounts
    are rounded to the nearest whole week.

    Parameters
    ----------
    duration : object
        Usually a string like ``'3 weeks'``. Non-string values (for example
        the float NaN pandas uses for missing cells) are accepted.

    Returns
    -------
    int
        The duration in weeks, or 1 when the value is not a string or no
        ``<number> <unit>`` pair can be found in it.
    """
    import re  # local import so the cell does not depend on an imports cell outside this block

    weeks_per_unit = {'week': 1, 'month': 4, 'year': 52}

    # Missing values reach .apply() as float NaN; `in` / .split() would raise on them.
    if not isinstance(duration, str):
        return 1

    # First "<number> <unit>" pair anywhere in the text; the optional "s"
    # accepts both "1 month" and "6 months".
    found = re.search(
        r'(\d+(?:\.\d+)?)\s*(week|month|year)s?\b',
        duration,
        flags=re.IGNORECASE,
    )
    if found is None:
        return 1  # Handle any unexpected format

    amount, unit = found.groups()
    return round(float(amount) * weeks_per_unit[unit.lower()])

# Derive the numeric duration. assign() returns a new frame, so nothing is
# written into a slice of the originally loaded data.
cleaned_courses_data = cleaned_courses_data.assign(
    duration_in_weeks=cleaned_courses_data['duration'].apply(duration_to_weeks)
)

# Step 5: Z-score normalisation of the duration
mean_duration = cleaned_courses_data['duration_in_weeks'].mean()
std_duration = cleaned_courses_data['duration_in_weeks'].std()  # pandas default: sample std (ddof=1)

if pd.notna(std_duration) and std_duration > 0:
    standardized_duration = (cleaned_courses_data['duration_in_weeks'] - mean_duration) / std_duration
else:
    # Fewer than two rows, or every course has the same duration: the
    # z-score is undefined, so use 0.0 rather than emitting NaN/inf.
    standardized_duration = 0.0

cleaned_courses_data = cleaned_courses_data.assign(standardized_duration=standardized_duration)

# Step 6: Clean the DataFrame by dropping the original duration column
cleaned_courses_data = cleaned_courses_data.drop(columns=['duration'])

# Step 7: Provide information about the cleaned table.
# info() prints its report itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None" line).
cleaned_courses_data.info()
print(cleaned_courses_data.head())  # Show the first few rows of the cleaned data

# Save the cleaned data to a new CSV file
cleaned_courses_data.to_csv('./prep/cleaned_courses_data.csv', index=False)