In [1]:
import pandas as pd

In [2]:
# Read the two staged CSV extracts that every later cell builds on.
# Both locations are relative paths; adjust them here if the STAGING folder
# lives somewhere else.
users_csv_path = '../STAGING/staging_users.csv'
sessions_csv_path = '../STAGING/staging_training_sessions.csv'

users_df = pd.read_csv(users_csv_path)
training_sessions_df = pd.read_csv(sessions_csv_path)

In [3]:
# 1. Attach trainer details (username, email, designation) to every session row.
#
# The user key is renamed before the merge so that exactly the lookup key is
# dropped afterwards. Merging on the raw `_id` is ambiguous: if the sessions
# frame carries its own `_id`, pandas suffixes the user key to `_id_trainer`
# and dropping `_id` would remove the session's id instead of the lookup key.
#
# how='left' keeps sessions whose `trainer` id has no match in users_df; their
# trainer detail columns are left empty (NaN) instead of the whole session
# vanishing from everything computed downstream.
trainer_key = '_trainer_user_id'  # temporary join key, removed after the merge
trainer_lookup = users_df[['username', '_id', 'email', 'designation']].rename(
    columns={'_id': trainer_key}
)

training_sessions_with_trainer = training_sessions_df.merge(
    trainer_lookup,
    how='left',
    left_on='trainer',
    right_on=trainer_key,
    # Only applied to lookup columns whose names clash with session columns.
    suffixes=('', '_trainer'),
).drop(columns=[trainer_key])

In [4]:
# 2. Build the participants DataFrame from the trainer-enriched session rows.
participant_columns = [
    'training_code', 'participant_user',
    'participant_hackerRankScore', 'participant_assessmentScore',
    'participant_performance', 'participant_communication', 'participant_remarks',
]

# Validate BEFORE selecting: indexing with a list that contains an absent
# column already raises, so a check placed after the selection can never fire.
missing_columns = [
    column for column in participant_columns
    if column not in training_sessions_with_trainer.columns
]
if missing_columns:
    raise KeyError(
        "Columns {} not found in training sessions DataFrame.".format(missing_columns)
    )

participants_df = training_sessions_with_trainer[participant_columns]

# Enrich each participant row with that user's record from users_df.
# how='left' keeps participants whose id has no match in users_df, so their
# scores still count towards the session averages computed from this frame;
# an inner join would silently exclude them.
participants_merged_df = participants_df.merge(
    users_df,
    how='left',
    left_on='participant_user',
    right_on='_id',
    # Suffixes only apply if users_df shares a column name with participants_df.
    suffixes=('_participant', '_user'),
).drop(columns=['_id'])  # `_id` duplicates `participant_user` after the join

In [5]:
# 3. Aggregations and Calculations
# One summary row per training_code: the mean of each participant score column
# plus the number of distinct participant ids in that session.
session_agg_spec = {
    'avg_hackerRankScore': ('participant_hackerRankScore', 'mean'),
    'avg_assessmentScore': ('participant_assessmentScore', 'mean'),
    'avg_performance': ('participant_performance', 'mean'),
    'avg_communication': ('participant_communication', 'mean'),
    'total_participants': ('participant_user', 'nunique'),
}
per_session_groups = participants_merged_df.groupby('training_code')
session_avg_scores = per_session_groups.agg(**session_agg_spec).reset_index()

# Broadcast the per-session summary onto every session row; the left join
# keeps session rows that have no summary entry.
training_with_avg_scores = pd.merge(
    training_sessions_with_trainer,
    session_avg_scores,
    how='left',
    on='training_code',
)

In [6]:
# 4. Restructuring Data for Reporting
# Reporting by session: one row per (training_code, status, trainer,
# startDate, endDate) combination, with the distinct participant count and the
# session averages already attached to each row.
#
# dropna=False (pandas >= 1.1) keeps groups whose key contains a missing value.
# With the default dropna=True, any session row lacking one of the five keys
# (for example an empty endDate) is silently left out of the report.
session_report = training_with_avg_scores.groupby(
    ['training_code', 'status', 'trainer', 'startDate', 'endDate'],
    dropna=False,
).agg(
    total_participants=('participant_user', 'nunique'),
    # The avg_* columns hold the same per-session value on every row of a
    # session, so 'first' simply picks that value.
    avg_hackerRankScore=('avg_hackerRankScore', 'first'),
    avg_assessmentScore=('avg_assessmentScore', 'first'),
    avg_performance=('avg_performance', 'first'),
    avg_communication=('avg_communication', 'first')
).reset_index()

# Reporting by trainer: one row per trainer id with the number of distinct
# sessions and participants, and the mean of the per-session averages taken
# over that trainer's rows.
# NOTE(review): the mean runs over rows, not over sessions; if the frame holds
# one row per participant (as the participant_* columns suggest), sessions with
# more rows weigh more in the overall_avg_* values -- confirm this is intended.
trainer_groups = training_with_avg_scores.groupby(['trainer'])
trainer_report = trainer_groups.agg(
    total_sessions=pd.NamedAgg(column='training_code', aggfunc='nunique'),
    total_participants=pd.NamedAgg(column='participant_user', aggfunc='nunique'),
    overall_avg_hackerRankScore=pd.NamedAgg(column='avg_hackerRankScore', aggfunc='mean'),
    overall_avg_assessmentScore=pd.NamedAgg(column='avg_assessmentScore', aggfunc='mean'),
    overall_avg_performance=pd.NamedAgg(column='avg_performance', aggfunc='mean'),
    overall_avg_communication=pd.NamedAgg(column='avg_communication', aggfunc='mean'),
).reset_index()


In [7]:

# 5. Save Results to CSV
# Each report is written to the current working directory without its index.
for report_frame, csv_name in (
    (session_report, 'session_report.csv'),
    (trainer_report, 'trainer_report.csv'),
):
    report_frame.to_csv(csv_name, index=False)

print("Transformation complete. Reports saved as 'session_report.csv' and 'trainer_report.csv'.")


Transformation complete. Reports saved as 'session_report.csv' and 'trainer_report.csv'.
