# DATA ENGINEERING 

In [348]:
%pip install psycopg2 pandas openpyxl python-dotenv logging

Note: you may need to restart the kernel to use updated packages.


# Configure Logger

In [349]:
import logging

# Configure the root logger: DEBUG and above go to 'elt_process.log',
# each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
    filename='elt_process.log',
)

# Root logger handle shared by the cells below
logger = logging.getLogger()


## RAW LAYER

### Read the tables from the source DB and store them in the raw layer

In [350]:
import psycopg2
import pandas as pd
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Database credentials are read from the environment (never hard-coded)
dbname = os.getenv('DB_NAME')
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')

# Bind both handles before the try block so the `finally` clause can test
# them safely even when psycopg2.connect() or connection.cursor() raises.
# Without this, a failed connection ends in a NameError (or, in a reused
# kernel, closes stale handles left over from a previous run).
connection = None
cursor = None

try:
    # Connect to the PostgreSQL database
    connection = psycopg2.connect(
        dbname=dbname,
        user=user,
        password=password,
        host=host,
        port=port
    )

    # Create a cursor object
    cursor = connection.cursor()

    # List every table in the 'public' schema
    cursor.execute("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'public';
    """)

    tables = cursor.fetchall()

    output_dir = './1.Raw'  # Raw-layer output directory
    os.makedirs(output_dir, exist_ok=True)

    for table in tables:
        table_name = table[0]
        if table_name == 'User_Accounts':
            # Only employee_id and role are exported from User_Accounts
            query = f'SELECT employee_id,role FROM public."{table_name}"'
        else:
            query = f'SELECT * FROM public."{table_name}"'
        df = pd.read_sql(query, connection)

        df = df.astype(str)  # Convert all columns to string

        # Save each table as a separate CSV file
        csv_file = os.path.join(output_dir, f"{table_name}.csv")
        df.to_csv(csv_file, index=False)  # Saving as CSV
        print(f"Saved: {csv_file}")

    logger.info(f"Data has been saved to {output_dir} as separate CSV files.")

except Exception as e:
    # Failures are reported on the console and in the log; nothing is re-raised
    print(f"An error occurred: {e}")
    logger.error("Unable to load data into raw layer: %s", e)

finally:
    # Close the cursor and connection if they were actually opened
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()


Saved: ./1.Raw\_prisma_migrations.csv
Saved: ./1.Raw\Employees.csv
Saved: ./1.Raw\Courses.csv
Saved: ./1.Raw\Designation_Courses.csv
Saved: ./1.Raw\Course_Performances.csv
Saved: ./1.Raw\Project_Performances.csv
Saved: ./1.Raw\Resignation_Records.csv
Saved: ./1.Raw\User_Accounts.csv


  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)


# Prep Layer

### Read the Data from Raw layer

In [351]:
import pandas as pd
# Directory written by the raw-layer extraction step
input_dir = './1.Raw'

try:
    # One DataFrame per raw CSV file
    course_performances = pd.read_csv(filepath_or_buffer=f'{input_dir}/Course_Performances.csv')
    courses = pd.read_csv(filepath_or_buffer=f'{input_dir}/Courses.csv')
    designation_courses = pd.read_csv(filepath_or_buffer=f'{input_dir}/Designation_Courses.csv')
    employees = pd.read_csv(filepath_or_buffer=f'{input_dir}/Employees.csv')
    project_performance = pd.read_csv(filepath_or_buffer=f'{input_dir}/Project_Performances.csv')
    resignation_records = pd.read_csv(filepath_or_buffer=f'{input_dir}/Resignation_Records.csv')
    user_account = pd.read_csv(filepath_or_buffer=f'{input_dir}/User_Accounts.csv')
    logger.info("Data has been read from Raw into Staging")
except Exception as e:
    # A failed read is printed and logged; frames after the failing one stay undefined
    print(f"An error occurred: {e}")
    logger.error("Error Reading data from Raw: %s", e)
      

## Cleaning

### Basic cleaning

In [352]:
import pandas as pd
import logging

# Same logging configuration as the "Configure Logger" cell: DEBUG and above
# go to 'elt_process.log' with a timestamped format.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
    filename='elt_process.log',
)

# Root logger handle used by clean_dataframe and the cells below
logger = logging.getLogger()

def clean_dataframe(df, df_name):
    """Return a cleaned copy of ``df``; the caller's frame is left untouched.

    Steps, in order:
      1. log the per-column null counts,
      2. drop fully duplicated rows,
      3. normalise column names (strip, lower-case, spaces -> underscores),
      4. trim surrounding whitespace from string values in object columns,
      5. log the per-column null counts again.

    Args:
        df: pandas DataFrame to clean. It is not modified.
        df_name: Label used in the log messages.

    Returns:
        A new DataFrame with the cleaning applied. Surviving rows keep their
        original index labels (the index is not reset).
    """
    logger.info(f"Cleaning DataFrame: {df_name}")

    # Check for missing values
    logger.info("Missing values before cleaning:")
    missing_before = df.isnull().sum()
    logger.info(f"{missing_before}")

    # Remove duplicates. Working on an explicit copy keeps the input frame
    # intact, so the raw data stays available and the cell can be re-run.
    cleaned = df.drop_duplicates().copy()
    logger.info("Duplicates removed.")

    # Normalise column names
    cleaned.columns = cleaned.columns.str.strip().str.lower().str.replace(' ', '_')
    logger.info("Columns renamed to lower case and spaces replaced with underscores.")

    # Trim whitespace from string values only. Series.str.strip() would turn
    # every non-string entry of a mixed object column into NaN, silently
    # dropping data; mapping with an isinstance guard leaves those untouched.
    string_cols = cleaned.select_dtypes(include=[object]).columns
    for col in string_cols:
        cleaned[col] = cleaned[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    logger.info("Whitespace trimmed from string columns.")

    # Check for missing values after cleaning
    logger.info("Missing values after cleaning:")
    missing_after = cleaned.isnull().sum()
    logger.info(f"{missing_after}")

    return cleaned

# Run the shared cleaning routine over every raw frame; the label is only
# used in the log output.
course_performances_stage = clean_dataframe(df=course_performances, df_name="course_performances")
courses_stage = clean_dataframe(df=courses, df_name="courses")
designation_courses_stage = clean_dataframe(df=designation_courses, df_name="designation_courses")
employees_stage = clean_dataframe(df=employees, df_name="employees")
project_performance_stage = clean_dataframe(df=project_performance, df_name="project_performance")
resignation_records_stage = clean_dataframe(df=resignation_records, df_name="resignation_records")
user_account_stage = clean_dataframe(df=user_account, df_name="user_account")


### Cleaning Course Performance

In [353]:
try:
    # Scores are cast to integers
    course_performances_stage['score'] = course_performances_stage['score'].astype(dtype=int)
    logger.info("Converted 'score' column to integer type.")

    # Completion dates are parsed as datetimes; errors='coerce' is passed so
    # that unparseable values do not raise
    course_performances_stage['completion_date'] = pd.to_datetime(
        course_performances_stage['completion_date'],
        errors='coerce',
    )
    logger.info("Converted 'completion_date' column to datetime type.")

    # Record the resulting dtypes in the log
    dtypes_info = course_performances_stage.dtypes
    logger.info(f"Data types after conversion:\n{dtypes_info}")

except Exception as e:
    # Failures are logged only; nothing is re-raised
    logger.error(f"Error during type conversion: {e}")
    

### Cleaning Course

In [354]:
try:
    # Course duration is cast to an integer
    courses_stage['duration_hours'] = courses_stage['duration_hours'].astype(dtype=int)
    logger.info("Converted 'duration_hours' column to integer type.")

    # Record the resulting dtypes in the log
    dtypes_info = courses_stage.dtypes
    logger.info(f"Data types after conversion:\n{dtypes_info}")

except Exception as e:
    # Failures are logged only; nothing is re-raised
    logger.error(f"Error during type conversion: {e}")


### Cleaning Designation Courses

In [355]:
try:
    # The course reference is cast to an integer
    designation_courses_stage['course_id'] = designation_courses_stage['course_id'].astype(dtype=int)
    logger.info("Converted 'course_id' column to integer type.")

    # Record the resulting dtypes in the log
    dtypes_info = designation_courses_stage.dtypes
    logger.info(f"Data types after conversion:\n{dtypes_info}")

except Exception as e:
    # Failures are logged only; nothing is re-raised
    logger.error(f"Error during type conversion: {e}")

### Cleaning Employees and User Account

In [356]:
try:
    # --- employees_stage: integer key and parsed hire date ---
    employees_stage['id'] = employees_stage['id'].astype(dtype=int)
    logger.info("Converted 'id' column in employees_stage to integer type.")

    employees_stage['hire_date'] = pd.to_datetime(
        employees_stage['hire_date'],
        errors='coerce',
    )
    logger.info("Converted 'hire_date' column in employees_stage to datetime type.")

    # --- user_account_stage: integer foreign key ---
    user_account_stage['employee_id'] = user_account_stage['employee_id'].astype(dtype=int)
    logger.info("Converted 'employee_id' column in user_account_stage to integer type.")

    # --- inner join of employees (id) with user accounts (employee_id) ---
    employee_details_stage = employees_stage.merge(
        user_account_stage,
        how="inner",
        left_on="id",
        right_on="employee_id",
    )
    logger.info("Merged employees_stage and user_account_stage.")

    # After the join 'employee_id' duplicates 'id', so it is removed
    employee_details_stage = employee_details_stage.drop(columns='employee_id')
    logger.info("Dropped 'employee_id' column from employee_details_stage.")

    # Record the dtypes of the merged frame in the log
    dtypes_info = employee_details_stage.dtypes
    logger.info(f"Data types in employee_details_stage:\n{dtypes_info}")

except Exception as e:
    # Failures are logged only; nothing is re-raised
    logger.error(f"Error during data cleaning and merging: {e}")

### Cleaning Project Performance

In [357]:
try:
    # Every column of this frame is cast to integer in one call
    project_performance_stage = project_performance_stage.astype(dtype=int)
    logger.info("Converted all columns in project_performance_stage to integer type.")

    # Record the resulting dtypes in the log
    dtypes_info = project_performance_stage.dtypes
    logger.info(f"Data types in project_performance_stage:\n{dtypes_info}")

except Exception as e:
    # Failures are logged only; nothing is re-raised
    logger.error(f"Error during type conversion in project_performance_stage: {e}")

### Cleaning Resignation Records

In [358]:
try:
    # Record key -> integer
    resignation_records_stage['id'] = resignation_records_stage['id'].astype(dtype=int)
    logger.info("Converted 'id' column in resignation_records_stage to integer type.")

    # Employee reference -> integer
    resignation_records_stage['employee_id'] = resignation_records_stage['employee_id'].astype(dtype=int)
    logger.info("Converted 'employee_id' column in resignation_records_stage to integer type.")

    # Resignation date -> datetime; errors='coerce' is passed so that
    # unparseable values do not raise
    resignation_records_stage['resignation_date'] = pd.to_datetime(
        resignation_records_stage['resignation_date'],
        errors='coerce',
    )
    logger.info("Converted 'resignation_date' column in resignation_records_stage to datetime type.")

    # Record the resulting dtypes in the log
    dtypes_info = resignation_records_stage.dtypes
    logger.info(f"Data types in resignation_records_stage:\n{dtypes_info}")

except Exception as e:
    # Failures are logged only; nothing is re-raised
    logger.error(f"Error during data cleaning in resignation_records_stage: {e}")

## Save the data

In [359]:
# Staging-layer output directory
output_dir = './2.Staging'

os.makedirs(output_dir, exist_ok=True)

try:
    # Each cleaned frame is written to its own CSV file (index omitted)
    course_performances_stage.to_csv(f'{output_dir}/Course_Performances.csv', index=False)
    logger.info("Saved Course_Performances.csv")

    courses_stage.to_csv(f'{output_dir}/Courses.csv', index=False)
    logger.info("Saved Courses.csv")

    designation_courses_stage.to_csv(f'{output_dir}/Designation_Courses.csv', index=False)
    logger.info("Saved Designation_Courses.csv")

    # Employees_Details.csv now holds the merged employees + user-account
    # frame. The previous code wrote the un-merged employees_stage under this
    # name, so the merged frame never reached the staging layer.
    # NOTE(review): the file name and the report layer's
    # dim_Employees_Details.csv both point at the merged frame; confirm this
    # is the intended content.
    employee_details_stage.to_csv(f'{output_dir}/Employees_Details.csv', index=False)
    logger.info("Saved Employees_Details.csv")

    project_performance_stage.to_csv(f'{output_dir}/Project_Performances.csv', index=False)
    logger.info("Saved Project_Performances.csv")

    resignation_records_stage.to_csv(f'{output_dir}/Resignation_Records.csv', index=False)
    logger.info("Saved Resignation_Records.csv")

    user_account_stage.to_csv(f'{output_dir}/User_Account.csv', index=False)
    logger.info("Saved User_Account.csv")

    print("Staging has been saved to CSV.")
    logger.info("Staging has been saved to CSV.")

except Exception as e:
    # A failed write is logged only; files after the failing one are not written
    logger.error(f"Error saving DataFrames to CSV: {e}")

Staging has been saved to CSV.


# Report layer

### Read Data from Prep Layer

In [360]:
# import pandas as pd
# input_dir = './2.Prep'

# dataframes = []

# course_performances_prep = pd.read_csv(f'{input_dir}/Course_Performances.csv')
# courses_prep = pd.read_csv(f'{input_dir}/Courses.csv')
# designation_courses_prep = pd.read_csv(f'{input_dir}/Designation_Courses.csv')
# employees_prep = pd.read_csv(f'{input_dir}/Employees.csv')
# project_performance_prep = pd.read_csv(f'{input_dir}/Project_Performances.csv')
# resignation_records_prep = pd.read_csv(f'{input_dir}/Resignation_Records.csv')
# user_account_prep = pd.read_csv(f'{input_dir}/User_Accounts.csv')    

In [361]:
try:
    # The report layer starts from the in-memory staging frames. These are
    # plain name bindings: each *_prep name refers to the same object as its
    # *_stage counterpart (no copy is made here).
    course_performances_prep = course_performances_stage
    logger.info("Prepared course_performances_prep DataFrame.")

    courses_prep = courses_stage
    logger.info("Prepared courses_prep DataFrame.")

    designation_courses_prep = designation_courses_stage
    logger.info("Prepared designation_courses_prep DataFrame.")

    # Employees come from the merged employees + user-account frame
    employees_prep = employee_details_stage
    logger.info("Prepared employees_prep DataFrame.")

    project_performance_prep = project_performance_stage
    logger.info("Prepared project_performance_prep DataFrame.")

    resignation_records_prep = resignation_records_stage
    logger.info("Prepared resignation_records_prep DataFrame.")

except Exception as e:
    # e.g. a staging frame that was never created; logged only
    logger.error(f"Error during DataFrame preparation: {e}")

## Fact Tables

### Course Performance

In [362]:
try:
    # The fact table does not carry the 'id' column
    course_performances_prep = course_performances_prep.drop(columns='id')
    logger.info("Dropped 'id' column from course_performances_prep DataFrame.")

    # Preview of the first rows
    logger.info("Displaying first few rows of course_performances_prep:")
    print(course_performances_prep.head())

except KeyError as e:
    # Raised by drop() when 'id' is absent (for instance on a second run of this cell)
    logger.error(f"Error: The specified column was not found in course_performances_prep - {e}")
except Exception as e:
    logger.error(f"Error during operation on course_performances_prep: {e}")

   employee_id  course_id course_status  score completion_date
0            1          6     completed     74      2024-03-02
1            1         10    incomplete      0      2020-09-22
2            1          8        failed      0      2023-06-14
3            1          9        failed      0      2019-12-10
4            2          4        failed      0      2023-12-15


### Project Performance

In [363]:
try:
    # The fact table does not carry the 'id' column
    project_performance_prep = project_performance_prep.drop(columns='id')
    logger.info("Dropped 'id' column from project_performance_prep DataFrame.")

    # Preview of the first rows
    logger.info("Displaying first few rows of project_performance_prep:")
    print(project_performance_prep.head())

except KeyError as e:
    # Raised by drop() when 'id' is absent (for instance on a second run of this cell)
    logger.error(f"Error: The specified column was not found in project_performance_prep - {e}")
except Exception as e:
    logger.error(f"Error during operation on project_performance_prep: {e}")

   employee_id  project_id  engagement_score  teamwork_score  \
0            1           1                96              42   
1            1           2                67              37   
2            1           3                60              92   
3            2           1                90              21   
4            2           2                92              30   

   punctuality_score  overall_performance_score  
0                 89                         23  
1                 52                         36  
2                 30                         10  
3                 10                         19  
4                 62                         96  


### Resignation Records

In [364]:
try:
    # The fact table does not carry the 'id' column
    resignation_records_prep = resignation_records_prep.drop(columns='id')
    logger.info("Dropped 'id' column from resignation_records_prep DataFrame.")

    # Preview of the first rows
    logger.info("Displaying first few rows of resignation_records_prep:")
    print(resignation_records_prep.head())

except KeyError as e:
    # Raised by drop() when 'id' is absent (for instance on a second run of this cell)
    logger.error(f"Error: The specified column was not found in resignation_records_prep - {e}")
except Exception as e:
    logger.error(f"Error during operation on resignation_records_prep: {e}")

   employee_id resignation_date  \
0           35       2021-06-04   
1           80       2024-06-20   
2           62       2024-08-25   
3           51       2024-07-23   
4           27       2024-08-06   

                                              reason  
0                  Any law entire. School half kind.  
1  Range table matter discover follow different box.  
2             Structure act expert require approach.  
3                                   About away here.  
4              She dinner whose like budget certain.  


## Dimension Table

### Employee Details

In [365]:
import io  # cell-local: used to capture the text DataFrame.info() writes

try:
    # Drop the 'role' column
    employees_prep = employees_prep.drop(columns=['role'])
    logger.info("Dropped 'role' column from employees_prep DataFrame.")

    # Rename the 'id' column to 'employee_id' (reassignment instead of inplace)
    employees_prep = employees_prep.rename(columns={'id': 'employee_id'})
    logger.info("Renamed 'id' column to 'employee_id' in employees_prep DataFrame.")

    # DataFrame.info() writes its report to a stream and returns None, so the
    # previous code logged the literal text "None". The report is captured in
    # a buffer, then both printed and logged.
    logger.info("Displaying information of employees_prep DataFrame:")
    info_buffer = io.StringIO()
    employees_prep.info(buf=info_buffer)
    employees_prep_info = info_buffer.getvalue()
    print(employees_prep_info)
    logger.info(f"DataFrame information:\n{employees_prep_info}")

except KeyError as e:
    # Raised by drop() when 'role' is absent (for instance on a second run of this cell)
    logger.error(f"Error: The specified column was not found in employees_prep - {e}")
except Exception as e:
    logger.error(f"Error during operation on employees_prep: {e}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   employee_id        100 non-null    int64         
 1   first_name         100 non-null    object        
 2   last_name          100 non-null    object        
 3   department         100 non-null    object        
 4   designation_type   100 non-null    object        
 5   hire_date          100 non-null    datetime64[ns]
 6   employment_status  100 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 5.6+ KB


### Course Details

In [366]:
import io  # cell-local: used to capture the text DataFrame.info() writes

try:
    # Join each course (id) with its designation mapping rows (course_id)
    courses_prep = pd.merge(courses_prep, designation_courses_prep, left_on='id', right_on='course_id')
    logger.info("Merged courses_prep with designation_courses_prep DataFrame on 'id' and 'course_id'.")

    # After the join 'course_id' duplicates 'id', so it is removed
    courses_prep = courses_prep.drop(columns=['course_id'])
    logger.info("Dropped 'course_id' column from courses_prep DataFrame.")

    # Rename the 'id' column to 'course_id' (reassignment instead of inplace)
    courses_prep = courses_prep.rename(columns={'id': 'course_id'})
    logger.info("Renamed 'id' column to 'course_id' in courses_prep DataFrame.")

    # DataFrame.info() writes its report to a stream and returns None, so the
    # previous code logged the literal text "None". The report is captured in
    # a buffer, then both printed and logged.
    logger.info("Displaying information of courses_prep DataFrame:")
    info_buffer = io.StringIO()
    courses_prep.info(buf=info_buffer)
    courses_prep_info = info_buffer.getvalue()
    print(courses_prep_info)
    logger.info(f"DataFrame information:\n{courses_prep_info}")

except KeyError as e:
    # Raised when a join/drop column is absent (for instance on a second run of this cell)
    logger.error(f"Error: The specified column was not found during the merge operation in courses_prep - {e}")
except Exception as e:
    logger.error(f"Error during operation on courses_prep: {e}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   course_id           16 non-null     int64 
 1   course_name         16 non-null     object
 2   course_description  16 non-null     object
 3   duration_hours      16 non-null     int64 
 4   designation_type    16 non-null     object
dtypes: int64(2), object(3)
memory usage: 772.0+ bytes


## Save the data

In [367]:
# Report-layer (data mart) output directory; created on demand
output_dir = './3.Report'
os.makedirs(output_dir, exist_ok=True)

try:
    # Fact and dimension tables are each written to their own CSV (index omitted)
    course_performances_prep.to_csv(path_or_buf=f'{output_dir}/fact_Course_Performances.csv', index=False)
    logger.info("Saved cleaned course_performances_prep DataFrame to fact_Course_Performances.csv.")

    courses_prep.to_csv(path_or_buf=f'{output_dir}/dim_Courses.csv', index=False)
    logger.info("Saved cleaned courses_prep DataFrame to dim_Courses.csv.")

    employees_prep.to_csv(path_or_buf=f'{output_dir}/dim_Employees_Details.csv', index=False)
    logger.info("Saved cleaned employees_prep DataFrame to dim_Employees_Details.csv.")

    project_performance_prep.to_csv(path_or_buf=f'{output_dir}/fact_Project_Performances.csv', index=False)
    logger.info("Saved cleaned project_performance_prep DataFrame to fact_Project_Performances.csv.")

    resignation_records_prep.to_csv(path_or_buf=f'{output_dir}/fact_Resignation_Records.csv', index=False)
    logger.info("Saved cleaned resignation_records_prep DataFrame to fact_Resignation_Records.csv.")

    logger.info("Data Mart has been saved to CSV.")

except Exception as e:
    # A failed write is logged only; files after the failing one are not written
    logger.error(f"Error during saving Data Mart to CSV: {e}")

# Final Analysis

In [371]:
# Directory that receives the final analysis reports; created on demand
output_dir = './4.Final_Report'
os.makedirs(name=output_dir, exist_ok=True)

### Designation Count by Course and Status

In [372]:
def designation_count_by_course_and_status(course_performances_stage, employees_stage):
    """Count employees per (course_id, designation_type, course_status).

    Args:
        course_performances_stage: DataFrame with at least the columns
            'employee_id', 'course_id' and 'course_status'.
        employees_stage: DataFrame with at least the columns 'employee_id'
            and 'designation_type'.

    Returns:
        DataFrame with columns course_id, designation_type, course_status and
        employees_count, sorted by course_id then designation_type. On any
        exception the error is logged and None is returned implicitly.
    """
    try:
        logger.info("Starting the designation count by course and status.")

        # Attach each employee's designation to their course records
        designation_lookup = employees_stage[['employee_id', 'designation_type']]
        employee_with_designation = course_performances_stage.merge(
            designation_lookup,
            on='employee_id',
        )
        logger.info("Merged course_performances_stage with employees_stage to include designation types.")

        # One output row per course / designation / status combination
        group_keys = ['course_id', 'designation_type', 'course_status']
        grouped = employee_with_designation.groupby(group_keys)
        designation_count = grouped.agg(employees_count=('employee_id', 'count')).reset_index()
        logger.info("Grouped data to count employees by course and designation.")

        logger.info("Completed designation count by course and status.")

        return designation_count.sort_values(['course_id', 'designation_type'])

    except Exception as e:
        logger.error(f"Error in designation_count_by_course_and_status: {e}")

# Build the report from the report-layer frames
designation_count = designation_count_by_course_and_status(
    course_performances_stage=course_performances_prep,
    employees_stage=employees_prep,
)

# Persist it under the final-report directory
designation_count.to_csv(path_or_buf=f'{output_dir}/Designation_Count.csv', index=False)
logger.info("Saved report to Designation_Count.csv.")

# Preview of the first rows
logger.info("Displaying first few rows of designation_count DataFrame:")
print(designation_count.head())

   course_id designation_type course_status  employees_count
0          1                A     completed               11
1          1                A        failed               10
2          1                A    incomplete                2
3          2                D     completed                9
4          2                D        failed                9


### Course Status Count by Course

In [373]:
def course_status_count_by_course(course_performances_stage, courses_stage):
    """Count course-performance records per course and course_status.

    Args:
        course_performances_stage: DataFrame with at least the columns
            'course_id' and 'course_status'.
        courses_stage: DataFrame with at least the columns 'course_id',
            'course_name', 'course_description' and 'duration_hours'. It may
            hold several rows per course (for example one per designation).

    Returns:
        DataFrame with columns course_id, course_name, course_description,
        duration_hours, course_status and status_count, sorted by course_id
        ascending and status_count descending. On any exception the error is
        logged and None is returned implicitly.
    """
    try:
        logger.info("Starting the course status count by course.")

        # Reduce the course frame to one row per distinct course description.
        # Without this, a course listed N times in courses_stage would
        # multiply every matching performance row by N in the merge below
        # and inflate status_count.
        course_details = courses_stage[
            ['course_id', 'course_name', 'course_description', 'duration_hours']
        ].drop_duplicates()

        # Merge DataFrames
        course_count = course_performances_stage.merge(
            course_details,
            left_on='course_id',
            right_on='course_id'
        )
        logger.info("Merged course_performances_stage with courses_stage to include course details.")

        # Group by course_id and course_status
        course_status_count = course_count.groupby(
            ['course_id', 'course_name', 'course_description', 'duration_hours', 'course_status']
        ).agg(status_count=('course_status', 'count')).reset_index()
        logger.info("Grouped data to count course status.")

        logger.info("Completed course status count by course.")
        return course_status_count.sort_values(['course_id', 'status_count'], ascending=[True, False])

    except Exception as e:
        logger.error(f"Error in course_status_count_by_course: {e}")

# Build the report from the report-layer frames
course_status_count = course_status_count_by_course(
    course_performances_stage=course_performances_prep,
    courses_stage=courses_prep,
)

# Persist it under the final-report directory
course_status_count.to_csv(path_or_buf=f'{output_dir}/Course_Completion_status.csv', index=False)
logger.info("Saved report to Course_Completion_status.csv.")

# Preview of the first rows
logger.info("Displaying first few rows of course_status_count DataFrame:")
print(course_status_count.head())

   course_id course_name                                 course_description  \
0          1    Course 1  Pm major always man open speech seat. Camera i...   
1          1    Course 1  Pm major always man open speech seat. Camera i...   
2          1    Course 1  Pm major always man open speech seat. Camera i...   
3          2    Course 2  Environment charge technology several data dre...   
4          2    Course 2  Environment charge technology several data dre...   

   duration_hours course_status  status_count  
0              26     completed            11  
1              26        failed            10  
2              26    incomplete             2  
3              15     completed             9  
4              15        failed             9  


###  Top Scorer and Average Score by Designation

In [374]:
def top_scorer_and_avg_by_designation(course_performances_stage, employees_stage, courses_stage):
    try:
        logger.info("Starting the calculation of top scorer and average by designation.")

        # Filter for completed courses
        completed_courses = course_performances_stage[course_performances_stage['course_status'] == 'completed']
        logger.info(f"Filtered completed courses: {len(completed_courses)} records found.")

        # Merge to get designation types
        employee_with_designation = completed_courses.merge(
            employees_stage[['employee_id', 'designation_type']],
            left_on='employee_id',
            right_on='employee_id'
        )
        logger.info("Merged completed courses with employees to include designation types.")

        # Calculate average score
        average_score = employee_with_designation.groupby(
            ['course_id', 'designation_type']
        ).agg(avg_score=('score', 'mean')).reset_index()
        logger.info("Calculated average score by course and designation.")

        # Calculate row numbers by score descending
        employee_with_scores = completed_courses.merge(
            employees_stage[['employee_id', 'first_name', 'last_name', 'designation_type']],
            left_on='employee_id',
            right_on='employee_id'
        )
        employee_with_scores['row_number'] = employee_with_scores.groupby(
            ['course_id', 'designation_type']
        )['score'].rank(method='first', ascending=False)
        logger.info("Assigned row numbers based on scores.")

        # Filter for top scores
        top_score = employee_with_scores[employee_with_scores['row_number'] == 1]
        logger.info(f"Identified top scorers: {len(top_score)} records found.")

        # Merge top score with average score and course names
        final_table = top_score.merge(
            average_score[['course_id', 'designation_type', 'avg_score']],
            on=['course_id', 'designation_type'],
            how='left'
        ).merge(
            courses_stage[['course_id', 'course_name']],
            on='course_id',
            how='left'
        )
        logger.info("Merged top scorers with average scores and course names.")

        final_table['Department_topper'] = final_table['first_name'] + ' ' + final_table['last_name']

        final_columns = ['course_id', 'course_name', 'designation_type', 'Department_topper', 'score', 'avg_score']
        final_table = final_table.drop_duplicates().reset_index(drop=True)

        logger.info("Completed calculation of top scorer and average by designation.")
        
        return final_table[final_columns].rename(columns={'designation_type': 'Department', 'score': 'top_score'})

    except Exception as e:
        logger.error(f"Error in top_scorer_and_avg_by_designation: {e}")

# Build the report from the report-layer frames
top_scorer_avg = top_scorer_and_avg_by_designation(
    course_performances_stage=course_performances_prep,
    employees_stage=employees_prep,
    courses_stage=courses_prep,
)

# Persist it under the final-report directory
top_scorer_avg.to_csv(path_or_buf=f'{output_dir}/Top_And_Average_Score.csv', index=False)
logger.info("Saved report to Top_And_Average_Score.csv.")

# Show the full table
logger.info("Displaying top scorer and average DataFrame:")
print(top_scorer_avg)

    course_id course_name Department   Department_topper  top_score  avg_score
0           5    Course 5          D          Lisa Moses         99  81.500000
1          11   Course 11          A      Kristin Lowery         90  72.333333
2           8    Course 8          C  Arthur Fitzpatrick        100  68.500000
3           9    Course 9          C  Arthur Fitzpatrick         94  68.857143
4          10   Course 10          C         Steven Hart         98  74.857143
5           4    Course 4          D      Kathleen Myers         97  85.166667
6           2    Course 2          D      Kathleen Myers        100  78.000000
7           6    Course 6          C    Latoya Rodriguez         92  72.000000
8           9    Course 9          B      Madison Hooper         97  80.500000
9          11   Course 11          B      Brittany Smith        100  79.125000
10          7    Course 7          B     Melinda Sanchez         95  74.666667
11          8    Course 8          B        Lucas Mo