# DATA ENGINEERING 

In [1]:
%pip install psycopg2 pandas openpyxl python-dotenv

Collecting psycopg2Note: you may need to restart the kernel to use updated packages.

  Using cached psycopg2-2.9.9-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.1.2-cp312-cp312-win_amd64.whl.metadata (59 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Using cached psycopg2-2.9.9-cp312-cp312-win_amd64.whl (1.2 MB)
Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Using cached numpy-2.1.2-cp312-cp312-win_amd64.whl (12.6 MB)
Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 



## RAW LAYER

### Read the tables from the source DB and store them in the raw layer

In [46]:
import psycopg2
import pandas as pd
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Fetch the credentials from the environment variables
dbname = os.getenv('DB_NAME')
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')

# Raw-layer folder: every table of the `public` schema is written here as one CSV.
output_dir = './1.Raw'

# Bind both handles before the `try` so the `finally` block can test them even
# when psycopg2.connect() itself raises (otherwise the cleanup fails with a
# NameError that hides the real connection error).
connection = None
cursor = None

try:
    # Connect to the PostgreSQL source database
    connection = psycopg2.connect(
        dbname=dbname,
        user=user,
        password=password,
        host=host,
        port=port
    )

    cursor = connection.cursor()

    # List every table in the public schema
    cursor.execute("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'public';
    """)

    tables = cursor.fetchall()

    os.makedirs(output_dir, exist_ok=True)

    for (table_name,) in tables:
        if table_name == 'User_Accounts':
            # Only these three columns of User_Accounts are exported to the raw layer.
            query = f'SELECT employee_id,email,role FROM public."{table_name}"'
        else:
            query = f'SELECT * FROM public."{table_name}"'

        # NOTE: pandas emits a UserWarning here because it only officially supports
        # SQLAlchemy connectables; the read itself still works with psycopg2.
        df = pd.read_sql(query, connection)

        df = df.astype(str)  # Convert all columns to string

        # Save each table as a separate CSV file
        csv_file = os.path.join(output_dir, f"{table_name}.csv")
        df.to_csv(csv_file, index=False)
        print(f"Saved: {csv_file}")

    # Reported only when every table was exported without an error.
    print(f"Data has been saved to {output_dir} as separate CSV files.")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the cursor and connection (either may still be None if setup failed)
    if cursor:
        cursor.close()
    if connection:
        connection.close()


Saved: ./1.Raw\_prisma_migrations.csv
Saved: ./1.Raw\Employees.csv
Saved: ./1.Raw\Courses.csv
Saved: ./1.Raw\Designation_Courses.csv
Saved: ./1.Raw\Course_Performances.csv
Saved: ./1.Raw\Project_Performances.csv
Saved: ./1.Raw\Resignation_Records.csv
Saved: ./1.Raw\User_Accounts.csv
Data has been saved to ./1.Raw as separate CSV files.


  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)
  df = pd.read_sql(query, connection)


# Staging Layer

### Read the data from the raw layer

In [10]:
import pandas as pd
input_dir = './1.Raw'

dataframes = []


def _load_raw(table_name):
    """Read the raw-layer CSV ``<input_dir>/<table_name>.csv`` into a DataFrame."""
    return pd.read_csv(f'{input_dir}/{table_name}.csv')


# One DataFrame per raw table
course_performances = _load_raw('Course_Performances')
courses = _load_raw('Courses')
designation_courses = _load_raw('Designation_Courses')
employees = _load_raw('Employees')
project_performance = _load_raw('Project_Performances')
resignation_records = _load_raw('Resignation_Records')
user_account = _load_raw('User_Accounts')

### Basic cleaning

In [None]:
import pandas as pd
import os

input_dir = './1.Raw'

# Load the raw CSV files into DataFrames; targets and file names are listed in matching order.
(
    course_performances,
    courses,
    designation_courses,
    employees,
    project_performance,
    resignation_records,
    user_account,
) = (
    pd.read_csv(f'{input_dir}/{raw_table}.csv')
    for raw_table in (
        'Course_Performances',
        'Courses',
        'Designation_Courses',
        'Employees',
        'Project_Performances',
        'Resignation_Records',
        'User_Accounts',
    )
)

# Function to perform basic data cleaning
def clean_dataframe(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Cleaning steps:
      1. drop fully duplicated rows (the surviving rows keep their index labels),
      2. normalise column names: strip, lower-case, spaces -> underscores,
      3. strip surrounding whitespace from string values in object columns.

    Per-column null counts are printed before and after cleaning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Column labels are expected to be strings.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame.
    """
    print("Missing values before cleaning:")
    print(df.isnull().sum())

    # Work on an explicit copy so the caller's frame is never mutated.
    cleaned = df.drop_duplicates().copy()

    # Normalise column names
    cleaned.columns = cleaned.columns.str.strip().str.lower().str.replace(' ', '_')

    # Trim whitespace from string values only. Using `.str.strip()` here would
    # turn every non-string value in an object column (numbers, booleans) into
    # NaN, and would raise on object columns that hold no strings at all.
    string_cols = cleaned.select_dtypes(include=[object]).columns
    for col in string_cols:
        cleaned[col] = cleaned[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    print("Missing values after cleaning:")
    print(cleaned.isnull().sum())

    return cleaned

# Run every raw table through clean_dataframe; targets and inputs are listed in matching order.
(
    course_performances_stage,
    courses_stage,
    designation_courses_stage,
    employees_stage,
    project_performance_stage,
    resignation_records_stage,
    user_account_stage,
) = [
    clean_dataframe(raw_frame)
    for raw_frame in (
        course_performances,
        courses,
        designation_courses,
        employees,
        project_performance,
        resignation_records,
        user_account,
    )
]


### Cleaning Course Performance

In [27]:
# Compute the typed columns first, then write them back onto the staged frame.
score_as_int = course_performances_stage['score'].astype(int)
completion_ts = pd.to_datetime(
    course_performances_stage['completion_date'],
    errors='coerce',  # unparseable dates become NaT instead of raising
)
course_performances_stage['score'] = score_as_int
course_performances_stage['completion_date'] = completion_ts

print(course_performances_stage.dtypes)

# Persist the staged table, creating the staging folder if it does not exist yet.
output_dir = './2.Staging'
os.makedirs(output_dir, exist_ok=True)
course_performances_stage.to_csv(f'{output_dir}/Course_Performances.csv', index=False)

print("Course Performances has been saved to CSV.")

id                          int64
employee_id                 int64
course_id                   int64
course_status              object
score                       int64
completion_date    datetime64[ns]
dtype: object
Cleaned DataFrame has been saved to CSV.


### Cleaning Courses

In [28]:
# Enforce an integer dtype on duration_hours.
duration_as_int = courses_stage['duration_hours'].astype(int)
courses_stage['duration_hours'] = duration_as_int

print(courses_stage.dtypes)

# Persist the staged table, creating the staging folder if it does not exist yet.
output_dir = './2.Staging'
os.makedirs(output_dir, exist_ok=True)
courses_stage.to_csv(f'{output_dir}/Courses.csv', index=False)

print("Cleaned DataFrame has been saved to CSV.")

id                     int64
course_name           object
course_description    object
duration_hours         int64
dtype: object
Cleaned DataFrame has been saved to CSV.


### Cleaning Designation Courses

In [39]:
# Enforce an integer dtype on the course_id column.
course_id_as_int = designation_courses_stage['course_id'].astype(int)
designation_courses_stage['course_id'] = course_id_as_int

print(designation_courses_stage.dtypes)

# Persist the staged table, creating the staging folder if it does not exist yet.
output_dir = './2.Staging'
os.makedirs(output_dir, exist_ok=True)
designation_courses_stage.to_csv(f'{output_dir}/Designation_Courses.csv', index=False)

print("Cleaned DataFrame has been saved to CSV.")

designation_type    object
course_id            int64
dtype: object
Cleaned DataFrame has been saved to CSV.


### Cleaning Employees

In [40]:
# Type the employee key as integer and parse hire dates (unparseable values become NaT).
employees_stage['id'] = employees_stage['id'].astype(int)
employees_stage['hire_date'] = pd.to_datetime(employees_stage['hire_date'], errors='coerce')

print(employees_stage.dtypes)

# Persist the staged table, creating the staging folder if it does not exist yet.
output_dir = './2.Staging'
os.makedirs(output_dir, exist_ok=True)

employees_stage.to_csv(f'{output_dir}/Employees.csv', index=False)

# The message previously named "Course Performances" (copy-paste slip).
print("Employees has been saved to CSV.")

id                            int64
first_name                   object
last_name                    object
department                   object
designation_type             object
hire_date            datetime64[ns]
employment_status            object
dtype: object
Course Performances has been saved to CSV.


### Cleaning Project Performance

In [41]:
# Every column of this table is cast to integer in one go.
project_performance_stage = project_performance_stage.astype(int)

print(project_performance_stage.dtypes)

# Persist the staged table, creating the staging folder if it does not exist yet.
output_dir = './2.Staging'
os.makedirs(output_dir, exist_ok=True)

project_performance_stage.to_csv(f'{output_dir}/Project_Performances.csv', index=False)

# The message previously named "Course Performances" (copy-paste slip).
print("Project Performances has been saved to CSV.")

id                           int64
employee_id                  int64
project_id                   int64
engagement_score             int64
teamwork_score               int64
punctuality_score            int64
overall_performance_score    int64
dtype: object
Course Performances has been saved to CSV.


### Cleaning Resignation Records

In [42]:
# Type the key columns as integer and parse resignation dates (unparseable values become NaT).
resignation_records_stage['id'] = resignation_records_stage['id'].astype(int)
resignation_records_stage['employee_id'] = resignation_records_stage['employee_id'].astype(int)
resignation_records_stage['resignation_date'] = pd.to_datetime(resignation_records_stage['resignation_date'], errors='coerce')

print(resignation_records_stage.dtypes)

# Persist the staged table, creating the staging folder if it does not exist yet.
output_dir = './2.Staging'
os.makedirs(output_dir, exist_ok=True)

resignation_records_stage.to_csv(f'{output_dir}/Resignation_Records.csv', index=False)

# The message previously named "Course Performances" (copy-paste slip).
print("Resignation Records has been saved to CSV.")

id                           int64
employee_id                  int64
resignation_date    datetime64[ns]
reason                      object
dtype: object
Course Performances has been saved to CSV.


### Cleaning User Accounts

In [None]:
# This cell used to be a verbatim copy of the Course Performance cell, so the
# user accounts table was never typed or written to the staging layer.
# The raw extract of User_Accounts only carries employee_id, email and role.
# Nullable Int64 keeps accounts without an employee_id as <NA> instead of failing the cast.
# NOTE(review): assumes employee_id is integer-valued like the employee_id
# columns of the other tables - confirm against the source schema.
user_account_stage['employee_id'] = user_account_stage['employee_id'].astype('Int64')

print(user_account_stage.dtypes)

# Persist the staged table, creating the staging folder if it does not exist yet.
output_dir = './2.Staging'
os.makedirs(output_dir, exist_ok=True)

user_account_stage.to_csv(f'{output_dir}/User_Accounts.csv', index=False)

print("User Accounts has been saved to CSV.")

In [38]:
# Re-read the staged CSV and show its schema next to the in-memory frame's schema.
temp = pd.read_csv('./2.Staging/Course_Performances.csv')

for frame in (temp, course_performances_stage):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402 entries, 0 to 401
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               402 non-null    int64 
 1   employee_id      402 non-null    int64 
 2   course_id        402 non-null    int64 
 3   course_status    402 non-null    object
 4   score            402 non-null    int64 
 5   completion_date  402 non-null    object
dtypes: int64(4), object(2)
memory usage: 19.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402 entries, 0 to 401
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id               402 non-null    int64         
 1   employee_id      402 non-null    int64         
 2   course_id        402 non-null    int64         
 3   course_status    402 non-null    object        
 4   score            402 non-null    int64         
 5   compl