In [1]:
import pandas as pd
import numpy as np
import mlflow 
from pathlib import Path
from typing import Tuple

In [5]:
# load processed train and dev data for modelling
def load_data(filepath: str, dataset_type: str) -> pd.DataFrame:
    '''
        Loads processed data from csv source
    
        This function loads processed data from csv file
        that will be used for modelling
        
        Args:
            filepath: path to processed dataset
            dataset_type: whether the dataset is train, dev or test dataframe
            
        Returns:
            pd.DataFrame: returns a dataframe containing processed data
            
        Examples:
            >>> df = load_data('data/processed/train_set.csv')
                df.head()
    '''
    filename = Path(filepath)
    if not filename.exists():
        raise FileNotFoundError(f'File not found! Check filepath and try again later!')
    
    df = pd.read_csv(filename)

    # check that the df is not empty
    if len(df) == 0:
        raise ValueError(f'Dataframe cannot be empty!')
    
    print(f'{dataset_type} dataframe successfully loaded with {df.shape[0]} rows and {df.shape[1]} features')
    
    return df


TRAIN_DATA_PATH = '../data/processed/train_set.csv'
DEV_DATA_PATH = '../data/processed/dev_set.csv'

dev_df = load_data(DEV_DATA_PATH, 'Dev')
train_df = load_data(TRAIN_DATA_PATH, 'Train')



Dev dataframe successfully loaded with 7500 rows and 45 features
Train dataframe successfully loaded with 35000 rows and 44 features


In [4]:
missing_cols = [col for col in dev_df if col not in train_df]
missing_cols

['Unnamed: 0']

In [10]:
# perform one last data quality check before modelling
def data_quality_checks(df: pd.DataFrame) -> None:
    '''
        Performs one last data quality check before modelling

        Args:
            df: pandas' dataframe to be validated
        
        Raises:
            ValueError: if any of the validations fail

        Example:
            >>> data_quality_checks(df)
    '''

    missing = df.isnull().sum()
    if not len(missing) == 0:
        raise ValueError("The dataset must not contain null values")
    
    n_duplicates = df[df.duplicated()].sum()
    if n_duplicates != 0:
        raise ValueError("The dataset must not contain duplicate rows")

    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numerical_cols) != len(df.columns):
        raise ValueError("All columns in the dataset must be numerical")

    print("All validation passed!")

data_quality_checks(train_df)
data_quality_checks(dev_df)

ValueError: The dataset must not contain null values

In [11]:
dev_df.isnull().sum()

Unnamed: 0                     0
Employee_age                   0
Current_Salary                 0
Number_of_Children             0
years_experience               0
past_projects                  0
current_projects               0
performance_rating             0
Job_Satisfaction               0
Work_Life_Balance              0
is_outlier                     0
Maritial_Status_True           0
Divorced_earlier_Yes           0
Father_alive_Yes               0
Mother_alive_Yes               0
Education_level_Diploma        0
Education_level_High School    0
Education_level_Master's       0
Education_level_PhD            0
Department_Finance             0
Department_HR                  0
Department_Operations          0
Department_R&D                 0
Department_Sales               0
Department_Support             0
Role_Analyst                   0
Role_Customer Specialist       0
Role_DevOps Engineer           0
Role_HR Executive              0
Role_HR Manager                0
Role_ML En

In [None]:
# split the data further into features and target set
def features_target_split(df: pd.DataFrame, target: str = 'Current_Salary') -> Tuple[pd.DataFrame, pd.DataFrame]:
    '''
        Split the dataset further into features and target splits
        
        This function takes the given dataframe and splits it
        into the feature set and target set for modelling purposes
    
        Args:
            df: pd.DataFrame = Provided dataframe
            target: str = Target variable in the given dataframe

        Returns:
            A tuple of the features and target sets

        Examples:
            >>> x, y = features_target_split(df, 'Employee_salary')
                x.head()
                y.head()
    '''
    y = df[target].copy()
    if not isinstance(y, pd.Series):
        raise ValueError('The target variable must be a pandas series')
    
    if len(y) == 0:
        raise ValueError(f'The target variable cannot be empty')
    
    x = df.drop(columns=[target]).copy()
    if not isinstance(x, pd.DataFrame):
        raise ValueError('The feature set must be a pandas dataframe')
    
    if len(x) == 0:
        raise ValueError(f'The target variable cannot be empty')
    
    if len(y) != len(x):
        raise ValueError('The length of the target variable must be equal to the length of the feature set')

    
    return x, y

x_train, y_train = features_target_split(train_df)
x_dev, y_dev = features_target_split(dev_df)