In [1]:
import pandas as pd
import numpy as np
import mlflow 
from pathlib import Path
from typing import Tuple

In [2]:
# load processed train and dev data for modelling
def load_data(filepath: str, dataset_type: str) -> pd.DataFrame:
    '''
        Loads processed data from csv source

        This function loads processed data from csv file
        that will be used for modelling

        Args:
            filepath: path to processed dataset
            dataset_type: label for the dataset (e.g. 'Train', 'Dev' or 'Test');
                it is only used in the confirmation message that is printed

        Returns:
            pd.DataFrame: returns a dataframe containing processed data

        Raises:
            FileNotFoundError: if filepath does not point to an existing file
            ValueError: if the csv was read but contains no rows

        Examples:
            >>> df = load_data('data/processed/train_set.csv', 'Train')
            >>> df.head()
    '''
    filename = Path(filepath)
    # is_file() also rejects a directory path, which exists() would let
    # through to read_csv and fail there with a less helpful error
    if not filename.is_file():
        raise FileNotFoundError('File not found! Check filepath and try again later!')

    df = pd.read_csv(filename)

    # check that the df is not empty (a header-only csv yields zero rows)
    if len(df) == 0:
        raise ValueError('Dataframe cannot be empty!')

    print(f'{dataset_type} dataframe successfully loaded with {df.shape[0]} rows and {df.shape[1]} features')

    return df


# Relative paths to the processed dev and train splits
DEV_DATA_PATH = '../data/processed/dev_set.csv'
TRAIN_DATA_PATH = '../data/processed/train_set.csv'

# Dev is loaded before train, so its confirmation line is printed first
dev_df = load_data(filepath=DEV_DATA_PATH, dataset_type='Dev')
train_df = load_data(filepath=TRAIN_DATA_PATH, dataset_type='Train')



Dev dataframe successfully loaded with 7500 rows and 38 features
Train dataframe successfully loaded with 35000 rows and 38 features


In [3]:
# perform one last data quality check before modelling
def data_quality_checks(df: pd.DataFrame) -> None:
    '''
        Final sanity gate run on a dataframe right before modelling

        The checks run in a fixed order (null values, duplicate rows,
        non-numerical columns) and the first one that fails raises.
        A confirmation line is printed when every check passes.

        Args:
            df: pandas' dataframe to be validated

        Raises:
            ValueError: if any of the validations fail

        Example:
            >>> data_quality_checks(df)
    '''
    # 1) no null cell anywhere in the frame
    if df.isnull().sum().sum() > 0:
        raise ValueError("The dataset must not contain null values")

    # 2) no fully repeated rows
    if df.duplicated().sum() > 0:
        raise ValueError("The dataset must not contain duplicate rows")

    # 3) every column must be selected by the numeric dtype filter
    numeric_part = df.select_dtypes(include=[np.number])
    if numeric_part.shape[1] != df.shape[1]:
        raise ValueError("All columns in the dataset must be numerical")

    print("All validation passed!")

# Validate train first, then dev; each call raises on its first failed check
data_quality_checks(df=train_df)
data_quality_checks(df=dev_df)

All validation passed!
All validation passed!


In [5]:
# split the data further into features and target set
def features_target_split(df: pd.DataFrame, target: str = 'Current_Salary_log') -> Tuple[pd.DataFrame, pd.Series]:
    '''
        Split the dataset further into features and target splits

        This function takes the given dataframe and splits it
        into the feature set and target set for modelling purposes.
        Both returned objects are copies, so the input dataframe is
        left untouched.

        Args:
            df: pd.DataFrame = Provided dataframe
            target: str = Target variable in the given dataframe

        Returns:
            A tuple (x, y): x is a dataframe with every column except
            the target, y is the target column as a series

        Raises:
            KeyError: if the target column is not present in df
            ValueError: if the target does not resolve to a single series,
                if the dataframe has no rows, or if no feature columns
                remain once the target is removed

        Examples:
            >>> x, y = features_target_split(df, 'Employee_salary')
            >>> x.head()
            >>> y.head()
    '''
    # fail early with a message that names the missing column
    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found in the dataframe")

    y = df[target].copy()
    # with duplicated column labels df[target] is a DataFrame, not a Series
    if not isinstance(y, pd.Series):
        raise ValueError('The target variable must be a pandas series')

    if len(y) == 0:
        raise ValueError('The target variable cannot be empty')

    x = df.drop(columns=[target]).copy()

    # x and y come from the same frame, so their row counts always match and
    # x is always a DataFrame; the only way x can still be empty at this point
    # is when the target was the sole column
    if x.shape[0] == 0 or x.shape[1] == 0:
        raise ValueError('The feature set cannot be empty')

    return x, y

# Both splits rely on the function's default target column
x_train, y_train = features_target_split(df=train_df)
x_dev, y_dev = features_target_split(df=dev_df)

In [6]:
# Report the shapes of the feature frames and target series for both splits
print('x_train: ', x_train.shape)
print(f'x_dev: {x_dev.shape}')
print(f'y_train: {y_train.shape}')
print(f'y_dev: {y_dev.shape}')

x_train:  (35000, 37)
x_dev: (7500, 37)
y_train: (35000,)
y_dev: (7500,)


In [7]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, learning_curve
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error,mean_absolute_error,root_mean_squared_error,r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [8]:
# baseline models - performing modelling with minimal feature engineering
# Registry of baseline estimators keyed by display name; every estimator
# carries an explicit seed so a re-run of the notebook is repeatable
models = {
    'Ridge' : Ridge(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=200, max_depth=6, n_jobs=-1, random_state=1),
    # previously the only unseeded estimator in the registry
    # NOTE(review): with the default (no subsampling) settings the seed is not
    # expected to change the fitted model - confirm against the recorded metrics
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=2)
}


In [11]:

# Fit every baseline model on the training split and report in-sample metrics
for model_name, model in models.items():
    print(f'Training model: {model_name}...')
    # perf_counter is the clock intended for measuring elapsed intervals;
    # time.time() follows the wall clock and can jump if the system time changes
    start_time = time.perf_counter()
    model.fit(x_train, y_train)

    # covers the fit call only; prediction time is not included
    time_elapsed = time.perf_counter() - start_time

    # metrics below are computed on the same data the model was fitted on;
    # the dev split is not scored in this cell
    preds_train = model.predict(x_train)

    results = {
        'Model_name' : model_name,
        'Mean_squared_error' : mean_squared_error(y_train, preds_train),
        'Root_mean_squared_error' : root_mean_squared_error(y_train, preds_train),
        'Mean_absolute_error' : mean_absolute_error(y_train, preds_train),
        'R^2_score' : r2_score(y_train, preds_train),
        'Time_elapsed' : time_elapsed
    }

    for name, result in results.items():
        print(f'{name}_train : {result}')

    print('='*50)

Training model: Ridge...
Model_name_train : Ridge
Mean_squared_error_train : 0.16283124347425945
Root_mean_squared_error_train : 0.40352353521728995
Mean_absolute_error_train : 0.32872611281472097
R^2_score_train : 0.17412199234032788
Time_elapsed_train : 0.08465051651000977
Training model: RandomForest...
Model_name_train : RandomForest
Mean_squared_error_train : 0.15528384174018034
Root_mean_squared_error_train : 0.3940607081912384
Mean_absolute_error_train : 0.3210497567790834
R^2_score_train : 0.21240231848752567
Time_elapsed_train : 7.352097272872925
Training model: XGBoost...
Model_name_train : XGBoost
Mean_squared_error_train : 0.11535355970326377
Root_mean_squared_error_train : 0.339637394441872
Mean_absolute_error_train : 0.2736790933601069
R^2_score_train : 0.4149282040013249
Time_elapsed_train : 0.5313794612884521
Training model: LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002118 seconds.
You can set `force_row_wise=tru

In [10]:
# cross validation score 
# One shuffled 5-fold splitter with a fixed seed, shared by every model
cv = KFold(n_splits=5, shuffle=True, random_state=9999)

for model_name, model in models.items():
    cross_val = cross_val_score(model, x_train, y_train, cv=cv, scoring='neg_root_mean_squared_error', n_jobs=-1)

    # the scorer reports negated RMSE, so flip the sign to get plain RMSE per fold
    cv_score = -cross_val
    results = {
        'model_name' : model_name,
        'cv_score_across_fold' : cv_score,
        'cv_score_mean' : cv_score.mean(),
        'cv_score_std' : cv_score.std()
    }

    # distinct loop names here: the inner loop previously reused `name`,
    # overwriting the outer loop's model name inside the loop body
    for metric_name, metric_value in results.items():
        print(f'{metric_name}_train : {metric_value}')

    print('='*50)

model_name_train : Ridge
cv_score_across_fold_train : [0.39983576 0.40352756 0.4079302  0.39962017 0.40844163]
cv_score_mean_train : 0.4038710646820033
cv_score_std_train : 0.003790474777318751
model_name_train : RandomForest
cv_score_across_fold_train : [0.39590508 0.39885779 0.40323224 0.39622974 0.40148317]
cv_score_mean_train : 0.39914160312663644
cv_score_std_train : 0.0028723109580029048
model_name_train : XGBoost
cv_score_across_fold_train : [0.41266127 0.41454421 0.41647405 0.40909424 0.41444227]
cv_score_mean_train : 0.41344321057272293
cv_score_std_train : 0.002486816634741162
model_name_train : LightGBM
cv_score_across_fold_train : [0.39739636 0.40098042 0.4046927  0.39749862 0.40325354]
cv_score_mean_train : 0.4007643259620644
cv_score_std_train : 0.002955773327812163
