In [1]:
!git clone "https://github.com/Kasper-Raupach-Haurum/M6-Data-Engineering-and-MLOps.git"

fatal: destination path 'M6-Data-Engineering-and-MLOps' already exists and is not an empty directory.


In [2]:
#Importing Necessary
%pip install mlflow --quiet
!mlflow
import os
import shutil
import pprint
import sys
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np

from random import random, randint
import mlflow
import mlflow.sklearn

# Set up MLflow experiment
mlflow.set_experiment('HR')


#Libraries for MLflow
from mlflow import log_metric, log_param, log_artifacts
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlflow.tracking import MlflowClient
import warnings

#Setting Up Warning
warnings.filterwarnings("ignore")
print(mlflow.__version__)

Note: you may need to restart the kernel to use updated packages.
Usage: mlflow [OPTIONS] COMMAND [ARGS]...

Options:
  --version  Show the version and exit.
  --help     Show this message and exit.

Commands:
  artifacts    Upload, list, and download artifacts from an MLflow...
  db           Commands for managing an MLflow tracking database.
  deployments  Deploy MLflow models to custom targets.
  doctor       Prints out useful information for debugging issues with MLflow.
  experiments  Manage experiments.
  gc           Permanently delete runs in the `deleted` lifecycle stage.
  models       Deploy MLflow models locally.
  recipes      Run MLflow Recipes and inspect recipe results.
  run          Run an MLflow project from the given URI.
  runs         Manage runs.
  sagemaker    Serve models on SageMaker.
  server       Run the MLflow tracking server.
2.3.0


In [3]:
# Function to load data

path = 'M6-Data-Engineering-and-MLOps\general_data.csv'
HR = pd.read_csv(path)
path1 = 'M6-Data-Engineering-and-MLOps\employee_survey_data.csv'
ES = pd.read_csv(path1)
path2 = 'M6-Data-Engineering-and-MLOps\manager_survey_data.csv'
MS = pd.read_csv(path2)
EMS = pd.merge(ES, MS)
hr_df = pd.merge(HR, EMS)

In [4]:
# Function to preprocess data

# Drop duplicates
hrdf = hr_df.drop_duplicates(subset = ['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'])

# Fill missing values
hrdf.NumCompaniesWorked = hrdf.NumCompaniesWorked.fillna(3)
hrdf.EnvironmentSatisfaction = hrdf.EnvironmentSatisfaction.fillna(3)
hrdf.JobSatisfaction = hrdf.JobSatisfaction.fillna(3)
hrdf.WorkLifeBalance = hrdf.WorkLifeBalance.fillna(3)
hrdf.TotalWorkingYears = hrdf.TotalWorkingYears.fillna(11)

# Convert data types
hrdf['NumCompaniesWorked'] = hrdf['NumCompaniesWorked'].astype(int)
hrdf['TotalWorkingYears'] = hrdf['TotalWorkingYears'].astype(int)
hrdf['EnvironmentSatisfaction'] = hrdf['EnvironmentSatisfaction'].astype(int)
hrdf['JobSatisfaction'] = hrdf['JobSatisfaction'].astype(int)
hrdf['WorkLifeBalance'] = hrdf['WorkLifeBalance'].astype(int)

# Replacing Yes with 1 and No with 0
hrdf['Attrition'].replace({'No' : '0'}, inplace=True)
hrdf['Attrition'].replace({'Yes' : '1'}, inplace=True)

hrdf['NumCompaniesWorked'] = hrdf['NumCompaniesWorked'].astype(int)
hrdf['TotalWorkingYears'] = hrdf['TotalWorkingYears'].astype(int)
hrdf['EnvironmentSatisfaction'] = hrdf['EnvironmentSatisfaction'].astype(int)
hrdf['JobSatisfaction'] = hrdf['JobSatisfaction'].astype(int)
hrdf['WorkLifeBalance'] = hrdf['WorkLifeBalance'].astype(int)
hrdf['Attrition'] = hrdf['Attrition'].astype(int)


# Dropping Columns
hrdf.drop(['BusinessTravel', 'EmployeeCount', 'Department',
            'StandardHours', 'Over18'], axis=1, inplace=True)
print('Size of Full dataset is: {}'.format(hrdf.shape))

hrdf.drop(['EducationField', 'Gender',
            'JobRole', 'MaritalStatus'], axis=1, inplace=True)
print('Size of Full dataset is: {}'.format(hrdf.shape))

hrdf.drop(['Education', 'EmployeeID', 'JobLevel',
            'NumCompaniesWorked', 'StockOptionLevel'], axis=1, inplace=True)
print('Size of Full dataset is: {}'.format(hrdf.shape))

hrdf.drop(['TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
            'YearsWithCurrManager', 'EnvironmentSatisfaction'], axis=1, inplace=True)
print('Size of Full dataset is: {}'.format(hrdf.shape))

hrdf.drop(['JobSatisfaction', 'WorkLifeBalance',
            'JobInvolvement', 'PerformanceRating'], axis=1, inplace=True)
print('Size of Full dataset is: {}'.format(hrdf.shape))

Size of Full dataset is: (1498, 24)
Size of Full dataset is: (1498, 20)
Size of Full dataset is: (1498, 15)
Size of Full dataset is: (1498, 10)
Size of Full dataset is: (1498, 6)


In [5]:
hrdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1498 entries, 0 to 4409
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Age                1498 non-null   int64
 1   Attrition          1498 non-null   int32
 2   DistanceFromHome   1498 non-null   int64
 3   MonthlyIncome      1498 non-null   int64
 4   PercentSalaryHike  1498 non-null   int64
 5   TotalWorkingYears  1498 non-null   int32
dtypes: int32(2), int64(4)
memory usage: 70.2 KB


In [6]:
# Function to define evaluation metrics
def eval_metrics(actual, pred):
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

In [7]:
# Function to train the model
def train_model(alpha, l1_ratio):
    train, test = train_test_split(hrdf)
    train_x = train.drop(["Attrition"], axis=1)
    test_x = test.drop(["Attrition"], axis=1)
    train_y = train[["Attrition"]]
    test_y = test[["Attrition"]]

    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(lr, "model")

In [8]:
# Train the model with different hyperparameters

train_model(0.5, 0.5)

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 0.36079983110579833
  MAE: 0.26414202337493387
  R2: 0.03142471632460864


In [9]:
train_model(0.5, 0.4)

Elasticnet model (alpha=0.500000, l1_ratio=0.400000):
  RMSE: 0.3726638836745962
  MAE: 0.2710243724104161
  R2: 0.018801707760612962


In [10]:
train_model(0.5, 0.3)

Elasticnet model (alpha=0.500000, l1_ratio=0.300000):
  RMSE: 0.3705742351211006
  MAE: 0.2679366355141761
  R2: 0.029774657716179598


In [11]:
train_model(0.5, 0.2)

Elasticnet model (alpha=0.500000, l1_ratio=0.200000):
  RMSE: 0.36137716370784356
  MAE: 0.2590179179874232
  R2: 0.02832251153626819


In [12]:
train_model(0.5, 0.1)

Elasticnet model (alpha=0.500000, l1_ratio=0.100000):
  RMSE: 0.38051298414263107
  MAE: 0.2758830598011126
  R2: 0.02466410298445887
