<a href="https://colab.research.google.com/github/Kasper-Raupach-Haurum/M6-Data-Engineering-and-MLOps/blob/main/M6_Data_Engineering_and_MLOps_Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
!git clone "https://github.com/Kasper-Raupach-Haurum/M6-Data-Engineering-and-MLOps.git"

fatal: destination path 'M6-Data-Engineering-and-MLOps' already exists and is not an empty directory.


In [88]:
!pip install mlflow --quiet
!pip install pyjokes --quiet
!mlflow

Usage: mlflow [OPTIONS] COMMAND [ARGS]...

Options:
  --version  Show the version and exit.
  --help     Show this message and exit.

Commands:
  artifacts    Upload, list, and download...
  db           Commands for managing an...
  deployments  Deploy MLflow models to custom...
  doctor       Prints out useful information for
               debugging issues with MLflow.
  experiments  Manage experiments.
  gc           Permanently delete runs in the
               `deleted` lifecycle stage.
  models       Deploy MLflow models locally.
  recipes      Run MLflow Recipes and inspect...
  run          Run an MLflow project from the...
  runs         Manage runs.
  sagemaker    Serve models on SageMaker.
  server       Run the MLflow tracking server.


In [89]:
# Import necessary libraries
import pyjokes
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn

In [78]:
# Make 'HR' the active MLflow experiment for the runs started further down
mlflow.set_experiment(experiment_name='HR')

<Experiment: artifact_location='file:///content/mlruns/316513267277040799', creation_time=1683129930927, experiment_id='316513267277040799', last_update_time=1683129930927, lifecycle_stage='active', name='HR', tags={}>

In [79]:
# Function to load data
def load_data():
    """Read the three HR CSV files from the cloned repository and join them.

    The employee-survey and manager-survey tables are merged with each other
    first, and that result is then merged onto the general data. Both merges
    use the ``merge`` defaults, i.e. an inner join on the column names the two
    frames have in common.

    Returns:
        pandas.DataFrame: the combined table.
    """
    general = pd.read_csv('M6-Data-Engineering-and-MLOps/general_data.csv')
    employee_survey = pd.read_csv('M6-Data-Engineering-and-MLOps/employee_survey_data.csv')
    manager_survey = pd.read_csv('M6-Data-Engineering-and-MLOps/manager_survey_data.csv')

    surveys = employee_survey.merge(manager_survey)
    return general.merge(surveys)

In [80]:
# Function to preprocess data
def preprocess_data(hr_df):
    """Clean the merged HR table and return the result as a new DataFrame.

    Steps, in order:
      1. Drop columns that are not used as model inputs.
      2. Drop rows that duplicate an earlier row on
         (Age, Attrition, DistanceFromHome, MonthlyIncome); the first
         occurrence is kept and the original index labels are preserved.
      3. Fill missing values in five columns with fixed defaults.
      4. Encode ``Attrition`` as 0 ('No') / 1 ('Yes'); any other value is
         left unchanged.
      5. Cast the five filled columns to ``int``.

    The input frame is NOT modified. Every step returns a new frame, so the
    function can be called repeatedly on the same input. Column-level
    ``inplace=True`` calls (``hr_df[col].fillna(..., inplace=True)``) are
    deliberately avoided: they are chained assignments, which do not update
    the parent frame when pandas Copy-on-Write is active.

    Args:
        hr_df (pandas.DataFrame): merged HR data containing the columns
            referenced below.

    Returns:
        pandas.DataFrame: the cleaned table.

    Raises:
        KeyError: if an expected column is missing from ``hr_df``.
    """
    unused_columns = ['BusinessTravel', 'EmployeeCount', 'Department', 'EducationField', 'Gender',
                      'JobRole', 'MaritalStatus', 'Over18', 'StandardHours']
    duplicate_key = ['Age', 'Attrition', 'DistanceFromHome', 'MonthlyIncome']
    # Default used for each column when its value is missing.
    fill_values = {
        'NumCompaniesWorked': 3,
        'EnvironmentSatisfaction': 3,
        'JobSatisfaction': 3,
        'WorkLifeBalance': 3,
        'TotalWorkingYears': 11,
    }
    attrition_codes = {'No': 0, 'Yes': 1}

    return (
        hr_df
        .drop(columns=unused_columns)
        .drop_duplicates(subset=duplicate_key)
        .fillna(value=fill_values)
        # A callable (rather than passing the dict itself) keeps values that
        # are not in the mapping instead of turning them into NaN.
        .assign(Attrition=lambda frame: frame['Attrition'].map(
            lambda label: attrition_codes.get(label, label)))
        # Filled columns arrive as floats because they contained NaN.
        .astype({column: int for column in fill_values})
    )

In [81]:
# Function to define evaluation metrics
def eval_metrics(actual, pred):
    """Score predictions against the true values with three regression metrics.

    Args:
        actual: true target values.
        pred: predicted values, aligned with ``actual``.

    Returns:
        tuple: ``(rmse, mae, r2)`` - root mean squared error, mean absolute
        error and R^2 score, in that order.
    """
    squared_error = mean_squared_error(actual, pred)
    return (
        np.sqrt(squared_error),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [82]:
# Function to train the model
def train_model(hr_df, alpha, l1_ratio):
    """Fit an ElasticNet on an 80/20 split of ``hr_df`` and record it in MLflow.

    ``Attrition`` is the target; every other column is used as a feature.
    Inside a single MLflow run the function fits the model, evaluates it on
    the held-out 20%, prints the scores, logs the two hyperparameters and the
    three metrics, and logs the fitted model under the name "model".

    Args:
        hr_df (pandas.DataFrame): preprocessed data including ``Attrition``.
        alpha: ElasticNet ``alpha`` value.
        l1_ratio: ElasticNet ``l1_ratio`` value.

    Returns:
        None
    """
    target = ["Attrition"]
    train_part, test_part = train_test_split(hr_df, test_size=0.2, random_state=42)

    features_train = train_part.drop(target, axis=1)
    features_test = test_part.drop(target, axis=1)
    # Selecting with a list keeps the targets as one-column DataFrames.
    labels_train = train_part[target]
    labels_test = test_part[target]

    with mlflow.start_run():
        regressor = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        regressor.fit(features_train, labels_train)

        predictions = regressor.predict(features_test)
        rmse, mae, r2 = eval_metrics(labels_test, predictions)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        for param_name, param_value in (("alpha", alpha), ("l1_ratio", l1_ratio)):
            mlflow.log_param(param_name, param_value)
        for metric_name, metric_value in (("rmse", rmse), ("r2", r2), ("mae", mae)):
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(regressor, "model")

In [90]:
# Main function to execute the script
def main():
    """Run the whole pipeline: load, preprocess, report, then train five models.

    Prints the shape of a four-column view of the preprocessed data and a
    joke from ``pyjokes``, then trains one ElasticNet per ``l1_ratio`` value
    with ``alpha`` fixed at 0.5. Each training call records its own MLflow run.
    """
    hr_df = preprocess_data(load_data())

    # Shape of the four-column subset only, not of the full preprocessed frame
    hrdf = hr_df[['Age', 'Attrition', 'DistanceFromHome', 'MonthlyIncome']]
    print(f"Shape of hrdf: {hrdf.shape}")

    # Print a randomly chosen joke
    joke = pyjokes.get_joke()
    print(f"Random joke of the day: {joke}")

    # Sweep l1_ratio while keeping alpha constant
    for l1_ratio in (0.5, 0.4, 0.3, 0.2, 0.1):
        train_model(hr_df, 0.5, l1_ratio)

if __name__ == "__main__":
    main()

Shape of hrdf: (1470, 4)
Random joke: Waiter: He's choking! Is anyone a doctor? Programmer: I'm a Vim user.
Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 0.380037905409416
  MAE: 0.2751799142806001
  R2: 0.02263770593526393
Elasticnet model (alpha=0.500000, l1_ratio=0.400000):
  RMSE: 0.3792424164733617
  MAE: 0.274226562329349
  R2: 0.026725020255458976
Elasticnet model (alpha=0.500000, l1_ratio=0.300000):
  RMSE: 0.378547795751557
  MAE: 0.27327413810999185
  R2: 0.030287057656103844
Elasticnet model (alpha=0.500000, l1_ratio=0.200000):
  RMSE: 0.3772845189861044
  MAE: 0.27017114667116326
  R2: 0.036748444051532325
Elasticnet model (alpha=0.500000, l1_ratio=0.100000):
  RMSE: 0.3773145686987234
  MAE: 0.26848989069241347
  R2: 0.03659499707084324
