<a href="https://colab.research.google.com/github/Maxime-Bakunzi/database_design_plg_2/blob/aadumbuya/Prediction_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Employee / HR Dataset

The Synthetic Employee Records Dataset is a simulated dataset created for the purpose of exploring various data analysis and machine learning techniques in the context of human resources and employee management. This synthetic dataset mirrors the structure and characteristics of real employee data, while all the information contained within is entirely fictional and generated for illustrative purposes.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
import joblib

# Loading train and test dataset

In [4]:
# Load dataset
data = pd.read_csv('/content/drive/MyDrive/employee_data.csv')

In [5]:
# prompt: print column in the data

data.columns

Index(['EmpID', 'FirstName', 'LastName', 'StartDate', 'ExitDate', 'Title',
       'Supervisor', 'ADEmail', 'BusinessUnit', 'EmployeeStatus',
       'EmployeeType', 'PayZone', 'EmployeeClassificationType',
       'TerminationType', 'TerminationDescription', 'DepartmentType',
       'Division', 'DOB', 'State', 'JobFunctionDescription', 'GenderCode',
       'LocationCode', 'RaceDesc', 'MaritalDesc', 'Performance Score',
       'Current Employee Rating'],
      dtype='object')

In [6]:
data.columns = data.columns.str.lower()
data.head()

Unnamed: 0,empid,firstname,lastname,startdate,exitdate,title,supervisor,ademail,businessunit,employeestatus,...,division,dob,state,jobfunctiondescription,gendercode,locationcode,racedesc,maritaldesc,performance score,current employee rating
0,3427,Uriah,Bridges,20-Sep-19,,Production Technician I,Peter Oneill,uriah.bridges@bilearner.com,CCDR,Active,...,Finance & Accounting,07-10-1969,MA,Accounting,Female,34904,White,Widowed,Fully Meets,4
1,3428,Paula,Small,11-Feb-23,,Production Technician I,Renee Mccormick,paula.small@bilearner.com,EW,Active,...,Aerial,30-08-1965,MA,Labor,Male,6593,Hispanic,Widowed,Fully Meets,3
2,3429,Edward,Buck,10-Dec-18,,Area Sales Manager,Crystal Walker,edward.buck@bilearner.com,PL,Active,...,General - Sga,06-10-1991,MA,Assistant,Male,2330,Hispanic,Widowed,Fully Meets,4
3,3430,Michael,Riordan,21-Jun-21,,Area Sales Manager,Rebekah Wright,michael.riordan@bilearner.com,CCDR,Active,...,Finance & Accounting,04-04-1998,ND,Clerk,Male,58782,Other,Single,Fully Meets,2
4,3431,Jasmine,Onque,29-Jun-19,,Area Sales Manager,Jason Kim,jasmine.onque@bilearner.com,TNS,Active,...,General - Con,29-08-1969,FL,Laborer,Female,33174,Other,Married,Fully Meets,3


In [7]:
data.columns

Index(['empid', 'firstname', 'lastname', 'startdate', 'exitdate', 'title',
       'supervisor', 'ademail', 'businessunit', 'employeestatus',
       'employeetype', 'payzone', 'employeeclassificationtype',
       'terminationtype', 'terminationdescription', 'departmenttype',
       'division', 'dob', 'state', 'jobfunctiondescription', 'gendercode',
       'locationcode', 'racedesc', 'maritaldesc', 'performance score',
       'current employee rating'],
      dtype='object')

In [8]:
# Rename columns to match training features
data = data.rename(columns={
    'current employee rating': 'currentemployeerating',
    'performance score' : 'performancescore'
})
data.columns

Index(['empid', 'firstname', 'lastname', 'startdate', 'exitdate', 'title',
       'supervisor', 'ademail', 'businessunit', 'employeestatus',
       'employeetype', 'payzone', 'employeeclassificationtype',
       'terminationtype', 'terminationdescription', 'departmenttype',
       'division', 'dob', 'state', 'jobfunctiondescription', 'gendercode',
       'locationcode', 'racedesc', 'maritaldesc', 'performancescore',
       'currentemployeerating'],
      dtype='object')

In [9]:
# Selection fearures and target for prediction
# Assuming 'Performance Score' as the target varaible for prediction
features = ["gendercode", "employeestatus", "employeetype", "maritaldesc"]

target = "currentemployeerating"

In [10]:
# Preprocess data: Label encoding categorical variables
label_encoders = {}
for col in features + [target]:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

In [11]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

In [12]:
# Standardize features for linear regression and neural network
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ----- Linear Regression Model -----

In [13]:
# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [14]:
# Evaluate the Linear Regression model
y_pred_lr = linear_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_lr)
print(f"Linear Regression Mean Squared Error: {mse:.2f}")

Linear Regression Mean Squared Error: 1.09


In [15]:
# Save the linear regression model and scaler
joblib.dump(linear_model, "employee_performance_linear_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")

['label_encoders.pkl']

# ----- Neural Network Model -----

In [16]:
# Initialize and train the Neural Network (MLP) model
nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
nn_model.fit(X_train, y_train)

In [17]:
# Evaluate the Neural Network model
y_pred_nn = nn_model.predict(X_test)
nn_accuracy = accuracy_score(y_test, y_pred_nn)
print(f"Neural Network Model Accuracy: {nn_accuracy:.2f}")

Neural Network Model Accuracy: 0.52


In [18]:
# Save the neural network model
joblib.dump(nn_model, "employee_performance_nn_model.pkl")

['employee_performance_nn_model.pkl']

# Fetch Data From API

In [19]:
import requests
import numpy as np

In [20]:
# Load the trained model
model = joblib.load("employee_performance_linear_model.pkl")

# Load the label encoders for categorical features
label_encoders = joblib.load("label_encoders.pkl")

In [21]:
# Example API URL
api_url = "https://employees-api-endpoint.onrender.com/employees/latest"

In [22]:
# Fetch the latest employee data
def fetch_latest_entry(api_url):
    response = requests.get(api_url)
    if response.status_code == 200:
        latest_data = response.json()
        return latest_data
    else:
        print("Failed to fetch data:", response.status_code)
        return None

In [23]:

def preprocess_data(data, features):
    for feature in features:
        if feature in label_encoders:
            encoder = label_encoders[feature]
            # Handle unknown labels by adding them to the encoder's classes
            if data[feature] not in encoder.classes_:
                # Instead of adding new class, assign unknown to a common class
                data[feature] = encoder.classes_[0] # Assign to the first class
                # or
                # data[feature] = 'Unknown'  # Or assign to a specific unknown label
            data[feature] = encoder.transform([data[feature]])[0]  # Convert to encoded value
        else:
            print(f"Warning: Encoder not found for {feature}")
    return data

In [24]:
# Specify the API URL for fetching the latest entry
API_URL = "https://employees-api-endpoint.onrender.com/employees/latest"

# Define the features used in the model
features = ["gendercode", "employeestatus", "employeetype", "maritaldesc"]

# Fetch the latest entry
latest_entry = fetch_latest_entry(API_URL)

if latest_entry:
    # Preprocess data
    processed_data = preprocess_data(latest_entry, features)

    # Convert the dictionary into a DataFrame (matching model input format)
    # Only include the features used during training
    input_df = pd.DataFrame([processed_data])[features] # Changed line

    # Predict using the trained model
    prediction = model.predict(input_df)
    # Output the result
    print(f"Predicted Current Employee Rating: {prediction[0]}")
else:
    print("No data available to make a prediction.")


Predicted Current Employee Rating: 2.074165235813848




In [25]:
input_df.head()

Unnamed: 0,gendercode,employeestatus,employeetype,maritaldesc
0,1,3,0,0


In [26]:
df_latest = pd.DataFrame(latest_entry, index=[0])
df_latest.head()

Unnamed: 0,firstname,lastname,dob,gendercode,racedesc,maritaldesc,employeestatus,employeetype,currentemployeerating,empid
0,Angelique,Mann,1960-03-31,1,Black,0,3,0,3,4000
