# Employee Performance 

In [2]:
import zipfile
import pandas as pd

# Define the path to your zip file
zip_file_path = 'Employee Performance.zip'

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('extracted_data')

# Load the CSV files into pandas dataframes
train_df = pd.read_csv('extracted_data/train_dataset.csv')
test_df = pd.read_csv('extracted_data/test_dataset.csv')

# Check the first few rows of each dataframe to confirm the data is loaded correctly
print("Train Dataset Preview:")
print(train_df.head())

print("\nTest Dataset Preview:")
print(test_df.head())

Train Dataset Preview:
   team  targeted_productivity    smv    wip  over_time  incentive  idle_time  \
0     9                   0.75   3.94    NaN        960          0        0.0   
1     7                   0.65  30.10  909.0       7080          0        0.0   
2     3                   0.80   4.15    NaN       1440          0        0.0   
3     1                   0.65  22.53  762.0       5040          0        0.0   
4     4                   0.70  30.10  767.0       3300         50        0.0   

   idle_men  no_of_style_change  no_of_workers  ...  department_finishing  \
0         0                   0            8.0  ...                     1   
1         0                   1           59.0  ...                     0   
2         0                   0            7.0  ...                     0   
3         0                   1           42.0  ...                     0   
4         0                   1           57.0  ...                     0   

   department_finishing   d

In [4]:
# Check basic information for the training dataset
print("Training Dataset Info:")
print(train_df.info())
print("\nTraining Dataset Description:")
print(train_df.describe())
print("\nTraining Dataset Preview:")
print(train_df.head())
print("\nMissing Values in Training Dataset:")
print(train_df.isnull().sum())

print("\n" + "="*50 + "\n")  # Separator for clarity

# Check basic information for the testing dataset
print("Testing Dataset Info:")
print(test_df.info())
print("\nTesting Dataset Description:")
print(test_df.describe())
print("\nTesting Dataset Preview:")
print(test_df.head())
print("\nMissing Values in Testing Dataset:")
print(test_df.isnull().sum())

Training Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017 entries, 0 to 1016
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   team                   1017 non-null   int64  
 1   targeted_productivity  1017 non-null   float64
 2   smv                    1017 non-null   float64
 3   wip                    594 non-null    float64
 4   over_time              1017 non-null   int64  
 5   incentive              1017 non-null   int64  
 6   idle_time              1017 non-null   float64
 7   idle_men               1017 non-null   int64  
 8   no_of_style_change     1017 non-null   int64  
 9   no_of_workers          1017 non-null   float64
 10  month                  1017 non-null   int64  
 11  quarter_Quarter1       1017 non-null   int64  
 12  quarter_Quarter2       1017 non-null   int64  
 13  quarter_Quarter3       1017 non-null   int64  
 14  quarter_Quarter4       1017 non-n

In [None]:
# Data Cleaning and Preprocessing

In [6]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Removing duplicated columns
train_df = train_df.loc[:, ~train_df.columns.duplicated()]
test_df = test_df.loc[:, ~test_df.columns.duplicated()]

# Impute missing values for 'wip' using median (since it contains numeric values)
imputer = SimpleImputer(strategy='median')
train_df['wip'] = imputer.fit_transform(train_df[['wip']])
test_df['wip'] = imputer.transform(test_df[['wip']])

# Checking for other missing values (since only 'wip' had missing values based on your output, no need for further imputation)

# Encode categorical variables (if applicable)
# Your categorical columns are already one-hot encoded, so no need for further encoding

# Feature Scaling for continuous variables (over_time, incentive, idle_time, etc.)
scaler = StandardScaler()

# Identify continuous columns that need scaling
continuous_columns = ['over_time', 'incentive', 'idle_time', 'wip', 'no_of_workers']

# Fit the scaler on training data and transform both training and testing datasets
train_df[continuous_columns] = scaler.fit_transform(train_df[continuous_columns])
test_df[continuous_columns] = scaler.transform(test_df[continuous_columns])

# Check the cleaned data
print("Training Data after Cleaning:")
print(train_df.head())

print("\nTesting Data after Cleaning:")
print(test_df.head())

Training Data after Cleaning:
   team  targeted_productivity    smv       wip  over_time  incentive  \
0     9                   0.75   3.94 -0.061397  -1.091179  -0.234987   
1     7                   0.65  30.10 -0.156176   0.777874  -0.234987   
2     3                   0.80   4.15 -0.061397  -0.944586  -0.234987   
3     1                   0.65  22.53 -0.263348   0.154856  -0.234987   
4     4                   0.70  30.10 -0.259703  -0.376541   0.053771   

   idle_time  idle_men  no_of_style_change  no_of_workers  ...  \
0  -0.055944         0                   0      -1.210682  ...   
1  -0.055944         0                   1       1.089270  ...   
2  -0.055944         0                   0      -1.255779  ...   
3  -0.055944         0                   1       0.322619  ...   
4  -0.055944         0                   1       0.999076  ...   

   department_finishing  department_finishing   department_sweing  day_Monday  \
0                     1                      0       

In [None]:
# Feature Engineering

In [10]:
# Create new features in the training data
train_df['cumulative_productivity'] = train_df['actual_productivity'].cumsum()
train_df['rolling_productivity_mean'] = train_df['actual_productivity'].rolling(window=3, min_periods=1).mean()

# Since 'actual_productivity' is not present in the test dataset (because it's the target), we skip these steps for test data

# Work Condition Index (applies to both training and test data)
work_condition_columns = ['wip', 'over_time', 'incentive']
train_df['work_condition_index'] = train_df[work_condition_columns].mean(axis=1)
test_df['work_condition_index'] = test_df[work_condition_columns].mean(axis=1)

# Inspect the new features
print("New Features in Training Data:")
print(train_df[['cumulative_productivity', 'rolling_productivity_mean', 'work_condition_index']].head())

print("\nNew Features in Testing Data:")
print(test_df[['work_condition_index']].head())

New Features in Training Data:
   cumulative_productivity  rolling_productivity_mean  work_condition_index
0                 0.755167                   0.755167             -0.462521
1                 1.290845                   0.645422              0.128904
2                 2.111678                   0.703893             -0.413657
3                 2.692809                   0.645881             -0.114493
4                 3.482812                   0.730656             -0.194158

New Features in Testing Data:
   work_condition_index
0             -0.450305
1             -0.315929
2             -0.478873
3             -0.340361
4             -0.413657


In [None]:
# Data Splitting

In [12]:
from sklearn.model_selection import train_test_split

# Features (X) and target (y)
X = train_df.drop(columns=['actual_productivity'])  # Drop the target variable from the feature set
y = train_df['actual_productivity']  # This is the target variable

# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the split data
print("Training Set Shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Validation Set Shape (X_val, y_val):", X_val.shape, y_val.shape)

Training Set Shape (X_train, y_train): (813, 28) (813,)
Validation Set Shape (X_val, y_val): (204, 28) (204,)


In [None]:
# Model Selection

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Function to train a model and evaluate its performance
def train_and_evaluate(model, X_train, y_train, X_val, y_val):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the validation set
    y_pred = model.predict(X_val)
    
    # Evaluate the model
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    print(f"Model: {model.__class__.__name__}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-Squared (R²): {r2:.4f}")
    print("="*40)

# 1. Linear Regression
lr_model = LinearRegression()
train_and_evaluate(lr_model, X_train, y_train, X_val, y_val)

# 2. Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
train_and_evaluate(rf_model, X_train, y_train, X_val, y_val)

# 3. Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor(random_state=42)
train_and_evaluate(gbr_model, X_train, y_train, X_val, y_val)

Model: LinearRegression
Mean Absolute Error (MAE): 0.1108
Mean Squared Error (MSE): 0.0221
R-Squared (R²): 0.3349
Model: RandomForestRegressor
Mean Absolute Error (MAE): 0.0840
Mean Squared Error (MSE): 0.0165
R-Squared (R²): 0.5027
Model: GradientBoostingRegressor
Mean Absolute Error (MAE): 0.0865
Mean Squared Error (MSE): 0.0168
R-Squared (R²): 0.4938


In [None]:
# Hyperparameter Tuning

In [16]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for tuning Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate the RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit GridSearch to the data
grid_search.fit(X_train, y_train)

# Best parameters from the search
print("Best parameters:", grid_search.best_params_)

# Evaluate the best model
best_rf_model = grid_search.best_estimator_
train_and_evaluate(best_rf_model, X_train, y_train, X_val, y_val)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Model: RandomForestRegressor
Mean Absolute Error (MAE): 0.0842
Mean Squared Error (MSE): 0.0167
R-Squared (R²): 0.4974


In [None]:
# Test Set Evaluation

In [20]:
# Drop engineered features 'cumulative_productivity' and 'rolling_productivity_mean' from training and validation sets
X_train_cleaned = X_train.drop(columns=['cumulative_productivity', 'rolling_productivity_mean'], errors='ignore')
X_val_cleaned = X_val.drop(columns=['cumulative_productivity', 'rolling_productivity_mean'], errors='ignore')

# Re-train the model using the cleaned training set
best_rf_model.fit(X_train_cleaned, y_train)

# Evaluate on the validation set
train_and_evaluate(best_rf_model, X_train_cleaned, y_train, X_val_cleaned, y_val)

# Drop engineered features from test set
X_test_cleaned = test_df.drop(columns=['cumulative_productivity', 'rolling_productivity_mean'], errors='ignore')

# Make predictions on the test set
test_predictions = best_rf_model.predict(X_test_cleaned)

# Print predictions
print("Predictions on the test set:")
print(test_predictions[:10])

Model: RandomForestRegressor
Mean Absolute Error (MAE): 0.0960
Mean Squared Error (MSE): 0.0229
R-Squared (R²): 0.3095
Predictions on the test set:
[0.74315308 0.87274447 0.70634275 0.76437778 0.67166801 0.84513905
 0.74299218 0.75217975 0.83599054 0.83850864]


In [None]:
# Interpretation of Predictions:
# Each value represents the predicted productivity for a worker based on the features provided in the test set.
# For example, the first worker's productivity is predicted to be 0.743, while the second worker's productivity is predicted to be 0.873.

# Overall Interpretation:
# The model performance on the validation set, as indicated by the MAE and R², shows that the model is able to capture some patterns in the data but could be improved further.
# The test set predictions are reasonable, but without the true actual_productivity values for the test set, it's difficult to definitively say how well the model performs on unseen data. If you had the actual test labels, you could calculate MAE, MSE, and R² for the test set as well.