In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib

# Load data
df = pd.read_csv("garments_worker_productivity.csv")

# Drop rows with null values (optional: can also fillna)
df.dropna(inplace=True)

# Encode target: convert actual_productivity to labels
def performance_label(x):
    if x < 0.6:
        return 'Low'
    elif x < 0.85:
        return 'Average'
    else:
        return 'High'

df['performance'] = df['actual_productivity'].apply(performance_label)
df.drop(['actual_productivity'], axis=1, inplace=True)

# Drop 'date' (not needed) and one-hot encode categorical features
df.drop(['date'], axis=1, errors='ignore', inplace=True)
df = pd.get_dummies(df, columns=['quarter', 'department', 'day'], drop_first=True)

# Separate features and labels
X = df.drop('performance', axis=1)
y = df['performance']

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Balance classes using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y_encoded)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_res)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_res, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Classification Report:
               precision    recall  f1-score   support

     Average       0.96      0.92      0.94       100
        High       0.99      0.98      0.98        92
         Low       0.92      0.98      0.95        84

    accuracy                           0.96       276
   macro avg       0.96      0.96      0.96       276
weighted avg       0.96      0.96      0.96       276

Confusion Matrix:
 [[92  1  7]
 [ 2 90  0]
 [ 2  0 82]]


In [15]:
import pickle

# Save the trained model
with open("rf_employee_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save the label encoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(y_encoded , f)

# Save the scaler
with open("scaler.pkl", "wb") as f:
    pickle.dump(X_scaled, f)
