In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

In [71]:
# Load the raw export and clean it in one chained expression:
#   * repair the misspelled 'Bouns' header,
#   * turn the percent strings in 'Bonus' into fractions,
#   * floor negative absence counts at zero.
df = (
    pd.read_csv('Performance_Bonus.csv')
    .rename(columns={'Bouns': 'Bonus'})
    .assign(
        # Strip the trailing '%' and rescale, e.g. a value like '33%' -> 0.33.
        Bonus=lambda frame: frame['Bonus'].str.rstrip('%').astype(float) / 100,
        DaysOfAbsence=lambda frame: frame['DaysOfAbsence'].clip(lower=0),
    )
)

# Persist the cleaned frame (without the index) for the modelling cells.
df.to_csv('Performance_Bonus_CLEANED.csv', index=False)

# Quick sanity report on the cleaned data.
print(df.head())
print("\nData types:\n", df.dtypes)
print(
    "\nNegative DaysOfAbsence (should be 0):",
    int((df['DaysOfAbsence'] < 0).sum()),
)
print("Bonus range (should be 0-1):", df['Bonus'].min(), df['Bonus'].max())

   PerformanceID  EmploymentRating  DaysOfAbsence EducationLevel  \
0              1                10              2     Bachelor's   
1              2                10              0       Master's   
2              3                 4             10      Doctorate   
3              4                 9              0      Doctorate   
4              5                 2             13      Doctorate   

   CertificationsEarned  Bonus  
0                     5   0.33  
1                     5   0.31  
2                     4   0.34  
3                     3   0.27  
4                     1   0.37  

Data types:
 PerformanceID             int64
EmploymentRating          int64
DaysOfAbsence             int64
EducationLevel           object
CertificationsEarned      int64
Bonus                   float64
dtype: object

Negative DaysOfAbsence (should be 0): 0
Bonus range (should be 0-1): 0.0 0.4


In [73]:
# Reload the cleaned data from the CSV written by the cleaning cell, so the
# modelling cells below start from the file on disk.
df = pd.read_csv('Performance_Bonus_CLEANED.csv')

In [75]:
# Feature matrix: the four predictor columns fed to the model.
X = df.loc[:, [
    'EmploymentRating',
    'DaysOfAbsence',
    'EducationLevel',
    'CertificationsEarned',
]]
# Target vector: the bonus column.
y = df.loc[:, 'Bonus']

In [77]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [79]:
# Preprocessing: one-hot encode the categorical column and pass every other
# selected column through untouched. Categories not seen during fitting are
# ignored at transform time instead of raising.
categorical_features = ['EducationLevel']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    [('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)


In [81]:
# Full model: the preprocessing step followed by a 500-tree random forest
# regressor with a fixed seed.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators=500)),
])

In [83]:
# Fit the whole pipeline (preprocessing + regressor) on the training split only.
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [85]:
# Predict for every row of X and store the result alongside the actual Bonus.
# X is the full feature matrix, so this covers the rows the model was trained
# on as well as the held-out rows.
df['Predicted_bonus'] = model.predict(X)

In [97]:
# Evaluate on the held-out rows only: mean absolute error between the actual
# and predicted bonus.
test_preds = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=test_preds)
print(f"\nModel Performance (Mean Absolute Error): {mae:.4f}")


Model Performance (Mean Absolute Error): 0.1103


In [89]:
# Load the IT data and score it with the already-fitted pipeline, using the
# same four feature columns the model was trained on.
it_df = pd.read_csv("ITcleaned1.csv")
X_it = it_df.loc[:, [
    'EmploymentRating',
    'DaysOfAbsence',
    'EducationLevel',
    'CertificationsEarned',
]]
it_df['Predicted_bonus'] = model.predict(X_it)

In [101]:
# Compare each employee's recorded bonus with the model's prediction and
# report how many fall below, above, or exactly on the predicted value.
# NOTE(review): assumes 'Bonus %' is numeric and on the same fractional scale
# as 'Predicted_bonus' (the model was trained on Bonus / 100) — confirm
# against ITcleaned1.csv.
if 'Bonus %' in it_df.columns:
    actual_bonus = it_df['Bonus %']
    predicted_bonus = it_df['Predicted_bonus']

    underpaid_count = (actual_bonus < predicted_bonus).sum()
    overpaid_count = (actual_bonus > predicted_bonus).sum()
    # Exact float equality: a prediction averaged over many trees will rarely
    # equal a recorded value bit-for-bit, so this count is expected to be ~0.
    exact_match = (actual_bonus == predicted_bonus).sum()

    print(f"\nEmployees receiving less bonus than predicted: {underpaid_count}")
    print(f"Employees receiving more bonus than predicted: {overpaid_count}")
    print(f"Employees with exact match: {exact_match}")
else:
    # Previously the cell produced no output at all in this case; say so
    # explicitly so a missing or renamed column is not mistaken for "no rows".
    print("\nColumn 'Bonus %' not found in it_df; skipping actual-vs-predicted comparison.")



Employees receiving less bonus than predicted: 205
Employees receiving more bonus than predicted: 26
Employees with exact match: 0


In [105]:
# Write the IT frame, including its 'Predicted_bonus' column, to CSV without
# the index.
it_df.to_csv('IT_Performance_Bonus_with_Predictions.csv', index=False)