In [None]:
# Basic Libraries
import numpy as np
import pandas as pd

# Date and Time
from datetime import datetime

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Metrics
from sklearn.metrics import mean_squared_error, r2_score

# Read the training data; the relative path is resolved against the
# current working directory
train_df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Missing-value strategy (provisional): discard every row that contains at
# least one NaN. Surviving rows keep their original index labels.
train_df = train_df.dropna(axis=0, how='any')

In [None]:
# Parse 'Date of Joining' into datetime values so date arithmetic is possible
train_df['Date of Joining'] = pd.to_datetime(train_df['Date of Joining'])

# Tenure = whole days between the joining date and a FIXED reference date.
# The original used pd.Timestamp('today'), so the feature values changed
# every day the notebook was re-run and the results were not reproducible.
# 2023-06-06 is the date that reproduces the tenure values in the recorded
# output (e.g. a joining date of 2008-09-30 gives 5362 days).
TENURE_REFERENCE_DATE = pd.Timestamp('2023-06-06')
train_df['Tenure'] = (TENURE_REFERENCE_DATE - train_df['Date of Joining']).dt.days

In [None]:
# Expand each listed categorical column into indicator (dummy) columns;
# the original categorical columns are replaced by their indicators
train_df = pd.get_dummies(
    data=train_df,
    columns=['Gender', 'Company Type', 'WFH Setup Available'],
)

In [None]:
# Derived feature: resource allocation relative to designation level.
# Element-wise division, so a designation of 0 can yield inf or NaN here.
train_df['Resource Allocation per Designation'] = (
    train_df['Resource Allocation'].div(train_df['Designation'])
)

In [None]:
# A division by zero can leave +inf / -inf in the frame; turn those into
# NaN so they can be treated as ordinary missing values
train_df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

In [None]:
# Zero-fill every remaining NaN (this applies to all columns of the frame)
train_df.fillna(value=0, inplace=True)

# Show the first five rows as a sanity check
print(train_df.head(n=5))

                Employee ID Date of Joining  Designation  Resource Allocation  \
0  fffe32003000360033003200      2008-09-30          2.0                  3.0   
1      fffe3700360033003500      2008-11-30          1.0                  2.0   
3  fffe32003400380032003900      2008-11-03          1.0                  1.0   
4  fffe31003900340031003600      2008-07-24          3.0                  7.0   
5      fffe3300350037003500      2008-11-26          2.0                  4.0   

   Mental Fatigue Score  Burn Rate  Tenure  Gender_Female  Gender_Male  \
0                   3.8       0.16    5362              1            0   
1                   5.0       0.36    5301              0            1   
3                   2.6       0.20    5328              0            1   
4                   6.9       0.52    5430              1            0   
5                   3.6       0.29    5305              0            1   

   Company Type_Product  Company Type_Service  WFH Setup Available_N

Linear Regression Model

In [None]:
# Predictors: every column except the target, the employee identifier and
# the raw joining date
X = train_df.drop(columns=['Burn Rate', 'Employee ID', 'Date of Joining'])

# Target: the column the models learn to predict
y = train_df['Burn Rate']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear-regression baseline, fitted on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics
print("Mean Squared Error: ", mse)
print("R-squared: ", r2)

Mean Squared Error:  0.003047403667390327
R-squared:  0.9206504200872733


Random Forest Model

In [None]:
# Random forest with 100 trees; the fixed seed makes the fit repeatable
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit on the training split
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Make predictions on the validation set.
# Fix: the original predicted on X_train and scored against y_train, so the
# reported numbers measured training-set fit rather than validation
# performance and were not comparable with the linear-regression metrics.
rf_y_pred = rf_model.predict(X_test)

# Calculate and print the Mean Squared Error on the held-out data
rf_mse = mean_squared_error(y_test, rf_y_pred)
print("Mean Squared Error: ", rf_mse)

# Calculate and print the R-squared value on the held-out data
rf_r2 = r2_score(y_test, rf_y_pred)
print("R-squared: ", rf_r2)

# NOTE: output recorded before this fix shows training-set scores; re-run
# this cell to refresh it.

Mean Squared Error:  0.0004561940164037956
R-squared:  0.9883993452829354


Gradient Boosting Regressor Model

In [None]:
# Gradient boosting: 100 boosting stages at learning rate 0.1, fixed seed
gb_model = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)

# Fit on the training split
gb_model.fit(X=X_train, y=y_train)

In [None]:
# Make predictions on the validation set.
# Fix: the original predicted on X_train and scored against y_train, so the
# reported numbers measured training-set fit rather than validation
# performance and were not comparable with the linear-regression metrics.
gb_y_pred = gb_model.predict(X_test)

# Calculate and print the Mean Squared Error on the held-out data
gb_mse = mean_squared_error(y_test, gb_y_pred)
print("Mean Squared Error: ", gb_mse)

# Calculate and print the R-squared value on the held-out data
gb_r2 = r2_score(y_test, gb_y_pred)
print("R-squared: ", gb_r2)

# NOTE: output recorded before this fix shows training-set scores; re-run
# this cell to refresh it.

Mean Squared Error:  0.002790324589941464
R-squared:  0.929044242246715


Decision Tree Regressor Model

In [None]:
# Single regression tree with default settings; the fixed seed makes the
# fit repeatable
dt_model = DecisionTreeRegressor(random_state=42)

# Fit on the training split
dt_model.fit(X=X_train, y=y_train)

In [None]:
# Make predictions on the validation set.
# Fix: the original predicted on X_train and scored against y_train. A
# decision tree grown without a depth limit can fit its training rows
# almost exactly, so those scores said nothing about generalisation.
dt_y_pred = dt_model.predict(X_test)

# Calculate and print the Mean Squared Error on the held-out data
dt_mse = mean_squared_error(y_test, dt_y_pred)
print("Mean Squared Error: ", dt_mse)

# Calculate and print the R-squared value on the held-out data
dt_r2 = r2_score(y_test, dt_y_pred)
print("R-squared: ", dt_r2)

# NOTE: output recorded before this fix shows training-set scores; re-run
# this cell to refresh it.

Mean Squared Error:  1.315335305719921e-05
R-squared:  0.9996655205861947
