In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 
from io import StringIO
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [36]:
# Read the practice dataset from the working directory and preview the first ten rows.
data = pd.read_csv('practice_data.csv')
data.head(n=10)

Unnamed: 0,EmployeeID,gender,department,Days Present,Accuracy_of_Work,Meeting Project Deadlines,Task Completion Rates,Interpersonal Skills Rating,Decision-Making Skills Rating,Sales Revenue Generated,churned
0,1,Male,Sales,20,High,Yes,85,8,7,51638,No
1,2,Female,Marketing,22,Medium,Yes,90,7,6,97797,Yes
2,3,Male,Human Resources,25,High,No,80,9,8,95341,No
3,4,Male,Engineering,19,Low,Yes,75,6,5,91806,No
4,5,Female,Finance,23,Medium,Yes,85,7,7,64247,Yes
5,6,Male,Sales,21,High,Yes,95,8,8,89988,No
6,7,Female,Marketing,24,Low,No,70,5,4,70934,Yes
7,8,Male,Human Resources,18,Medium,Yes,80,6,6,96578,No
8,9,Male,Engineering,20,High,Yes,90,8,7,74219,No
9,10,Female,Finance,22,Medium,Yes,85,7,6,87009,Yes


In [37]:
# Summarize the frame: row count, column names, non-null counts, and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   EmployeeID                     31 non-null     int64 
 1   gender                         31 non-null     object
 2   department                     31 non-null     object
 3   Days Present                   31 non-null     int64 
 4   Accuracy_of_Work               31 non-null     object
 5   Meeting Project Deadlines      31 non-null     object
 6   Task Completion Rates          31 non-null     int64 
 7   Interpersonal Skills Rating    31 non-null     int64 
 8   Decision-Making Skills Rating  31 non-null     int64 
 9   Sales Revenue Generated        31 non-null     int64 
 10  churned                        31 non-null     object
dtypes: int64(6), object(5)
memory usage: 2.8+ KB


# LR Model

A Linear Regression (LR) model is one of the simplest and most widely used algorithms in machine learning and statistics.
Linear Regression is a supervised learning algorithm used for predicting a continuous value based on the relationship between input (independent) variables and an output (dependent) variable.
How It Works
Model Training: The algorithm finds the best-fitting line (or hyperplane) by minimizing the sum of squared errors between the actual and predicted values. This is done using methods like Ordinary Least Squares (OLS).

Prediction: Once trained, the model predicts values of 𝑦 given new values of 𝑥.



In [38]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [39]:
# Print each column's dtype to see which columns still hold strings (object).
print(data.dtypes)

EmployeeID                        int64
gender                           object
department                       object
Days Present                      int64
Accuracy_of_Work                 object
Meeting Project Deadlines        object
Task Completion Rates             int64
Interpersonal Skills Rating       int64
Decision-Making Skills Rating     int64
Sales Revenue Generated           int64
churned                          object
dtype: object


In [43]:
# Categorical encoding
# Every string (object) column is converted to numbers:
#   - exactly two distinct values (e.g. 'Yes'/'No') -> integer codes via LabelEncoder
#   - any other number of distinct values           -> one-hot columns via get_dummies
# The column list is captured once up front because `data` is reassigned in the loop.
object_columns = data.select_dtypes(include='object').columns

for column_name in object_columns:
    is_binary = data[column_name].nunique() == 2
    if not is_binary:
        # One-hot encode; the original column is replaced by `<column>_<value>` columns.
        data = pd.get_dummies(data, columns=[column_name])
        continue
    data[column_name] = LabelEncoder().fit_transform(data[column_name])

In [44]:
# Separate the target column (`churned`) from the feature matrix, then preview the features.
y = data['churned']
x = data.drop(columns=['churned'])
x.head(n=5)

Unnamed: 0,EmployeeID,gender,Days Present,Meeting Project Deadlines,Task Completion Rates,Interpersonal Skills Rating,Decision-Making Skills Rating,Sales Revenue Generated,department_Engineering,department_Finance,department_Human Resources,department_Marketing,department_Sales,Accuracy_of_Work_High,Accuracy_of_Work_Low,Accuracy_of_Work_Medium
0,1,1,20,1,85,8,7,51638,0,0,0,0,1,1,0,0
1,2,0,22,1,90,7,6,97797,0,0,0,1,0,0,0,1
2,3,1,25,0,80,9,8,95341,0,0,1,0,0,1,0,0
3,4,1,19,1,75,6,5,91806,1,0,0,0,0,0,1,0
4,5,0,23,1,85,7,7,64247,0,1,0,0,0,0,0,1


In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [54]:
# Build the linear regression model and fit it on the training split.
model = LinearRegression().fit(x_train, y_train)

# Predict the held-out rows and report the mean squared error.
# NOTE(review): `churned` is a label-encoded two-class column, so this is a
# regression fit to a 0/1 target; confirm that is intended.
y_pred = model.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean squared error:", mse)


Mean squared error: 3.91904615606156e-24


------------
This means the Mean Squared Error (MSE) is very close to zero — approximately 0.000000000000000000000003919.


MSE = 0 means perfect predictions — the model predicted the target exactly for all test samples.

A value this close to zero suggests that:

Your model fit the data almost perfectly, OR

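There's something off (like data leakage), meaning the model saw or could infer target values it shouldn't have. With only 31 rows (about 7 of them in the test split), an error this small is far more likely to indicate this than a genuinely perfect model. For example, in the ten rows displayed at the top of the notebook every Male employee has `churned = No` and every Female employee has `churned = Yes`, so `gender` alone may reproduce the target — this is worth checking on the full dataset.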

# Decision Tree Regression

Decision Tree Regression is a type of supervised learning algorithm used to predict continuous (numeric) target values.

It works by splitting the data into smaller and smaller subsets, based on feature values, and fits a simple average at each leaf (end) of the tree.

In [49]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [58]:
# Fit a single regression tree on the training split.
# random_state is fixed (same seed as the train/test split) so the fit is
# repeatable: scikit-learn permutes the candidate features at every split, so
# ties between equally good splits can otherwise be broken differently per run.
model = DecisionTreeRegressor(random_state=42)
model.fit(x_train, y_train)

# Predict the held-out rows and report the mean squared error.
y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

Mean squared error: 0.0


MSE = 0 means perfect predictions — the model predicted the target exactly for all test samples.

# Random Forest Regression

Random Forest Regression is an ensemble learning algorithm that combines the predictions of multiple decision trees to improve accuracy and reduce overfitting.

In [59]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [60]:
# Fit a random forest on the training split.
# random_state is fixed (same seed as the train/test split) because the forest
# draws a bootstrap sample for each tree; without a seed the reported error can
# change from one run of the notebook to the next.
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

# Predict the held-out rows and report the mean squared error.
y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

Mean squared error: 0.0


MSE = 0 means perfect predictions — the model predicted the target exactly for all test samples.